simple-news-crawler 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sn_crawler.rb +76 -14
- data/lib/sn_item.rb +4 -0
- metadata +50 -50
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ed289205cd65fe1a7651d9e8771a7e13cad35ac0
|
4
|
+
data.tar.gz: 3e8cb296f0e1230ed2164697c4ce04d87c5f043d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5474812498ab53f32f3c495d9d4e1f76b26dbc0a3455d968434617d3d9e096e52763d5661bffee7fd93a1f9b9d9742a1e19860d1ce76d2908c89801ab2783719
|
7
|
+
data.tar.gz: a3dee0584791af0d8ca080206a40a801bb736382c4a18bb3d1d2c04a4978f075822b9e9ab420bde5291175b90485d4abdb384153aaf9964c2b5ec764eda0ce05
|
data/lib/sn_crawler.rb
CHANGED
@@ -14,16 +14,51 @@ require 'xml'
|
|
14
14
|
require 'nokogiri'
|
15
15
|
require 'readability'
|
16
16
|
|
17
|
-
##
|
17
|
+
##
|
18
|
+
#
|
19
|
+
# A crawler class
|
20
|
+
#
|
21
|
+
# @example
|
22
|
+
# require 'sn_crawler'
|
23
|
+
#
|
24
|
+
# url = "http://vnexpress.net/rss"
|
25
|
+
# c = SNCrawler.new(url,"VNE","/channel/item:[title,description,pubDate,link]",nil)
|
26
|
+
# c.get_links(true)
|
27
|
+
# # => ["vnexpress.net/rss/tin-moi-nhat.rss",...]
|
28
|
+
#
|
29
|
+
# c.get_news(true)
|
30
|
+
# # => City in Jordan welcomes ISIS
|
31
|
+
# # Image: []
|
32
|
+
# # Now inserting City in Jordan welcomes ISIS
|
33
|
+
#
|
18
34
|
class SNCrawler
|
19
|
-
##
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
35
|
+
##
|
36
|
+
#
|
37
|
+
# Initialize parameters
|
38
|
+
#
|
39
|
+
# source => The source URL
|
40
|
+
#
|
41
|
+
# name => The name of the source
|
42
|
+
#
|
43
|
+
# structure =>
|
44
|
+
#
|
45
|
+
# structure format: /path/to/channel/item_name:[item_attributes_name]
|
46
|
+
#
|
47
|
+
# item_attributes_name[0] => title of the page
|
48
|
+
#
|
49
|
+
# item_attributes_name[1] => description of the page
|
50
|
+
#
|
51
|
+
# item_attributes_name[2] => publication time of the page
|
52
|
+
#
|
53
|
+
# item_attributes_name[3] => link to the page
|
54
|
+
#
|
55
|
+
# For example: /channel/item:[title,description,pubDate,url]
|
56
|
+
#
|
57
|
+
# db_conf => activerecord settings
|
58
|
+
#
|
59
|
+
# limit => the maximum number of news items to fetch
|
60
|
+
#
|
61
|
+
def initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100)
|
27
62
|
## The rss source's url
|
28
63
|
@source = source
|
29
64
|
@source_name = name
|
@@ -38,9 +73,16 @@ class SNCrawler
|
|
38
73
|
else
|
39
74
|
@use_db = false
|
40
75
|
end
|
76
|
+
@limit = limit
|
41
77
|
end
|
42
78
|
|
43
|
-
##
|
79
|
+
##
|
80
|
+
#
|
81
|
+
# Create table for our gem
|
82
|
+
#
|
83
|
+
# @example
|
84
|
+
# c.create_table("engine=MROONGA",true)
|
85
|
+
#
|
44
86
|
def create_table(options = "", verbose = true)
|
45
87
|
begin
|
46
88
|
ActiveRecord::Migration.class_eval do
|
@@ -67,7 +109,10 @@ class SNCrawler
|
|
67
109
|
return true
|
68
110
|
end
|
69
111
|
|
70
|
-
##
|
112
|
+
##
|
113
|
+
#
|
114
|
+
# Get urls from a source url
|
115
|
+
#
|
71
116
|
def get_links(verbose = false)
|
72
117
|
page = @agent.get(@source)
|
73
118
|
|
@@ -94,17 +139,28 @@ class SNCrawler
|
|
94
139
|
end
|
95
140
|
end
|
96
141
|
|
97
|
-
##
|
142
|
+
##
|
143
|
+
#
|
144
|
+
# Set Urls
|
145
|
+
#
|
98
146
|
def set_url(url = [])
|
99
147
|
@url = url
|
100
148
|
end
|
101
149
|
|
102
|
-
##
|
150
|
+
##
|
151
|
+
#
|
152
|
+
# Clear urls
|
153
|
+
#
|
103
154
|
def clear_url
|
104
155
|
@url = []
|
105
156
|
end
|
106
157
|
|
107
|
-
##
|
158
|
+
##
|
159
|
+
#
|
160
|
+
# Get news from urls
|
161
|
+
#
|
162
|
+
# Note: run this method ONLY after running get_links and, if you are using a DB, create_table.
|
163
|
+
#
|
108
164
|
def get_news(verbose = false)
|
109
165
|
count = 0
|
110
166
|
channel_path = "."
|
@@ -183,6 +239,9 @@ class SNCrawler
|
|
183
239
|
else
|
184
240
|
end
|
185
241
|
count = count + 1
|
242
|
+
if verbose then
|
243
|
+
puts "Got #{count} news"
|
244
|
+
end
|
186
245
|
end
|
187
246
|
end
|
188
247
|
rescue => e
|
@@ -190,6 +249,9 @@ class SNCrawler
|
|
190
249
|
puts "Error: #{e}"
|
191
250
|
end
|
192
251
|
end
|
252
|
+
if count >= @limit then
|
253
|
+
break
|
254
|
+
end
|
193
255
|
end
|
194
256
|
|
195
257
|
if verbose then
|
data/lib/sn_item.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple-news-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nguyen Anh Tuan
|
@@ -14,220 +14,220 @@ dependencies:
|
|
14
14
|
name: json
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.8'
|
20
|
-
- -
|
20
|
+
- - '>='
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.8.1
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- -
|
27
|
+
- - ~>
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '1.8'
|
30
|
-
- -
|
30
|
+
- - '>='
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.8.1
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: libxml-ruby
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
|
-
- -
|
37
|
+
- - ~>
|
38
38
|
- !ruby/object:Gem::Version
|
39
39
|
version: '2.7'
|
40
|
-
- -
|
40
|
+
- - '>='
|
41
41
|
- !ruby/object:Gem::Version
|
42
42
|
version: 2.7.0
|
43
43
|
type: :runtime
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
|
-
- -
|
47
|
+
- - ~>
|
48
48
|
- !ruby/object:Gem::Version
|
49
49
|
version: '2.7'
|
50
|
-
- -
|
50
|
+
- - '>='
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: 2.7.0
|
53
53
|
- !ruby/object:Gem::Dependency
|
54
54
|
name: curb
|
55
55
|
requirement: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
|
-
- -
|
57
|
+
- - ~>
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '0.8'
|
60
|
-
- -
|
60
|
+
- - '>='
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: 0.8.6
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
|
-
- -
|
67
|
+
- - ~>
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0.8'
|
70
|
-
- -
|
70
|
+
- - '>='
|
71
71
|
- !ruby/object:Gem::Version
|
72
72
|
version: 0.8.6
|
73
73
|
- !ruby/object:Gem::Dependency
|
74
74
|
name: nokogiri
|
75
75
|
requirement: !ruby/object:Gem::Requirement
|
76
76
|
requirements:
|
77
|
-
- -
|
77
|
+
- - ~>
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '1.6'
|
80
|
-
- -
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 1.6.3.1
|
83
83
|
type: :runtime
|
84
84
|
prerelease: false
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - ~>
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '1.6'
|
90
|
-
- -
|
90
|
+
- - '>='
|
91
91
|
- !ruby/object:Gem::Version
|
92
92
|
version: 1.6.3.1
|
93
93
|
- !ruby/object:Gem::Dependency
|
94
94
|
name: mechanize
|
95
95
|
requirement: !ruby/object:Gem::Requirement
|
96
96
|
requirements:
|
97
|
-
- -
|
97
|
+
- - ~>
|
98
98
|
- !ruby/object:Gem::Version
|
99
99
|
version: '2.7'
|
100
|
-
- -
|
100
|
+
- - '>='
|
101
101
|
- !ruby/object:Gem::Version
|
102
102
|
version: 2.7.3
|
103
103
|
type: :runtime
|
104
104
|
prerelease: false
|
105
105
|
version_requirements: !ruby/object:Gem::Requirement
|
106
106
|
requirements:
|
107
|
-
- -
|
107
|
+
- - ~>
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '2.7'
|
110
|
-
- -
|
110
|
+
- - '>='
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: 2.7.3
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: mysql2
|
115
115
|
requirement: !ruby/object:Gem::Requirement
|
116
116
|
requirements:
|
117
|
-
- -
|
117
|
+
- - ~>
|
118
118
|
- !ruby/object:Gem::Version
|
119
119
|
version: '0.3'
|
120
|
-
- -
|
120
|
+
- - '>='
|
121
121
|
- !ruby/object:Gem::Version
|
122
122
|
version: 0.3.16
|
123
123
|
type: :runtime
|
124
124
|
prerelease: false
|
125
125
|
version_requirements: !ruby/object:Gem::Requirement
|
126
126
|
requirements:
|
127
|
-
- -
|
127
|
+
- - ~>
|
128
128
|
- !ruby/object:Gem::Version
|
129
129
|
version: '0.3'
|
130
|
-
- -
|
130
|
+
- - '>='
|
131
131
|
- !ruby/object:Gem::Version
|
132
132
|
version: 0.3.16
|
133
133
|
- !ruby/object:Gem::Dependency
|
134
134
|
name: pg
|
135
135
|
requirement: !ruby/object:Gem::Requirement
|
136
136
|
requirements:
|
137
|
-
- -
|
137
|
+
- - ~>
|
138
138
|
- !ruby/object:Gem::Version
|
139
139
|
version: '0.17'
|
140
|
-
- -
|
140
|
+
- - '>='
|
141
141
|
- !ruby/object:Gem::Version
|
142
142
|
version: 0.17.1
|
143
143
|
type: :runtime
|
144
144
|
prerelease: false
|
145
145
|
version_requirements: !ruby/object:Gem::Requirement
|
146
146
|
requirements:
|
147
|
-
- -
|
147
|
+
- - ~>
|
148
148
|
- !ruby/object:Gem::Version
|
149
149
|
version: '0.17'
|
150
|
-
- -
|
150
|
+
- - '>='
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: 0.17.1
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
154
|
name: sqlite3
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
|
-
- -
|
157
|
+
- - ~>
|
158
158
|
- !ruby/object:Gem::Version
|
159
159
|
version: '1.3'
|
160
|
-
- -
|
160
|
+
- - '>='
|
161
161
|
- !ruby/object:Gem::Version
|
162
162
|
version: 1.3.9
|
163
163
|
type: :runtime
|
164
164
|
prerelease: false
|
165
165
|
version_requirements: !ruby/object:Gem::Requirement
|
166
166
|
requirements:
|
167
|
-
- -
|
167
|
+
- - ~>
|
168
168
|
- !ruby/object:Gem::Version
|
169
169
|
version: '1.3'
|
170
|
-
- -
|
170
|
+
- - '>='
|
171
171
|
- !ruby/object:Gem::Version
|
172
172
|
version: 1.3.9
|
173
173
|
- !ruby/object:Gem::Dependency
|
174
174
|
name: activerecord
|
175
175
|
requirement: !ruby/object:Gem::Requirement
|
176
176
|
requirements:
|
177
|
-
- -
|
177
|
+
- - ~>
|
178
178
|
- !ruby/object:Gem::Version
|
179
179
|
version: '4.0'
|
180
|
-
- -
|
180
|
+
- - '>='
|
181
181
|
- !ruby/object:Gem::Version
|
182
182
|
version: 4.0.2
|
183
183
|
type: :runtime
|
184
184
|
prerelease: false
|
185
185
|
version_requirements: !ruby/object:Gem::Requirement
|
186
186
|
requirements:
|
187
|
-
- -
|
187
|
+
- - ~>
|
188
188
|
- !ruby/object:Gem::Version
|
189
189
|
version: '4.0'
|
190
|
-
- -
|
190
|
+
- - '>='
|
191
191
|
- !ruby/object:Gem::Version
|
192
192
|
version: 4.0.2
|
193
193
|
- !ruby/object:Gem::Dependency
|
194
194
|
name: ruby-readability
|
195
195
|
requirement: !ruby/object:Gem::Requirement
|
196
196
|
requirements:
|
197
|
-
- -
|
197
|
+
- - ~>
|
198
198
|
- !ruby/object:Gem::Version
|
199
199
|
version: '0.7'
|
200
|
-
- -
|
200
|
+
- - '>='
|
201
201
|
- !ruby/object:Gem::Version
|
202
202
|
version: 0.7.0
|
203
203
|
type: :runtime
|
204
204
|
prerelease: false
|
205
205
|
version_requirements: !ruby/object:Gem::Requirement
|
206
206
|
requirements:
|
207
|
-
- -
|
207
|
+
- - ~>
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: '0.7'
|
210
|
-
- -
|
210
|
+
- - '>='
|
211
211
|
- !ruby/object:Gem::Version
|
212
212
|
version: 0.7.0
|
213
213
|
- !ruby/object:Gem::Dependency
|
214
214
|
name: minitest
|
215
215
|
requirement: !ruby/object:Gem::Requirement
|
216
216
|
requirements:
|
217
|
-
- -
|
217
|
+
- - ~>
|
218
218
|
- !ruby/object:Gem::Version
|
219
219
|
version: '5.0'
|
220
|
-
- -
|
220
|
+
- - '>='
|
221
221
|
- !ruby/object:Gem::Version
|
222
222
|
version: 5.4.2
|
223
223
|
type: :development
|
224
224
|
prerelease: false
|
225
225
|
version_requirements: !ruby/object:Gem::Requirement
|
226
226
|
requirements:
|
227
|
-
- -
|
227
|
+
- - ~>
|
228
228
|
- !ruby/object:Gem::Version
|
229
229
|
version: '5.0'
|
230
|
-
- -
|
230
|
+
- - '>='
|
231
231
|
- !ruby/object:Gem::Version
|
232
232
|
version: 5.4.2
|
233
233
|
description: A simple news crawler. You can specify the structure of your xml or rss
|
@@ -237,9 +237,9 @@ executables: []
|
|
237
237
|
extensions: []
|
238
238
|
extra_rdoc_files: []
|
239
239
|
files:
|
240
|
-
- config/app_config.rb
|
241
240
|
- lib/sn_crawler.rb
|
242
241
|
- lib/sn_item.rb
|
242
|
+
- config/app_config.rb
|
243
243
|
homepage: http://marker68.github.io/simple-news-crawler
|
244
244
|
licenses:
|
245
245
|
- MIT
|
@@ -251,17 +251,17 @@ require_paths:
|
|
251
251
|
- lib
|
252
252
|
required_ruby_version: !ruby/object:Gem::Requirement
|
253
253
|
requirements:
|
254
|
-
- -
|
254
|
+
- - '>='
|
255
255
|
- !ruby/object:Gem::Version
|
256
|
-
version: 2.0.
|
256
|
+
version: 2.0.0
|
257
257
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
258
258
|
requirements:
|
259
|
-
- -
|
259
|
+
- - '>='
|
260
260
|
- !ruby/object:Gem::Version
|
261
261
|
version: '0'
|
262
262
|
requirements: []
|
263
263
|
rubyforge_project:
|
264
|
-
rubygems_version: 2.
|
264
|
+
rubygems_version: 2.0.14
|
265
265
|
signing_key:
|
266
266
|
specification_version: 4
|
267
267
|
summary: A simple RSS/XML news crawler
|