simple-news-crawler 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sn_crawler.rb +76 -14
- data/lib/sn_item.rb +4 -0
- metadata +50 -50
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ed289205cd65fe1a7651d9e8771a7e13cad35ac0
|
4
|
+
data.tar.gz: 3e8cb296f0e1230ed2164697c4ce04d87c5f043d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5474812498ab53f32f3c495d9d4e1f76b26dbc0a3455d968434617d3d9e096e52763d5661bffee7fd93a1f9b9d9742a1e19860d1ce76d2908c89801ab2783719
|
7
|
+
data.tar.gz: a3dee0584791af0d8ca080206a40a801bb736382c4a18bb3d1d2c04a4978f075822b9e9ab420bde5291175b90485d4abdb384153aaf9964c2b5ec764eda0ce05
|
data/lib/sn_crawler.rb
CHANGED
@@ -14,16 +14,51 @@ require 'xml'
|
|
14
14
|
require 'nokogiri'
|
15
15
|
require 'readability'
|
16
16
|
|
17
|
-
##
|
17
|
+
##
|
18
|
+
#
|
19
|
+
# A crawler class
|
20
|
+
#
|
21
|
+
# @example
|
22
|
+
# require 'sn_crawler'
|
23
|
+
#
|
24
|
+
# url = "http://vnexpress.net/rss"
|
25
|
+
# c = SNCrawler.new(url,"VNE","/channel/item:[title,description,pubDate,link]",nil)
|
26
|
+
# c.get_links(true)
|
27
|
+
# # => ["vnexpress.net/rss/tin-moi-nhat.rss",...]
|
28
|
+
#
|
29
|
+
# c.get_news(true)
|
30
|
+
# # => City in Jordan welcomes ISIS
|
31
|
+
# # Image: []
|
32
|
+
# # Now inserting City in Jordan welcomes ISIS
|
33
|
+
#
|
18
34
|
class SNCrawler
|
19
|
-
##
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
35
|
+
##
|
36
|
+
#
|
37
|
+
# Initialize parameters
|
38
|
+
#
|
39
|
+
# source => The source URL
|
40
|
+
#
|
41
|
+
# name => Some lovely names
|
42
|
+
#
|
43
|
+
# structure =>
|
44
|
+
#
|
45
|
+
# structure format: /path/to/channel/item_name:[item_attributes_name]
|
46
|
+
#
|
47
|
+
# item_attributes_name[0] => title of the page
|
48
|
+
#
|
49
|
+
# item_attributes_name[1] => description of the page
|
50
|
+
#
|
51
|
+
# item_attributes_name[2] => publication time of the page
|
52
|
+
#
|
53
|
+
# item_attributes_name[3] => link to the page
|
54
|
+
#
|
55
|
+
# For example: /channel/item:[title,description,pubDate,url]
|
56
|
+
#
|
57
|
+
# db_conf => activerecord settings
|
58
|
+
#
|
59
|
+
# limit => the maximum number of news items to fetch
|
60
|
+
#
|
61
|
+
def initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100)
|
27
62
|
## The rss source's url
|
28
63
|
@source = source
|
29
64
|
@source_name = name
|
@@ -38,9 +73,16 @@ class SNCrawler
|
|
38
73
|
else
|
39
74
|
@use_db = false
|
40
75
|
end
|
76
|
+
@limit = limit
|
41
77
|
end
|
42
78
|
|
43
|
-
##
|
79
|
+
##
|
80
|
+
#
|
81
|
+
# Create table for our gem
|
82
|
+
#
|
83
|
+
# @example
|
84
|
+
# c.create_table("engine=MROONGA",true)
|
85
|
+
#
|
44
86
|
def create_table(options = "", verbose = true)
|
45
87
|
begin
|
46
88
|
ActiveRecord::Migration.class_eval do
|
@@ -67,7 +109,10 @@ class SNCrawler
|
|
67
109
|
return true
|
68
110
|
end
|
69
111
|
|
70
|
-
##
|
112
|
+
##
|
113
|
+
#
|
114
|
+
# Get urls from a source url
|
115
|
+
#
|
71
116
|
def get_links(verbose = false)
|
72
117
|
page = @agent.get(@source)
|
73
118
|
|
@@ -94,17 +139,28 @@ class SNCrawler
|
|
94
139
|
end
|
95
140
|
end
|
96
141
|
|
97
|
-
##
|
142
|
+
##
|
143
|
+
#
|
144
|
+
# Set Urls
|
145
|
+
#
|
98
146
|
def set_url(url = [])
|
99
147
|
@url = url
|
100
148
|
end
|
101
149
|
|
102
|
-
##
|
150
|
+
##
|
151
|
+
#
|
152
|
+
# Clear urls
|
153
|
+
#
|
103
154
|
def clear_url
|
104
155
|
@url = []
|
105
156
|
end
|
106
157
|
|
107
|
-
##
|
158
|
+
##
|
159
|
+
#
|
160
|
+
# Get news from urls
|
161
|
+
#
|
162
|
+
# Note: run this method ONLY after running get_links and, when using a DB, create_table
|
163
|
+
#
|
108
164
|
def get_news(verbose = false)
|
109
165
|
count = 0
|
110
166
|
channel_path = "."
|
@@ -183,6 +239,9 @@ class SNCrawler
|
|
183
239
|
else
|
184
240
|
end
|
185
241
|
count = count + 1
|
242
|
+
if verbose then
|
243
|
+
puts "Got #{count} news"
|
244
|
+
end
|
186
245
|
end
|
187
246
|
end
|
188
247
|
rescue => e
|
@@ -190,6 +249,9 @@ class SNCrawler
|
|
190
249
|
puts "Error: #{e}"
|
191
250
|
end
|
192
251
|
end
|
252
|
+
if count >= @limit then
|
253
|
+
break
|
254
|
+
end
|
193
255
|
end
|
194
256
|
|
195
257
|
if verbose then
|
data/lib/sn_item.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple-news-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nguyen Anh Tuan
|
@@ -14,220 +14,220 @@ dependencies:
|
|
14
14
|
name: json
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.8'
|
20
|
-
- -
|
20
|
+
- - '>='
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.8.1
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- -
|
27
|
+
- - ~>
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '1.8'
|
30
|
-
- -
|
30
|
+
- - '>='
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.8.1
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: libxml-ruby
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
|
-
- -
|
37
|
+
- - ~>
|
38
38
|
- !ruby/object:Gem::Version
|
39
39
|
version: '2.7'
|
40
|
-
- -
|
40
|
+
- - '>='
|
41
41
|
- !ruby/object:Gem::Version
|
42
42
|
version: 2.7.0
|
43
43
|
type: :runtime
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
|
-
- -
|
47
|
+
- - ~>
|
48
48
|
- !ruby/object:Gem::Version
|
49
49
|
version: '2.7'
|
50
|
-
- -
|
50
|
+
- - '>='
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: 2.7.0
|
53
53
|
- !ruby/object:Gem::Dependency
|
54
54
|
name: curb
|
55
55
|
requirement: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
|
-
- -
|
57
|
+
- - ~>
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '0.8'
|
60
|
-
- -
|
60
|
+
- - '>='
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: 0.8.6
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
|
-
- -
|
67
|
+
- - ~>
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0.8'
|
70
|
-
- -
|
70
|
+
- - '>='
|
71
71
|
- !ruby/object:Gem::Version
|
72
72
|
version: 0.8.6
|
73
73
|
- !ruby/object:Gem::Dependency
|
74
74
|
name: nokogiri
|
75
75
|
requirement: !ruby/object:Gem::Requirement
|
76
76
|
requirements:
|
77
|
-
- -
|
77
|
+
- - ~>
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '1.6'
|
80
|
-
- -
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 1.6.3.1
|
83
83
|
type: :runtime
|
84
84
|
prerelease: false
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - ~>
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '1.6'
|
90
|
-
- -
|
90
|
+
- - '>='
|
91
91
|
- !ruby/object:Gem::Version
|
92
92
|
version: 1.6.3.1
|
93
93
|
- !ruby/object:Gem::Dependency
|
94
94
|
name: mechanize
|
95
95
|
requirement: !ruby/object:Gem::Requirement
|
96
96
|
requirements:
|
97
|
-
- -
|
97
|
+
- - ~>
|
98
98
|
- !ruby/object:Gem::Version
|
99
99
|
version: '2.7'
|
100
|
-
- -
|
100
|
+
- - '>='
|
101
101
|
- !ruby/object:Gem::Version
|
102
102
|
version: 2.7.3
|
103
103
|
type: :runtime
|
104
104
|
prerelease: false
|
105
105
|
version_requirements: !ruby/object:Gem::Requirement
|
106
106
|
requirements:
|
107
|
-
- -
|
107
|
+
- - ~>
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '2.7'
|
110
|
-
- -
|
110
|
+
- - '>='
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: 2.7.3
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: mysql2
|
115
115
|
requirement: !ruby/object:Gem::Requirement
|
116
116
|
requirements:
|
117
|
-
- -
|
117
|
+
- - ~>
|
118
118
|
- !ruby/object:Gem::Version
|
119
119
|
version: '0.3'
|
120
|
-
- -
|
120
|
+
- - '>='
|
121
121
|
- !ruby/object:Gem::Version
|
122
122
|
version: 0.3.16
|
123
123
|
type: :runtime
|
124
124
|
prerelease: false
|
125
125
|
version_requirements: !ruby/object:Gem::Requirement
|
126
126
|
requirements:
|
127
|
-
- -
|
127
|
+
- - ~>
|
128
128
|
- !ruby/object:Gem::Version
|
129
129
|
version: '0.3'
|
130
|
-
- -
|
130
|
+
- - '>='
|
131
131
|
- !ruby/object:Gem::Version
|
132
132
|
version: 0.3.16
|
133
133
|
- !ruby/object:Gem::Dependency
|
134
134
|
name: pg
|
135
135
|
requirement: !ruby/object:Gem::Requirement
|
136
136
|
requirements:
|
137
|
-
- -
|
137
|
+
- - ~>
|
138
138
|
- !ruby/object:Gem::Version
|
139
139
|
version: '0.17'
|
140
|
-
- -
|
140
|
+
- - '>='
|
141
141
|
- !ruby/object:Gem::Version
|
142
142
|
version: 0.17.1
|
143
143
|
type: :runtime
|
144
144
|
prerelease: false
|
145
145
|
version_requirements: !ruby/object:Gem::Requirement
|
146
146
|
requirements:
|
147
|
-
- -
|
147
|
+
- - ~>
|
148
148
|
- !ruby/object:Gem::Version
|
149
149
|
version: '0.17'
|
150
|
-
- -
|
150
|
+
- - '>='
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: 0.17.1
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
154
|
name: sqlite3
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
|
-
- -
|
157
|
+
- - ~>
|
158
158
|
- !ruby/object:Gem::Version
|
159
159
|
version: '1.3'
|
160
|
-
- -
|
160
|
+
- - '>='
|
161
161
|
- !ruby/object:Gem::Version
|
162
162
|
version: 1.3.9
|
163
163
|
type: :runtime
|
164
164
|
prerelease: false
|
165
165
|
version_requirements: !ruby/object:Gem::Requirement
|
166
166
|
requirements:
|
167
|
-
- -
|
167
|
+
- - ~>
|
168
168
|
- !ruby/object:Gem::Version
|
169
169
|
version: '1.3'
|
170
|
-
- -
|
170
|
+
- - '>='
|
171
171
|
- !ruby/object:Gem::Version
|
172
172
|
version: 1.3.9
|
173
173
|
- !ruby/object:Gem::Dependency
|
174
174
|
name: activerecord
|
175
175
|
requirement: !ruby/object:Gem::Requirement
|
176
176
|
requirements:
|
177
|
-
- -
|
177
|
+
- - ~>
|
178
178
|
- !ruby/object:Gem::Version
|
179
179
|
version: '4.0'
|
180
|
-
- -
|
180
|
+
- - '>='
|
181
181
|
- !ruby/object:Gem::Version
|
182
182
|
version: 4.0.2
|
183
183
|
type: :runtime
|
184
184
|
prerelease: false
|
185
185
|
version_requirements: !ruby/object:Gem::Requirement
|
186
186
|
requirements:
|
187
|
-
- -
|
187
|
+
- - ~>
|
188
188
|
- !ruby/object:Gem::Version
|
189
189
|
version: '4.0'
|
190
|
-
- -
|
190
|
+
- - '>='
|
191
191
|
- !ruby/object:Gem::Version
|
192
192
|
version: 4.0.2
|
193
193
|
- !ruby/object:Gem::Dependency
|
194
194
|
name: ruby-readability
|
195
195
|
requirement: !ruby/object:Gem::Requirement
|
196
196
|
requirements:
|
197
|
-
- -
|
197
|
+
- - ~>
|
198
198
|
- !ruby/object:Gem::Version
|
199
199
|
version: '0.7'
|
200
|
-
- -
|
200
|
+
- - '>='
|
201
201
|
- !ruby/object:Gem::Version
|
202
202
|
version: 0.7.0
|
203
203
|
type: :runtime
|
204
204
|
prerelease: false
|
205
205
|
version_requirements: !ruby/object:Gem::Requirement
|
206
206
|
requirements:
|
207
|
-
- -
|
207
|
+
- - ~>
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: '0.7'
|
210
|
-
- -
|
210
|
+
- - '>='
|
211
211
|
- !ruby/object:Gem::Version
|
212
212
|
version: 0.7.0
|
213
213
|
- !ruby/object:Gem::Dependency
|
214
214
|
name: minitest
|
215
215
|
requirement: !ruby/object:Gem::Requirement
|
216
216
|
requirements:
|
217
|
-
- -
|
217
|
+
- - ~>
|
218
218
|
- !ruby/object:Gem::Version
|
219
219
|
version: '5.0'
|
220
|
-
- -
|
220
|
+
- - '>='
|
221
221
|
- !ruby/object:Gem::Version
|
222
222
|
version: 5.4.2
|
223
223
|
type: :development
|
224
224
|
prerelease: false
|
225
225
|
version_requirements: !ruby/object:Gem::Requirement
|
226
226
|
requirements:
|
227
|
-
- -
|
227
|
+
- - ~>
|
228
228
|
- !ruby/object:Gem::Version
|
229
229
|
version: '5.0'
|
230
|
-
- -
|
230
|
+
- - '>='
|
231
231
|
- !ruby/object:Gem::Version
|
232
232
|
version: 5.4.2
|
233
233
|
description: A simple news crawler. You can specify the structure of your xml or rss
|
@@ -237,9 +237,9 @@ executables: []
|
|
237
237
|
extensions: []
|
238
238
|
extra_rdoc_files: []
|
239
239
|
files:
|
240
|
-
- config/app_config.rb
|
241
240
|
- lib/sn_crawler.rb
|
242
241
|
- lib/sn_item.rb
|
242
|
+
- config/app_config.rb
|
243
243
|
homepage: http://marker68.github.io/simple-news-crawler
|
244
244
|
licenses:
|
245
245
|
- MIT
|
@@ -251,17 +251,17 @@ require_paths:
|
|
251
251
|
- lib
|
252
252
|
required_ruby_version: !ruby/object:Gem::Requirement
|
253
253
|
requirements:
|
254
|
-
- -
|
254
|
+
- - '>='
|
255
255
|
- !ruby/object:Gem::Version
|
256
|
-
version: 2.0.
|
256
|
+
version: 2.0.0
|
257
257
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
258
258
|
requirements:
|
259
|
-
- -
|
259
|
+
- - '>='
|
260
260
|
- !ruby/object:Gem::Version
|
261
261
|
version: '0'
|
262
262
|
requirements: []
|
263
263
|
rubyforge_project:
|
264
|
-
rubygems_version: 2.
|
264
|
+
rubygems_version: 2.0.14
|
265
265
|
signing_key:
|
266
266
|
specification_version: 4
|
267
267
|
summary: A simple RSS/XML news crawler
|