simple-news-crawler 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/sn_crawler.rb +76 -14
  3. data/lib/sn_item.rb +4 -0
  4. metadata +50 -50
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24fbc21d296bb28b773bb2fb6d2f955a73b67c4c
4
- data.tar.gz: 09ddb5446af9e6b2b3a2961f9c897b02f81f0c95
3
+ metadata.gz: ed289205cd65fe1a7651d9e8771a7e13cad35ac0
4
+ data.tar.gz: 3e8cb296f0e1230ed2164697c4ce04d87c5f043d
5
5
  SHA512:
6
- metadata.gz: 905f48fe28a797e4b375aa101ab42ff15c119c14d25356e5f4ebbe5b32d0b0fdf2b60659fa07f22e6ad6ac36442dbc2d9a3b80ed158a9dbd54cdf898df352e4f
7
- data.tar.gz: 4dd94b7507d88c73fde7705f592623a65116f1ffd037d5774eea51b8d78beb8d80ba5358360b00ed25177ba338a3a262ad732435d0d04b713e9fd014097419b0
6
+ metadata.gz: 5474812498ab53f32f3c495d9d4e1f76b26dbc0a3455d968434617d3d9e096e52763d5661bffee7fd93a1f9b9d9742a1e19860d1ce76d2908c89801ab2783719
7
+ data.tar.gz: a3dee0584791af0d8ca080206a40a801bb736382c4a18bb3d1d2c04a4978f075822b9e9ab420bde5291175b90485d4abdb384153aaf9964c2b5ec764eda0ce05
@@ -14,16 +14,51 @@ require 'xml'
14
14
  require 'nokogiri'
15
15
  require 'readability'
16
16
 
17
- ## A crawler class
17
+ ##
18
+ #
19
+ # A crawler class
20
+ #
21
+ # @example
22
+ # require 'sn_crawler'
23
+ #
24
+ # url = "http://vnexpress.net/rss"
25
+ # c = SNCrawler.new(url,"VNE","/channel/item:[title,description,pubDate,link]",nil)
26
+ # c.get_links(true)
27
+ # # => ["vnexpress.net/rss/tin-moi-nhat.rss",...]
28
+ #
29
+ # c.get_news(true)
30
+ # # => City in Jordan welcomes ISIS
31
+ # # Image: []
32
+ # # Now inserting City in Jordan welcomes ISIS
33
+ #
18
34
  class SNCrawler
19
- ## Initialize parameters
20
- ## structure format: /path/to/channel/item_name:[item_attributes_name]
21
- ## item_attributes_name[0] => title of the page
22
- ## item_attributes_name[1] => description of the page
23
- ## item_attributes_name[2] => publicity time of the page
24
- ## item_attributes_name[3] => link to the page
25
- ## For example: /channel/item:[title,description,pubDate,url]
26
- def initialize(source = "", name = "", structure = "", db_conf = {})
35
+ ##
36
+ #
37
+ # Initialize parameters
38
+ #
39
+ # source => The source URL
40
+ #
41
+ # name => A short name identifying the source (e.g. "VNE")
42
+ #
43
+ # structure =>
44
+ #
45
+ # structure format: /path/to/channel/item_name:[item_attributes_name]
46
+ #
47
+ # item_attributes_name[0] => title of the page
48
+ #
49
+ # item_attributes_name[1] => description of the page
50
+ #
51
+ # item_attributes_name[2] => publication time of the page
52
+ #
53
+ # item_attributes_name[3] => link to the page
54
+ #
55
+ # For example: /channel/item:[title,description,pubDate,url]
56
+ #
57
+ # db_conf => activerecord settings
58
+ #
59
+ # limit => stop fetching once this many news items have been collected (default: 100)
60
+ #
61
+ def initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100)
27
62
  ## The rss source's url
28
63
  @source = source
29
64
  @source_name = name
@@ -38,9 +73,16 @@ class SNCrawler
38
73
  else
39
74
  @use_db = false
40
75
  end
76
+ @limit = limit
41
77
  end
42
78
 
43
- ## Create table for our gem
79
+ ##
80
+ #
81
+ # Create table for our gem
82
+ #
83
+ # @example
84
+ # c.create_table("engine=MROONGA",true)
85
+ #
44
86
  def create_table(options = "", verbose = true)
45
87
  begin
46
88
  ActiveRecord::Migration.class_eval do
@@ -67,7 +109,10 @@ class SNCrawler
67
109
  return true
68
110
  end
69
111
 
70
- ## Get urls from a source url
112
+ ##
113
+ #
114
+ # Get urls from a source url
115
+ #
71
116
  def get_links(verbose = false)
72
117
  page = @agent.get(@source)
73
118
 
@@ -94,17 +139,28 @@ class SNCrawler
94
139
  end
95
140
  end
96
141
 
97
- ## Set Urls
142
+ ##
143
+ #
144
+ # Set Urls
145
+ #
98
146
  def set_url(url = [])
99
147
  @url = url
100
148
  end
101
149
 
102
- ## Clear urls
150
+ ##
151
+ #
152
+ # Clear urls
153
+ #
103
154
  def clear_url
104
155
  @url = []
105
156
  end
106
157
 
107
- ## Get news from urls
158
+ ##
159
+ #
160
+ # Get news from urls
161
+ #
162
+ # Note: call this method only after running get_links and, when using a DB, create_table.
163
+ #
108
164
  def get_news(verbose = false)
109
165
  count = 0
110
166
  channel_path = "."
@@ -183,6 +239,9 @@ class SNCrawler
183
239
  else
184
240
  end
185
241
  count = count + 1
242
+ if verbose then
243
+ puts "Got #{count} news"
244
+ end
186
245
  end
187
246
  end
188
247
  rescue => e
@@ -190,6 +249,9 @@ class SNCrawler
190
249
  puts "Error: #{e}"
191
250
  end
192
251
  end
252
+ if count >= @limit then
253
+ break
254
+ end
193
255
  end
194
256
 
195
257
  if verbose then
@@ -7,6 +7,10 @@ require 'pg'
7
7
  require 'sqlite3'
8
8
  require 'active_record'
9
9
 
10
+ ##
11
+ #
12
+ # An item class to store news
13
+ #
10
14
  class SNItem < ActiveRecord::Base
11
15
  self.table_name = "sn_news"
12
16
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple-news-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nguyen Anh Tuan
@@ -14,220 +14,220 @@ dependencies:
14
14
  name: json
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.8'
20
- - - ">="
20
+ - - '>='
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.8.1
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - "~>"
27
+ - - ~>
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.8'
30
- - - ">="
30
+ - - '>='
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.8.1
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: libxml-ruby
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - "~>"
37
+ - - ~>
38
38
  - !ruby/object:Gem::Version
39
39
  version: '2.7'
40
- - - ">="
40
+ - - '>='
41
41
  - !ruby/object:Gem::Version
42
42
  version: 2.7.0
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
- - - "~>"
47
+ - - ~>
48
48
  - !ruby/object:Gem::Version
49
49
  version: '2.7'
50
- - - ">="
50
+ - - '>='
51
51
  - !ruby/object:Gem::Version
52
52
  version: 2.7.0
53
53
  - !ruby/object:Gem::Dependency
54
54
  name: curb
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
- - - "~>"
57
+ - - ~>
58
58
  - !ruby/object:Gem::Version
59
59
  version: '0.8'
60
- - - ">="
60
+ - - '>='
61
61
  - !ruby/object:Gem::Version
62
62
  version: 0.8.6
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
- - - "~>"
67
+ - - ~>
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0.8'
70
- - - ">="
70
+ - - '>='
71
71
  - !ruby/object:Gem::Version
72
72
  version: 0.8.6
73
73
  - !ruby/object:Gem::Dependency
74
74
  name: nokogiri
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
- - - "~>"
77
+ - - ~>
78
78
  - !ruby/object:Gem::Version
79
79
  version: '1.6'
80
- - - ">="
80
+ - - '>='
81
81
  - !ruby/object:Gem::Version
82
82
  version: 1.6.3.1
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
89
  version: '1.6'
90
- - - ">="
90
+ - - '>='
91
91
  - !ruby/object:Gem::Version
92
92
  version: 1.6.3.1
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: mechanize
95
95
  requirement: !ruby/object:Gem::Requirement
96
96
  requirements:
97
- - - "~>"
97
+ - - ~>
98
98
  - !ruby/object:Gem::Version
99
99
  version: '2.7'
100
- - - ">="
100
+ - - '>='
101
101
  - !ruby/object:Gem::Version
102
102
  version: 2.7.3
103
103
  type: :runtime
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
- - - "~>"
107
+ - - ~>
108
108
  - !ruby/object:Gem::Version
109
109
  version: '2.7'
110
- - - ">="
110
+ - - '>='
111
111
  - !ruby/object:Gem::Version
112
112
  version: 2.7.3
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: mysql2
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - "~>"
117
+ - - ~>
118
118
  - !ruby/object:Gem::Version
119
119
  version: '0.3'
120
- - - ">="
120
+ - - '>='
121
121
  - !ruby/object:Gem::Version
122
122
  version: 0.3.16
123
123
  type: :runtime
124
124
  prerelease: false
125
125
  version_requirements: !ruby/object:Gem::Requirement
126
126
  requirements:
127
- - - "~>"
127
+ - - ~>
128
128
  - !ruby/object:Gem::Version
129
129
  version: '0.3'
130
- - - ">="
130
+ - - '>='
131
131
  - !ruby/object:Gem::Version
132
132
  version: 0.3.16
133
133
  - !ruby/object:Gem::Dependency
134
134
  name: pg
135
135
  requirement: !ruby/object:Gem::Requirement
136
136
  requirements:
137
- - - "~>"
137
+ - - ~>
138
138
  - !ruby/object:Gem::Version
139
139
  version: '0.17'
140
- - - ">="
140
+ - - '>='
141
141
  - !ruby/object:Gem::Version
142
142
  version: 0.17.1
143
143
  type: :runtime
144
144
  prerelease: false
145
145
  version_requirements: !ruby/object:Gem::Requirement
146
146
  requirements:
147
- - - "~>"
147
+ - - ~>
148
148
  - !ruby/object:Gem::Version
149
149
  version: '0.17'
150
- - - ">="
150
+ - - '>='
151
151
  - !ruby/object:Gem::Version
152
152
  version: 0.17.1
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: sqlite3
155
155
  requirement: !ruby/object:Gem::Requirement
156
156
  requirements:
157
- - - "~>"
157
+ - - ~>
158
158
  - !ruby/object:Gem::Version
159
159
  version: '1.3'
160
- - - ">="
160
+ - - '>='
161
161
  - !ruby/object:Gem::Version
162
162
  version: 1.3.9
163
163
  type: :runtime
164
164
  prerelease: false
165
165
  version_requirements: !ruby/object:Gem::Requirement
166
166
  requirements:
167
- - - "~>"
167
+ - - ~>
168
168
  - !ruby/object:Gem::Version
169
169
  version: '1.3'
170
- - - ">="
170
+ - - '>='
171
171
  - !ruby/object:Gem::Version
172
172
  version: 1.3.9
173
173
  - !ruby/object:Gem::Dependency
174
174
  name: activerecord
175
175
  requirement: !ruby/object:Gem::Requirement
176
176
  requirements:
177
- - - "~>"
177
+ - - ~>
178
178
  - !ruby/object:Gem::Version
179
179
  version: '4.0'
180
- - - ">="
180
+ - - '>='
181
181
  - !ruby/object:Gem::Version
182
182
  version: 4.0.2
183
183
  type: :runtime
184
184
  prerelease: false
185
185
  version_requirements: !ruby/object:Gem::Requirement
186
186
  requirements:
187
- - - "~>"
187
+ - - ~>
188
188
  - !ruby/object:Gem::Version
189
189
  version: '4.0'
190
- - - ">="
190
+ - - '>='
191
191
  - !ruby/object:Gem::Version
192
192
  version: 4.0.2
193
193
  - !ruby/object:Gem::Dependency
194
194
  name: ruby-readability
195
195
  requirement: !ruby/object:Gem::Requirement
196
196
  requirements:
197
- - - "~>"
197
+ - - ~>
198
198
  - !ruby/object:Gem::Version
199
199
  version: '0.7'
200
- - - ">="
200
+ - - '>='
201
201
  - !ruby/object:Gem::Version
202
202
  version: 0.7.0
203
203
  type: :runtime
204
204
  prerelease: false
205
205
  version_requirements: !ruby/object:Gem::Requirement
206
206
  requirements:
207
- - - "~>"
207
+ - - ~>
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0.7'
210
- - - ">="
210
+ - - '>='
211
211
  - !ruby/object:Gem::Version
212
212
  version: 0.7.0
213
213
  - !ruby/object:Gem::Dependency
214
214
  name: minitest
215
215
  requirement: !ruby/object:Gem::Requirement
216
216
  requirements:
217
- - - "~>"
217
+ - - ~>
218
218
  - !ruby/object:Gem::Version
219
219
  version: '5.0'
220
- - - ">="
220
+ - - '>='
221
221
  - !ruby/object:Gem::Version
222
222
  version: 5.4.2
223
223
  type: :development
224
224
  prerelease: false
225
225
  version_requirements: !ruby/object:Gem::Requirement
226
226
  requirements:
227
- - - "~>"
227
+ - - ~>
228
228
  - !ruby/object:Gem::Version
229
229
  version: '5.0'
230
- - - ">="
230
+ - - '>='
231
231
  - !ruby/object:Gem::Version
232
232
  version: 5.4.2
233
233
  description: A simple news crawler. You can specify the structure of your xml or rss
@@ -237,9 +237,9 @@ executables: []
237
237
  extensions: []
238
238
  extra_rdoc_files: []
239
239
  files:
240
- - config/app_config.rb
241
240
  - lib/sn_crawler.rb
242
241
  - lib/sn_item.rb
242
+ - config/app_config.rb
243
243
  homepage: http://marker68.github.io/simple-news-crawler
244
244
  licenses:
245
245
  - MIT
@@ -251,17 +251,17 @@ require_paths:
251
251
  - lib
252
252
  required_ruby_version: !ruby/object:Gem::Requirement
253
253
  requirements:
254
- - - ">="
254
+ - - '>='
255
255
  - !ruby/object:Gem::Version
256
- version: 2.0.1
256
+ version: 2.0.0
257
257
  required_rubygems_version: !ruby/object:Gem::Requirement
258
258
  requirements:
259
- - - ">="
259
+ - - '>='
260
260
  - !ruby/object:Gem::Version
261
261
  version: '0'
262
262
  requirements: []
263
263
  rubyforge_project:
264
- rubygems_version: 2.2.0
264
+ rubygems_version: 2.0.14
265
265
  signing_key:
266
266
  specification_version: 4
267
267
  summary: A simple RSS/XML news crawler