simple-news-crawler 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/sn_crawler.rb +76 -14
  3. data/lib/sn_item.rb +4 -0
  4. metadata +50 -50
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24fbc21d296bb28b773bb2fb6d2f955a73b67c4c
4
- data.tar.gz: 09ddb5446af9e6b2b3a2961f9c897b02f81f0c95
3
+ metadata.gz: ed289205cd65fe1a7651d9e8771a7e13cad35ac0
4
+ data.tar.gz: 3e8cb296f0e1230ed2164697c4ce04d87c5f043d
5
5
  SHA512:
6
- metadata.gz: 905f48fe28a797e4b375aa101ab42ff15c119c14d25356e5f4ebbe5b32d0b0fdf2b60659fa07f22e6ad6ac36442dbc2d9a3b80ed158a9dbd54cdf898df352e4f
7
- data.tar.gz: 4dd94b7507d88c73fde7705f592623a65116f1ffd037d5774eea51b8d78beb8d80ba5358360b00ed25177ba338a3a262ad732435d0d04b713e9fd014097419b0
6
+ metadata.gz: 5474812498ab53f32f3c495d9d4e1f76b26dbc0a3455d968434617d3d9e096e52763d5661bffee7fd93a1f9b9d9742a1e19860d1ce76d2908c89801ab2783719
7
+ data.tar.gz: a3dee0584791af0d8ca080206a40a801bb736382c4a18bb3d1d2c04a4978f075822b9e9ab420bde5291175b90485d4abdb384153aaf9964c2b5ec764eda0ce05
data/lib/sn_crawler.rb CHANGED
@@ -14,16 +14,51 @@ require 'xml'
14
14
  require 'nokogiri'
15
15
  require 'readability'
16
16
 
17
- ## A crawler class
17
+ ##
18
+ #
19
+ # A crawler class
20
+ #
21
+ # @example
22
+ # require 'sn_crawler'
23
+ #
24
+ # url = "http://vnexpress.net/rss"
25
+ # c = SNCrawler.new(url,"VNE","/channel/item:[title,description,pubDate,link]",nil)
26
+ # c.get_links(true)
27
+ # # => ["vnexpress.net/rss/tin-moi-nhat.rss",...]
28
+ #
29
+ # c.get_news(true)
30
+ # # => City in Jordan welcomes ISIS
31
+ # # Image: []
32
+ # # Now inserting City in Jordan welcomes ISIS
33
+ #
18
34
  class SNCrawler
19
- ## Initialize parameters
20
- ## structure format: /path/to/channel/item_name:[item_attributes_name]
21
- ## item_attributes_name[0] => title of the page
22
- ## item_attributes_name[1] => description of the page
23
- ## item_attributes_name[2] => publicity time of the page
24
- ## item_attributes_name[3] => link to the page
25
- ## For example: /channel/item:[title,description,pubDate,url]
26
- def initialize(source = "", name = "", structure = "", db_conf = {})
35
+ ##
36
+ #
37
+ # Initialize parameters
38
+ #
39
+ # source => The source URL
40
+ #
41
+ # name => Some lovely names
42
+ #
43
+ # structure =>
44
+ #
45
+ # structure format: /path/to/channel/item_name:[item_attributes_name]
46
+ #
47
+ # item_attributes_name[0] => title of the page
48
+ #
49
+ # item_attributes_name[1] => description of the page
50
+ #
51
+ # item_attributes_name[2] => publicity time of the page
52
+ #
53
+ # item_attributes_name[3] => link to the page
54
+ #
55
+ # For example: /channel/item:[title,description,pubDate,url]
56
+ #
57
+ # db_conf => activerecord settings
58
+ #
59
+ # limit the number of news that you want
60
+ #
61
+ def initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100)
27
62
  ## The rss source's url
28
63
  @source = source
29
64
  @source_name = name
@@ -38,9 +73,16 @@ class SNCrawler
38
73
  else
39
74
  @use_db = false
40
75
  end
76
+ @limit = limit
41
77
  end
42
78
 
43
- ## Create table for our gem
79
+ ##
80
+ #
81
+ # Create table for our gem
82
+ #
83
+ # @example
84
+ # c.create_table("engine=MROONGA",true)
85
+ #
44
86
  def create_table(options = "", verbose = true)
45
87
  begin
46
88
  ActiveRecord::Migration.class_eval do
@@ -67,7 +109,10 @@ class SNCrawler
67
109
  return true
68
110
  end
69
111
 
70
- ## Get urls from a source url
112
+ ##
113
+ #
114
+ # Get urls from a source url
115
+ #
71
116
  def get_links(verbose = false)
72
117
  page = @agent.get(@source)
73
118
 
@@ -94,17 +139,28 @@ class SNCrawler
94
139
  end
95
140
  end
96
141
 
97
- ## Set Urls
142
+ ##
143
+ #
144
+ # Set Urls
145
+ #
98
146
  def set_url(url = [])
99
147
  @url = url
100
148
  end
101
149
 
102
- ## Clear urls
150
+ ##
151
+ #
152
+ # Clear urls
153
+ #
103
154
  def clear_url
104
155
  @url = []
105
156
  end
106
157
 
107
- ## Get news from urls
158
+ ##
159
+ #
160
+ # Get news from urls
161
+ #
162
+ # Note that you have to run this method ONLY after running get_links and create_table(in case of using DB)
163
+ #
108
164
  def get_news(verbose = false)
109
165
  count = 0
110
166
  channel_path = "."
@@ -183,6 +239,9 @@ class SNCrawler
183
239
  else
184
240
  end
185
241
  count = count + 1
242
+ if verbose then
243
+ puts "Got #{count} news"
244
+ end
186
245
  end
187
246
  end
188
247
  rescue => e
@@ -190,6 +249,9 @@ class SNCrawler
190
249
  puts "Error: #{e}"
191
250
  end
192
251
  end
252
+ if count >= @limit then
253
+ break
254
+ end
193
255
  end
194
256
 
195
257
  if verbose then
data/lib/sn_item.rb CHANGED
@@ -7,6 +7,10 @@ require 'pg'
7
7
  require 'sqlite3'
8
8
  require 'active_record'
9
9
 
10
+ ##
11
+ #
12
+ # An item class to store news
13
+ #
10
14
  class SNItem < ActiveRecord::Base
11
15
  self.table_name = "sn_news"
12
16
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple-news-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nguyen Anh Tuan
@@ -14,220 +14,220 @@ dependencies:
14
14
  name: json
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.8'
20
- - - ">="
20
+ - - '>='
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.8.1
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - "~>"
27
+ - - ~>
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.8'
30
- - - ">="
30
+ - - '>='
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.8.1
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: libxml-ruby
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - "~>"
37
+ - - ~>
38
38
  - !ruby/object:Gem::Version
39
39
  version: '2.7'
40
- - - ">="
40
+ - - '>='
41
41
  - !ruby/object:Gem::Version
42
42
  version: 2.7.0
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
- - - "~>"
47
+ - - ~>
48
48
  - !ruby/object:Gem::Version
49
49
  version: '2.7'
50
- - - ">="
50
+ - - '>='
51
51
  - !ruby/object:Gem::Version
52
52
  version: 2.7.0
53
53
  - !ruby/object:Gem::Dependency
54
54
  name: curb
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
- - - "~>"
57
+ - - ~>
58
58
  - !ruby/object:Gem::Version
59
59
  version: '0.8'
60
- - - ">="
60
+ - - '>='
61
61
  - !ruby/object:Gem::Version
62
62
  version: 0.8.6
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
- - - "~>"
67
+ - - ~>
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0.8'
70
- - - ">="
70
+ - - '>='
71
71
  - !ruby/object:Gem::Version
72
72
  version: 0.8.6
73
73
  - !ruby/object:Gem::Dependency
74
74
  name: nokogiri
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
- - - "~>"
77
+ - - ~>
78
78
  - !ruby/object:Gem::Version
79
79
  version: '1.6'
80
- - - ">="
80
+ - - '>='
81
81
  - !ruby/object:Gem::Version
82
82
  version: 1.6.3.1
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
89
  version: '1.6'
90
- - - ">="
90
+ - - '>='
91
91
  - !ruby/object:Gem::Version
92
92
  version: 1.6.3.1
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: mechanize
95
95
  requirement: !ruby/object:Gem::Requirement
96
96
  requirements:
97
- - - "~>"
97
+ - - ~>
98
98
  - !ruby/object:Gem::Version
99
99
  version: '2.7'
100
- - - ">="
100
+ - - '>='
101
101
  - !ruby/object:Gem::Version
102
102
  version: 2.7.3
103
103
  type: :runtime
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
- - - "~>"
107
+ - - ~>
108
108
  - !ruby/object:Gem::Version
109
109
  version: '2.7'
110
- - - ">="
110
+ - - '>='
111
111
  - !ruby/object:Gem::Version
112
112
  version: 2.7.3
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: mysql2
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - "~>"
117
+ - - ~>
118
118
  - !ruby/object:Gem::Version
119
119
  version: '0.3'
120
- - - ">="
120
+ - - '>='
121
121
  - !ruby/object:Gem::Version
122
122
  version: 0.3.16
123
123
  type: :runtime
124
124
  prerelease: false
125
125
  version_requirements: !ruby/object:Gem::Requirement
126
126
  requirements:
127
- - - "~>"
127
+ - - ~>
128
128
  - !ruby/object:Gem::Version
129
129
  version: '0.3'
130
- - - ">="
130
+ - - '>='
131
131
  - !ruby/object:Gem::Version
132
132
  version: 0.3.16
133
133
  - !ruby/object:Gem::Dependency
134
134
  name: pg
135
135
  requirement: !ruby/object:Gem::Requirement
136
136
  requirements:
137
- - - "~>"
137
+ - - ~>
138
138
  - !ruby/object:Gem::Version
139
139
  version: '0.17'
140
- - - ">="
140
+ - - '>='
141
141
  - !ruby/object:Gem::Version
142
142
  version: 0.17.1
143
143
  type: :runtime
144
144
  prerelease: false
145
145
  version_requirements: !ruby/object:Gem::Requirement
146
146
  requirements:
147
- - - "~>"
147
+ - - ~>
148
148
  - !ruby/object:Gem::Version
149
149
  version: '0.17'
150
- - - ">="
150
+ - - '>='
151
151
  - !ruby/object:Gem::Version
152
152
  version: 0.17.1
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: sqlite3
155
155
  requirement: !ruby/object:Gem::Requirement
156
156
  requirements:
157
- - - "~>"
157
+ - - ~>
158
158
  - !ruby/object:Gem::Version
159
159
  version: '1.3'
160
- - - ">="
160
+ - - '>='
161
161
  - !ruby/object:Gem::Version
162
162
  version: 1.3.9
163
163
  type: :runtime
164
164
  prerelease: false
165
165
  version_requirements: !ruby/object:Gem::Requirement
166
166
  requirements:
167
- - - "~>"
167
+ - - ~>
168
168
  - !ruby/object:Gem::Version
169
169
  version: '1.3'
170
- - - ">="
170
+ - - '>='
171
171
  - !ruby/object:Gem::Version
172
172
  version: 1.3.9
173
173
  - !ruby/object:Gem::Dependency
174
174
  name: activerecord
175
175
  requirement: !ruby/object:Gem::Requirement
176
176
  requirements:
177
- - - "~>"
177
+ - - ~>
178
178
  - !ruby/object:Gem::Version
179
179
  version: '4.0'
180
- - - ">="
180
+ - - '>='
181
181
  - !ruby/object:Gem::Version
182
182
  version: 4.0.2
183
183
  type: :runtime
184
184
  prerelease: false
185
185
  version_requirements: !ruby/object:Gem::Requirement
186
186
  requirements:
187
- - - "~>"
187
+ - - ~>
188
188
  - !ruby/object:Gem::Version
189
189
  version: '4.0'
190
- - - ">="
190
+ - - '>='
191
191
  - !ruby/object:Gem::Version
192
192
  version: 4.0.2
193
193
  - !ruby/object:Gem::Dependency
194
194
  name: ruby-readability
195
195
  requirement: !ruby/object:Gem::Requirement
196
196
  requirements:
197
- - - "~>"
197
+ - - ~>
198
198
  - !ruby/object:Gem::Version
199
199
  version: '0.7'
200
- - - ">="
200
+ - - '>='
201
201
  - !ruby/object:Gem::Version
202
202
  version: 0.7.0
203
203
  type: :runtime
204
204
  prerelease: false
205
205
  version_requirements: !ruby/object:Gem::Requirement
206
206
  requirements:
207
- - - "~>"
207
+ - - ~>
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0.7'
210
- - - ">="
210
+ - - '>='
211
211
  - !ruby/object:Gem::Version
212
212
  version: 0.7.0
213
213
  - !ruby/object:Gem::Dependency
214
214
  name: minitest
215
215
  requirement: !ruby/object:Gem::Requirement
216
216
  requirements:
217
- - - "~>"
217
+ - - ~>
218
218
  - !ruby/object:Gem::Version
219
219
  version: '5.0'
220
- - - ">="
220
+ - - '>='
221
221
  - !ruby/object:Gem::Version
222
222
  version: 5.4.2
223
223
  type: :development
224
224
  prerelease: false
225
225
  version_requirements: !ruby/object:Gem::Requirement
226
226
  requirements:
227
- - - "~>"
227
+ - - ~>
228
228
  - !ruby/object:Gem::Version
229
229
  version: '5.0'
230
- - - ">="
230
+ - - '>='
231
231
  - !ruby/object:Gem::Version
232
232
  version: 5.4.2
233
233
  description: A simple news crawler. You can specify the structure of your xml or rss
@@ -237,9 +237,9 @@ executables: []
237
237
  extensions: []
238
238
  extra_rdoc_files: []
239
239
  files:
240
- - config/app_config.rb
241
240
  - lib/sn_crawler.rb
242
241
  - lib/sn_item.rb
242
+ - config/app_config.rb
243
243
  homepage: http://marker68.github.io/simple-news-crawler
244
244
  licenses:
245
245
  - MIT
@@ -251,17 +251,17 @@ require_paths:
251
251
  - lib
252
252
  required_ruby_version: !ruby/object:Gem::Requirement
253
253
  requirements:
254
- - - ">="
254
+ - - '>='
255
255
  - !ruby/object:Gem::Version
256
- version: 2.0.1
256
+ version: 2.0.0
257
257
  required_rubygems_version: !ruby/object:Gem::Requirement
258
258
  requirements:
259
- - - ">="
259
+ - - '>='
260
260
  - !ruby/object:Gem::Version
261
261
  version: '0'
262
262
  requirements: []
263
263
  rubyforge_project:
264
- rubygems_version: 2.2.0
264
+ rubygems_version: 2.0.14
265
265
  signing_key:
266
266
  specification_version: 4
267
267
  summary: A simple RSS/XML news crawler