simple-news-crawler 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 989f5fa8ff63672a63845c07ac225ff46ecf7786
4
+ data.tar.gz: 4c47b180951f0741d4c4a0abc131677a19054b83
5
+ SHA512:
6
+ metadata.gz: 1654a851eaccbc671176151ac137a9ca0ada40e96d6edc41f2f7404a5328111fb5530424f6f9dd83e93127c36e84b873b4aa971f23ab3f67084d3fd3a87158f9
7
+ data.tar.gz: 1a2a58e2b2393b2c7f36710703e090d0c8ab9d439d16019f44a4e234451c55a647f14de47a2179c6dad503cd1f1edd7f03979257ed583fffda6f2ecf63c304e7
@@ -0,0 +1,2 @@
1
# Configuration placeholder: the class body is empty, so it currently declares no settings or methods.
+ class AppConfig
2
+ end
@@ -0,0 +1,200 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'sn_item'
4
+ require 'mysql2'
5
+ require 'pg'
6
+ require 'sqlite3'
7
+ require 'active_record'
8
+ require 'mechanize'
9
+ require 'date'
10
+ require 'curb'
11
+ require 'zlib'
12
+ require 'digest/sha2'
13
+ require 'xml'
14
+ require 'nokogiri'
15
+ require 'readability'
16
+
17
## A crawler class.
##
## Collects RSS/XML feed URLs from a source page, downloads every feed,
## extracts the news items described by the +structure+ string and, when a
## database configuration was given, stores them through the SNItem model.
class SNCrawler
  ## Feed URLs longer than this many characters are ignored by #get_links.
  MAX_URL_LENGTH = 50
  ## Language stored for channels that carry no <language> element.
  DEFAULT_LANG = "en_US"
  ## Format of the publication time expected inside feed items.
  PUBDATE_FORMAT = "%A, %d %B %Y %H:%M:%S %Z"
  ## Format of the timestamps handed to the database.
  DB_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

  ## Initialize parameters
  ## source    => url of the page that lists the rss/xml feeds
  ## name      => human readable name of the source
  ## structure => /path/to/channel/item_name:[item_attributes_name]
  ##   item_attributes_name[0] => title of the page
  ##   item_attributes_name[1] => description of the page
  ##   item_attributes_name[2] => publicity time of the page
  ##   item_attributes_name[3] => link to the page
  ##   For example: /channel/item:[title,description,pubDate,url]
  ## db_conf   => ActiveRecord connection settings; nil or empty means
  ##              "do not use a database"
  def initialize(source = "", name = "", structure = "", db_conf = {})
    ## The rss source's url
    @source = source
    @source_name = name
    @structure = structure
    ## Crawled urls will be stored here
    @url = []
    @agent = Mechanize.new
    ## The default db_conf is an empty Hash, which is not nil. Connecting with
    ## it can only fail, so an empty configuration also means "no database".
    @use_db = !(db_conf.nil? || (db_conf.respond_to?(:empty?) && db_conf.empty?))
    ## Establish a connection to DB server
    ActiveRecord::Base.establish_connection(db_conf) if @use_db
  end

  ## Create table for our gem
  ## options => passed through as the :options of create_table
  ## verbose => print the error message when the migration fails
  ## Returns true on success and false when any error was raised.
  def create_table(options = "", verbose = true)
    ActiveRecord::Migration.class_eval do
      create_table :sn_news, :options => options do |t|
        t.string :lang
        t.string :title
        t.text :description
        t.text :content
        t.string :link
        t.string :images
        t.datetime :pubtime

        t.timestamps
      end

      add_index :sn_news, [:title], :unique => true, :name => "unique_title_on_news"
    end
    true
  rescue => e
    puts "Error(s): #{e.to_s}" if verbose
    false
  end

  ## Get urls from a source url
  ## Every link of the source page whose href matches .rss/.xml is turned
  ## into an absolute url and remembered once, unless it is longer than
  ## MAX_URL_LENGTH characters.
  def get_links(verbose = false)
    page = @agent.get(@source)

    page.links_with(:href => /\.(rss|xml)/).each do |link|
      src = absolute_url(link.href)
      next if src.nil? || src.length > MAX_URL_LENGTH || @url.include?(src)
      @url << src
    end

    puts @url.to_s if verbose
  end

  ## Set Urls
  def set_url(url = [])
    @url = url
  end

  ## Clear urls
  def clear_url
    @url = []
  end

  ## Get news from urls
  ## Downloads every stored feed url, walks its channels and items and stores
  ## each item (when a database is in use). A failure while handling one feed
  ## is reported (when verbose) and the crawl continues with the next feed.
  ## Nothing is printed unless verbose is true.
  ## Raises ArgumentError when the structure string cannot be understood.
  def get_news(verbose = false)
    count = 0
    channel_path, item_tag, tags = parse_structure

    @url.each do |u|
      begin
        ## The download is inside the rescue so that one unreachable feed
        ## does not abort the whole crawl.
        feed = Curl.get(u.to_s)
        document = XML::Parser.string(feed.body_str).parse
        ## For each channel processing the data
        document.root.find(channel_path).each do |channel|
          ## The <language> element is optional; fall back before calling
          ## #content on it.
          lang_node = channel.find_first('language')
          lang = lang_node.nil? ? nil : lang_node.content
          lang = DEFAULT_LANG if lang.nil?

          channel.find(item_tag).each do |item|
            attributes = extract_item(item, tags, lang, verbose)
            puts "Now inserting #{attributes[:title]}" if verbose
            save_item(attributes, verbose)
            count = count + 1
          end
        end
      rescue => e
        puts "Error: #{e}" if verbose
      end
    end

    puts "We got #{count} news today." if verbose
  end

  ## Close the database connection, if one was established by #initialize.
  def finalize
    ActiveRecord::Base.connection.close if @use_db
  end

  private

  ## Turn a link's href into an absolute url, resolving relative hrefs
  ## against the source url. Returns nil when that is not possible.
  def absolute_url(href)
    return nil if href.nil?
    return href if href =~ %r{\Ahttps?://}i
    URI.join(@source, href).to_s
  rescue URI::Error
    nil
  end

  ## Split the structure string into the channel xpath, the item tag name
  ## and a hash with the four item attribute tag names.
  def parse_structure
    parts = @structure.to_s.split(/\//)
    ## Split on the first ':' only, so attribute tags may carry a namespace
    ## prefix such as dc:date.
    item_tag, attribute_list = parts.last.to_s.split(/:/, 2)
    names = attribute_list.to_s.gsub(/(\[|\])/, '').split(/,/).map { |n| n.strip }
    if item_tag.nil? || item_tag.empty? || names.length < 4
      raise ArgumentError,
            "structure must look like /channel/item:[title,description,pubDate,link], got #{@structure.inspect}"
    end
    ## A leading '/' yields an empty first part, hence ".//channel".
    channel_path = "." + parts[0...-1].map { |part| "/" + part }.join
    tags = {
      :title => names[0],
      :description => names[1],
      :pubdate => names[2],
      :link => names[3]
    }
    [channel_path, item_tag, tags]
  end

  ## Build the SNItem attributes for one feed item. Downloads the linked
  ## page and keeps its readable content. Single quotes are removed from the
  ## title, description and content.
  def extract_item(item, tags, lang, verbose)
    title = item.find_first(tags[:title]).content.gsub("'", "")
    puts title if verbose
    description = item.find_first(tags[:description]).content.gsub("'", "")
    ## <img> tags without a src attribute are skipped.
    images = Nokogiri::HTML(description).search('img').map { |img| img['src'] }.compact
    puts "Image: #{images.to_s}" if verbose
    link = item.find_first(tags[:link]).content.gsub(" ", "")
    pub_date = Time.strptime(item.find_first(tags[:pubdate]).content, PUBDATE_FORMAT)
    page_html = Curl.get(link).body_str
    article = Readability::Document.new(page_html).content
    article = article.gsub("'", "").force_encoding("UTF-8")
    now = Time.now.strftime(DB_TIME_FORMAT)
    {
      :lang => lang,
      :title => title,
      :description => description,
      :link => link,
      :pubtime => pub_date.strftime(DB_TIME_FORMAT),
      :content => article,
      :images => images.to_s,
      :created_at => now,
      :updated_at => now
    }
  end

  ## Store one item through SNItem. Errors raised while storing are reported
  ## (when verbose) and do not stop the crawl.
  def save_item(attributes, verbose)
    unless @use_db
      puts "You do not use DB" if verbose
      return
    end
    SNItem.create(attributes)
  rescue => e
    puts "Error(s): #{e.to_s}" if verbose
  end
end
@@ -0,0 +1,12 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'active_record'
4
+
5
+ require 'mysql2'
6
+ require 'pg'
7
+ require 'sqlite3'
8
+ require 'active_record'
9
+
10
# ActiveRecord model whose rows are stored in the "sn_news" table.
+ class SNItem < ActiveRecord::Base
11
# Set explicitly so the model uses "sn_news" as its table name.
+ self.table_name = "sn_news"
12
+ end
metadata ADDED
@@ -0,0 +1,268 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simple-news-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nguyen Anh Tuan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: json
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.8.1
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.8'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.8.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: libxml-ruby
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.7'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 2.7.0
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '2.7'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 2.7.0
53
+ - !ruby/object:Gem::Dependency
54
+ name: curb
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '0.8'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 0.8.6
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '0.8'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 0.8.6
73
+ - !ruby/object:Gem::Dependency
74
+ name: nokogiri
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: '1.6'
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 1.6.3.1
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.6'
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: 1.6.3.1
93
+ - !ruby/object:Gem::Dependency
94
+ name: mechanize
95
+ requirement: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: '2.7'
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: 2.7.3
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '2.7'
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: 2.7.3
113
+ - !ruby/object:Gem::Dependency
114
+ name: mysql2
115
+ requirement: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '0.3'
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 0.3.16
123
+ type: :runtime
124
+ prerelease: false
125
+ version_requirements: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - "~>"
128
+ - !ruby/object:Gem::Version
129
+ version: '0.3'
130
+ - - ">="
131
+ - !ruby/object:Gem::Version
132
+ version: 0.3.16
133
+ - !ruby/object:Gem::Dependency
134
+ name: pg
135
+ requirement: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - "~>"
138
+ - !ruby/object:Gem::Version
139
+ version: '0.17'
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: 0.17.1
143
+ type: :runtime
144
+ prerelease: false
145
+ version_requirements: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - "~>"
148
+ - !ruby/object:Gem::Version
149
+ version: '0.17'
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 0.17.1
153
+ - !ruby/object:Gem::Dependency
154
+ name: sqlite3
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '1.3'
160
+ - - ">="
161
+ - !ruby/object:Gem::Version
162
+ version: 1.3.9
163
+ type: :runtime
164
+ prerelease: false
165
+ version_requirements: !ruby/object:Gem::Requirement
166
+ requirements:
167
+ - - "~>"
168
+ - !ruby/object:Gem::Version
169
+ version: '1.3'
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: 1.3.9
173
+ - !ruby/object:Gem::Dependency
174
+ name: activerecord
175
+ requirement: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - "~>"
178
+ - !ruby/object:Gem::Version
179
+ version: '4.0'
180
+ - - ">="
181
+ - !ruby/object:Gem::Version
182
+ version: 4.0.2
183
+ type: :runtime
184
+ prerelease: false
185
+ version_requirements: !ruby/object:Gem::Requirement
186
+ requirements:
187
+ - - "~>"
188
+ - !ruby/object:Gem::Version
189
+ version: '4.0'
190
+ - - ">="
191
+ - !ruby/object:Gem::Version
192
+ version: 4.0.2
193
+ - !ruby/object:Gem::Dependency
194
+ name: ruby-readability
195
+ requirement: !ruby/object:Gem::Requirement
196
+ requirements:
197
+ - - "~>"
198
+ - !ruby/object:Gem::Version
199
+ version: '0.7'
200
+ - - ">="
201
+ - !ruby/object:Gem::Version
202
+ version: 0.7.0
203
+ type: :runtime
204
+ prerelease: false
205
+ version_requirements: !ruby/object:Gem::Requirement
206
+ requirements:
207
+ - - "~>"
208
+ - !ruby/object:Gem::Version
209
+ version: '0.7'
210
+ - - ">="
211
+ - !ruby/object:Gem::Version
212
+ version: 0.7.0
213
+ - !ruby/object:Gem::Dependency
214
+ name: minitest
215
+ requirement: !ruby/object:Gem::Requirement
216
+ requirements:
217
+ - - "~>"
218
+ - !ruby/object:Gem::Version
219
+ version: '5.0'
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: 5.4.2
223
+ type: :development
224
+ prerelease: false
225
+ version_requirements: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '5.0'
230
+ - - ">="
231
+ - !ruby/object:Gem::Version
232
+ version: 5.4.2
233
+ description: A simple news crawler. You can specify the structure of your xml or rss
234
+ feeds.
235
+ email: marxen68@gmail.com
236
+ executables: []
237
+ extensions: []
238
+ extra_rdoc_files: []
239
+ files:
240
+ - config/app_config.rb
241
+ - lib/sn_crawler.rb
242
+ - lib/sn_item.rb
243
+ homepage: http://marker68.github.io/simple-news-crawler
244
+ licenses:
245
+ - MIT
246
+ - GPL-2
247
+ metadata: {}
248
+ post_install_message:
249
+ rdoc_options: []
250
+ require_paths:
251
+ - lib
252
+ required_ruby_version: !ruby/object:Gem::Requirement
253
+ requirements:
254
+ - - ">="
255
+ - !ruby/object:Gem::Version
256
+ version: 2.0.1
257
+ required_rubygems_version: !ruby/object:Gem::Requirement
258
+ requirements:
259
+ - - ">="
260
+ - !ruby/object:Gem::Version
261
+ version: '0'
262
+ requirements: []
263
+ rubyforge_project:
264
+ rubygems_version: 2.2.0
265
+ signing_key:
266
+ specification_version: 4
267
+ summary: A simple RSS/XML news crawler
268
+ test_files: []