simple-news-crawler 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 989f5fa8ff63672a63845c07ac225ff46ecf7786
4
+ data.tar.gz: 4c47b180951f0741d4c4a0abc131677a19054b83
5
+ SHA512:
6
+ metadata.gz: 1654a851eaccbc671176151ac137a9ca0ada40e96d6edc41f2f7404a5328111fb5530424f6f9dd83e93127c36e84b873b4aa971f23ab3f67084d3fd3a87158f9
7
+ data.tar.gz: 1a2a58e2b2393b2c7f36710703e090d0c8ab9d439d16019f44a4e234451c55a647f14de47a2179c6dad503cd1f1edd7f03979257ed583fffda6f2ecf63c304e7
@@ -0,0 +1,2 @@
1
## Application configuration holder. It currently declares no constants,
## attributes or methods of its own.
class AppConfig; end
@@ -0,0 +1,200 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'sn_item'
4
+ require 'mysql2'
5
+ require 'pg'
6
+ require 'sqlite3'
7
+ require 'active_record'
8
+ require 'mechanize'
9
+ require 'date'
10
+ require 'curb'
11
+ require 'zlib'
12
+ require 'digest/sha2'
13
+ require 'xml'
14
+ require 'nokogiri'
15
+ require 'readability'
16
+
17
## Crawls RSS/XML feeds and, when a database is configured, stores every
## news item it finds through the SNItem model.
class SNCrawler
  ## Feed urls longer than this are ignored by #get_links
  MAX_FEED_URL_LENGTH = 50
  ## Language recorded when a channel has no <language> element
  DEFAULT_LANG = "en_US"
  ## Layout expected for the publication date of an item
  PUBDATE_FORMAT = "%A, %d %B %Y %H:%M:%S %Z"
  ## Layout of every timestamp handed to the database
  DB_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

  ## Initialize parameters
  ## source:    url of the page that lists the rss/xml feeds
  ## name:      name of the source
  ## structure: /path/to/channel/item_name:[item_attributes_name]
  ##   item_attributes_name[0] => title of the page
  ##   item_attributes_name[1] => description of the page
  ##   item_attributes_name[2] => publication time of the page
  ##   item_attributes_name[3] => link to the page
  ##   For example: /channel/item:[title,description,pubDate,url]
  ## db_conf:   connection settings given to ActiveRecord; nil or an empty
  ##            value means the crawler runs without a database
  def initialize(source = "", name = "", structure = "", db_conf = {})
    ## The rss source's url
    @source = source
    @source_name = name
    @structure = structure
    ## Crawled urls will be stored here
    @url = []
    @agent = Mechanize.new
    ## The default {} is not nil, so it must be treated as "no database"
    ## too instead of being handed to establish_connection.
    no_conf = db_conf.nil? || (db_conf.respond_to?(:empty?) && db_conf.empty?)
    @use_db = !no_conf
    ## Establish a connection to DB server
    ActiveRecord::Base.establish_connection(db_conf) if @use_db
  end

  ## Create table for our gem
  ## options: passed as the :options value of the table definition
  ## verbose: print the error message when the creation fails
  ## Returns true on success and false when any error was raised.
  def create_table(options = "", verbose = true)
    ActiveRecord::Migration.class_eval do
      create_table :sn_news, :options => options do |t|
        t.string :lang
        t.string :title
        t.text :description
        t.text :content
        t.string :link
        t.string :images
        t.datetime :pubtime

        t.timestamps
      end

      add_index :sn_news, [:title], :unique => true, :name => "unique_title_on_news"
    end
    true
  rescue => e
    puts "Error(s): #{e.to_s}" if verbose
    false
  end

  ## Get urls from a source url
  ## Every link of the source page whose href matches .rss/.xml is turned
  ## into an absolute url and remembered once; urls longer than
  ## MAX_FEED_URL_LENGTH are skipped.
  ## verbose: print the collected urls
  def get_links(verbose = false)
    page = @agent.get(@source)

    page.links_with(:href => /\.(rss|xml)/).each do |link|
      src = absolute_url(link.href)
      next if src.nil? || src.length > MAX_FEED_URL_LENGTH
      @url << src unless @url.include?(src)
    end

    puts @url.to_s if verbose
  end

  ## Set Urls
  def set_url(url = [])
    @url = url
  end

  ## Clear urls
  def clear_url
    @url = []
  end

  ## Get news from urls
  ## Downloads every stored feed, reads its items according to the
  ## structure given to the constructor and stores them when a database is
  ## in use. A feed or an item that fails is skipped so the rest of the
  ## crawl continues; nothing is printed unless verbose is true.
  ## Raises ArgumentError when the structure cannot be understood.
  def get_news(verbose = false)
    channel_path, item_tag, tags = parse_structure
    count = 0

    @url.each do |u|
      begin
        ## The download is inside the rescue: one unreachable feed must not
        ## stop the remaining ones.
        feed = XML::Parser.string(Curl.get(u.to_s).body_str).parse
        ## For each channel processing the data
        feed.root.find(channel_path).each do |channel|
          lang = optional_text(channel, 'language') || DEFAULT_LANG
          channel.find(item_tag).each do |item|
            begin
              store_item(read_item(item, lang, tags, verbose), verbose)
              count += 1
            rescue => e
              puts "Error: #{e}" if verbose
            end
          end
        end
      rescue => e
        puts "Error: #{e}" if verbose
      end
    end

    puts "We got #{count} news today." if verbose
  end

  ## Close the database connection, if one is currently open
  def finalize
    ActiveRecord::Base.connection.close if ActiveRecord::Base.connected?
  end

  private

  ## Resolve href against the source url; nil when it cannot be resolved.
  def absolute_url(href)
    return href if href =~ /\Ahttps?:\/\//i
    URI.join(@source, href).to_s
  rescue URI::Error
    nil
  end

  ## Split @structure into [channel xpath, item tag, attribute tag names].
  def parse_structure
    parts = @structure.to_s.split("/")
    ## Limit 2 keeps namespaced attribute names such as dc:date intact.
    item_tag, attribute_list = parts.last.to_s.split(":", 2)
    tags = attribute_list.to_s.gsub(/[\[\]]/, "").split(",").map { |tag| tag.strip }
    if item_tag.to_s.empty? || tags.length < 4
      raise ArgumentError,
            "structure must look like /path/to/channel/item:[title,description,pubDate,url]"
    end
    channel_path = "." + parts[0...-1].map { |segment| "/" + segment }.join
    [channel_path, item_tag, tags]
  end

  ## Text of the first tag child of node, or nil when there is none.
  def optional_text(node, tag)
    found = node.find_first(tag)
    found && found.content
  end

  ## Text of the first tag child of node; raises when there is none.
  def required_text(node, tag)
    text = optional_text(node, tag)
    raise "missing <#{tag}> element" if text.nil?
    text
  end

  ## Build the SNItem attributes for one feed item (downloads its page).
  def read_item(item, lang, tags, verbose)
    title_tag, des_tag, pubdate_tag, link_tag = tags
    ## Single quotes are removed from the stored texts, as before.
    title = required_text(item, title_tag).gsub("'", "")
    description = required_text(item, des_tag).gsub("'", "")
    ## Images without a src attribute are left out instead of raising.
    images = Nokogiri::HTML(description).search('img').map { |img| img['src'] }.compact
    if verbose
      puts title
      puts "Image: #{images.to_s}"
    end
    link = required_text(item, link_tag).gsub(" ", "")
    pub_date = Time.strptime(required_text(item, pubdate_tag), PUBDATE_FORMAT)
    article = Readability::Document.new(Curl.get(link).body_str).content
    article = article.gsub("'", "").force_encoding("UTF-8")
    time_now = Time.now.strftime(DB_TIME_FORMAT)
    {
      :lang => lang,
      :title => title,
      :description => description,
      :link => link,
      :pubtime => pub_date.strftime(DB_TIME_FORMAT),
      :content => article,
      :images => images.to_s,
      :created_at => time_now,
      :updated_at => time_now
    }
  end

  ## Insert one item when a database is in use; insert errors are reported
  ## (verbose only) and never propagate.
  def store_item(attributes, verbose)
    puts "Now inserting #{attributes[:title]}" if verbose
    if @use_db
      SNItem.create(attributes)
    elsif verbose
      puts "You do not use DB"
    end
  rescue => e
    puts "Error(s): #{e.to_s}" if verbose
  end
end
@@ -0,0 +1,12 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'active_record'
4
+
5
+ require 'mysql2'
6
+ require 'pg'
7
+ require 'sqlite3'
8
+ require 'active_record'
9
+
10
## ActiveRecord model for a stored news item; its rows are kept in the
## "sn_news" table rather than a table named after the class.
class SNItem < ActiveRecord::Base
  self.table_name = "sn_news"
end
metadata ADDED
@@ -0,0 +1,268 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simple-news-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nguyen Anh Tuan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: json
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.8.1
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.8'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.8.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: libxml-ruby
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.7'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 2.7.0
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '2.7'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 2.7.0
53
+ - !ruby/object:Gem::Dependency
54
+ name: curb
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '0.8'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 0.8.6
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '0.8'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 0.8.6
73
+ - !ruby/object:Gem::Dependency
74
+ name: nokogiri
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: '1.6'
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 1.6.3.1
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.6'
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: 1.6.3.1
93
+ - !ruby/object:Gem::Dependency
94
+ name: mechanize
95
+ requirement: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: '2.7'
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: 2.7.3
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '2.7'
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: 2.7.3
113
+ - !ruby/object:Gem::Dependency
114
+ name: mysql2
115
+ requirement: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '0.3'
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 0.3.16
123
+ type: :runtime
124
+ prerelease: false
125
+ version_requirements: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - "~>"
128
+ - !ruby/object:Gem::Version
129
+ version: '0.3'
130
+ - - ">="
131
+ - !ruby/object:Gem::Version
132
+ version: 0.3.16
133
+ - !ruby/object:Gem::Dependency
134
+ name: pg
135
+ requirement: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - "~>"
138
+ - !ruby/object:Gem::Version
139
+ version: '0.17'
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: 0.17.1
143
+ type: :runtime
144
+ prerelease: false
145
+ version_requirements: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - "~>"
148
+ - !ruby/object:Gem::Version
149
+ version: '0.17'
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 0.17.1
153
+ - !ruby/object:Gem::Dependency
154
+ name: sqlite3
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '1.3'
160
+ - - ">="
161
+ - !ruby/object:Gem::Version
162
+ version: 1.3.9
163
+ type: :runtime
164
+ prerelease: false
165
+ version_requirements: !ruby/object:Gem::Requirement
166
+ requirements:
167
+ - - "~>"
168
+ - !ruby/object:Gem::Version
169
+ version: '1.3'
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: 1.3.9
173
+ - !ruby/object:Gem::Dependency
174
+ name: activerecord
175
+ requirement: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - "~>"
178
+ - !ruby/object:Gem::Version
179
+ version: '4.0'
180
+ - - ">="
181
+ - !ruby/object:Gem::Version
182
+ version: 4.0.2
183
+ type: :runtime
184
+ prerelease: false
185
+ version_requirements: !ruby/object:Gem::Requirement
186
+ requirements:
187
+ - - "~>"
188
+ - !ruby/object:Gem::Version
189
+ version: '4.0'
190
+ - - ">="
191
+ - !ruby/object:Gem::Version
192
+ version: 4.0.2
193
+ - !ruby/object:Gem::Dependency
194
+ name: ruby-readability
195
+ requirement: !ruby/object:Gem::Requirement
196
+ requirements:
197
+ - - "~>"
198
+ - !ruby/object:Gem::Version
199
+ version: '0.7'
200
+ - - ">="
201
+ - !ruby/object:Gem::Version
202
+ version: 0.7.0
203
+ type: :runtime
204
+ prerelease: false
205
+ version_requirements: !ruby/object:Gem::Requirement
206
+ requirements:
207
+ - - "~>"
208
+ - !ruby/object:Gem::Version
209
+ version: '0.7'
210
+ - - ">="
211
+ - !ruby/object:Gem::Version
212
+ version: 0.7.0
213
+ - !ruby/object:Gem::Dependency
214
+ name: minitest
215
+ requirement: !ruby/object:Gem::Requirement
216
+ requirements:
217
+ - - "~>"
218
+ - !ruby/object:Gem::Version
219
+ version: '5.0'
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: 5.4.2
223
+ type: :development
224
+ prerelease: false
225
+ version_requirements: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '5.0'
230
+ - - ">="
231
+ - !ruby/object:Gem::Version
232
+ version: 5.4.2
233
+ description: A simple news crawler. You can specify the structure of your xml or rss
234
+ feeds.
235
+ email: marxen68@gmail.com
236
+ executables: []
237
+ extensions: []
238
+ extra_rdoc_files: []
239
+ files:
240
+ - config/app_config.rb
241
+ - lib/sn_crawler.rb
242
+ - lib/sn_item.rb
243
+ homepage: http://marker68.github.io/simple-news-crawler
244
+ licenses:
245
+ - MIT
246
+ - GPL-2
247
+ metadata: {}
248
+ post_install_message:
249
+ rdoc_options: []
250
+ require_paths:
251
+ - lib
252
+ required_ruby_version: !ruby/object:Gem::Requirement
253
+ requirements:
254
+ - - ">="
255
+ - !ruby/object:Gem::Version
256
+ version: 2.0.1
257
+ required_rubygems_version: !ruby/object:Gem::Requirement
258
+ requirements:
259
+ - - ">="
260
+ - !ruby/object:Gem::Version
261
+ version: '0'
262
+ requirements: []
263
+ rubyforge_project:
264
+ rubygems_version: 2.2.0
265
+ signing_key:
266
+ specification_version: 4
267
+ summary: A simple RSS/XML news crawler
268
+ test_files: []