simple-news-crawler 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/config/app_config.rb +2 -0
- data/lib/sn_crawler.rb +200 -0
- data/lib/sn_item.rb +12 -0
- metadata +268 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 989f5fa8ff63672a63845c07ac225ff46ecf7786
|
4
|
+
data.tar.gz: 4c47b180951f0741d4c4a0abc131677a19054b83
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1654a851eaccbc671176151ac137a9ca0ada40e96d6edc41f2f7404a5328111fb5530424f6f9dd83e93127c36e84b873b4aa971f23ab3f67084d3fd3a87158f9
|
7
|
+
data.tar.gz: 1a2a58e2b2393b2c7f36710703e090d0c8ab9d439d16019f44a4e234451c55a647f14de47a2179c6dad503cd1f1edd7f03979257ed583fffda6f2ecf63c304e7
|
data/lib/sn_crawler.rb
ADDED
@@ -0,0 +1,200 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'sn_item'
|
4
|
+
require 'mysql2'
|
5
|
+
require 'pg'
|
6
|
+
require 'sqlite3'
|
7
|
+
require 'active_record'
|
8
|
+
require 'mechanize'
|
9
|
+
require 'date'
|
10
|
+
require 'curb'
|
11
|
+
require 'zlib'
|
12
|
+
require 'digest/sha2'
|
13
|
+
require 'xml'
|
14
|
+
require 'nokogiri'
|
15
|
+
require 'readability'
|
16
|
+
|
17
|
+
## A crawler class.
##
## Discovers RSS/XML feed URLs on a source page, downloads each feed, extracts
## every item (title, description, publication time, link, images, readable
## article body) and optionally stores it through the SNItem model.
class SNCrawler
  ## Discovered feed URLs longer than this are ignored by get_links.
  MAX_URL_LENGTH = 50
  ## Format the feed's publication date is parsed with.
  PUBDATE_FORMAT = "%A, %d %B %Y %H:%M:%S %Z"
  ## Format used for every timestamp handed to the database.
  TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

  ## Initialize parameters
  ## structure format: /path/to/channel/item_name:[item_attributes_name]
  ## item_attributes_name[0] => title of the page
  ## item_attributes_name[1] => description of the page
  ## item_attributes_name[2] => publicity time of the page
  ## item_attributes_name[3] => link to the page
  ## For example: /channel/item:[title,description,pubDate,url]
  ##
  ## source    - URL of the page that lists the feeds
  ## name      - human readable name of the source
  ## structure - feed layout description (see above)
  ## db_conf   - ActiveRecord connection settings; nil or an empty value
  ##             means "run without a database"
  def initialize(source = "", name = "", structure = "", db_conf = {})
    ## The rss source's url
    @source = source
    @source_name = name
    @structure = structure
    ## Crawled urls will be stored here
    @url = []
    @agent = Mechanize.new
    ## The default db_conf is an empty hash, which is not a usable connection
    ## specification, so it has to count as "no database" just like nil.
    @use_db = !(db_conf.nil? || (db_conf.respond_to?(:empty?) && db_conf.empty?))
    ## Establish a connection to DB server
    ActiveRecord::Base.establish_connection(db_conf) if @use_db
  end

  ## Create table for our gem
  ##
  ## options - passed through as the :options value of the table definition
  ## verbose - print the error message when the migration fails
  ##
  ## Returns true when the table and its unique title index were created,
  ## false when any error was raised.
  def create_table(options = "", verbose = true)
    ActiveRecord::Migration.class_eval do
      create_table :sn_news, :options => options do |t|
        t.string :lang
        t.string :title
        t.text :description
        t.text :content
        t.string :link
        t.string :images
        t.datetime :pubtime

        t.timestamps
      end

      add_index :sn_news, [:title], :unique => true, :name => "unique_title_on_news"
    end
    true
  rescue => e
    puts "Error(s): #{e.to_s}" if verbose
    false
  end

  ## Get urls from a source url
  ##
  ## Every link on the source page whose href matches .rss or .xml is turned
  ## into an absolute URL and appended to the crawl list, unless it is already
  ## present, longer than MAX_URL_LENGTH, or cannot be resolved.
  ##
  ## verbose - print the resulting url list
  def get_links(verbose = false)
    page = @agent.get(@source)

    page.links_with(:href => /\.(rss|xml)/).each do |link|
      src = absolute_url(link.href)
      next if src.nil? || src.length > MAX_URL_LENGTH || @url.include?(src)
      @url << src
    end

    puts @url.to_s if verbose
    nil
  end

  ## Set Urls
  def set_url(url = [])
    @url = url
  end

  ## Clear urls
  def clear_url
    @url = []
  end

  ## Get news from urls
  ##
  ## Downloads every url in the crawl list, walks its channels and items as
  ## described by the structure string and stores each item when a database is
  ## configured. An error while handling one feed is reported (when verbose)
  ## and the crawl continues with the next feed.
  ##
  ## verbose - print progress and error messages
  ##
  ## Raises ArgumentError when the structure string cannot be parsed.
  def get_news(verbose = false)
    count = 0
    layout = parse_structure

    @url.each do |u|
      begin
        ## The download lives inside the rescue so that one unreachable feed
        ## does not abort the remaining ones.
        request = Curl.get(u.to_s)
        feed = XML::Parser.string(request.body_str).parse
        ## For each channel processing the data
        feed.root.find(layout[:channel_path]).each do |channel|
          ## <language> is optional; fall back when the tag is absent.
          lang_node = channel.find_first('language')
          lang = lang_node.nil? ? nil : lang_node.content
          lang = "en_US" if lang.nil?

          channel.find(layout[:item_tag]).each do |item|
            process_item(item, lang, layout, verbose)
            count += 1
          end
        end
      rescue => e
        puts "Error: #{e}" if verbose
      end
    end

    puts "We got #{count} news today." if verbose
    nil
  end

  ## Close the database connection, if one was opened by this crawler.
  def finalize
    ActiveRecord::Base.connection.close if @use_db
  end

  private

  ## Turn a link href into an absolute url, resolving relative references
  ## against the source url. Returns nil when the href cannot be resolved.
  def absolute_url(href)
    return href if href =~ %r{\Ahttps?://}i
    URI.join(@source, href).to_s
  rescue URI::Error
    nil
  end

  ## Split the structure string into the channel path, the item tag and the
  ## four attribute tag names.
  def parse_structure
    segments = @structure.to_s.split('/')
    item_spec = segments.pop.to_s
    ## Split on the first colon only, so tag names may carry a namespace prefix.
    item_tag, attribute_list = item_spec.split(':', 2)
    fields = attribute_list.to_s.gsub(/[\[\]]/, '').split(',').map { |f| f.strip }

    if item_tag.to_s.empty? || fields.length < 4 || fields.first(4).any? { |f| f.empty? }
      raise ArgumentError,
            "invalid structure #{@structure.inspect}, " \
            "expected /path/to/channel/item:[title,description,pubDate,link]"
    end

    {
      ## Every leading segment is appended as "/segment"; a structure that
      ## starts with "/" therefore yields a ".//..." path.
      :channel_path    => "." + segments.map { |s| "/" + s }.join,
      :item_tag        => item_tag,
      :title_tag       => fields[0],
      :description_tag => fields[1],
      :pubdate_tag     => fields[2],
      :link_tag        => fields[3]
    }
  end

  ## Extract one feed item, fetch the article it links to and store it.
  def process_item(item, lang, layout, verbose)
    title = item.find_first(layout[:title_tag]).content.gsub("'", "")
    puts title if verbose

    description = item.find_first(layout[:description_tag]).content.gsub("'", "")
    ## Images without a src attribute are skipped.
    img_url = Nokogiri::HTML(description).search('img').map { |img| img['src'] }.compact
    puts "Image: #{img_url.to_s}" if verbose

    link = item.find_first(layout[:link_tag]).content.gsub(" ", "")
    pub_date = Time.strptime(item.find_first(layout[:pubdate_tag]).content, PUBDATE_FORMAT)

    page_source = Curl.get(link).body_str
    article = Readability::Document.new(page_source).content
    article = article.gsub("'", "").force_encoding("UTF-8")
    puts "Now inserting #{title}" if verbose

    unless @use_db
      puts "You do not use DB" if verbose
      return
    end

    time_now = Time.now.strftime(TIMESTAMP_FORMAT)
    begin
      SNItem.create(
        :lang => lang,
        :title => title,
        :description => description,
        :link => link,
        :pubtime => pub_date.strftime(TIMESTAMP_FORMAT),
        :content => article,
        :images => img_url.to_s,
        :created_at => time_now,
        :updated_at => time_now
      )
    rescue => e
      ## A failed insert is reported and the crawl moves on to the next item.
      puts "Error(s): #{e.to_s}" if verbose
    end
  end
end
|
data/lib/sn_item.rb
ADDED
metadata
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simple-news-crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nguyen Anh Tuan
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-10-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: json
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.8'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.8.1
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.8'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.8.1
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: libxml-ruby
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '2.7'
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 2.7.0
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '2.7'
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 2.7.0
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: curb
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0.8'
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 0.8.6
|
63
|
+
type: :runtime
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0.8'
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 0.8.6
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: nokogiri
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: '1.6'
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.6.3.1
|
83
|
+
type: :runtime
|
84
|
+
prerelease: false
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.6'
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: 1.6.3.1
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: mechanize
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - "~>"
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '2.7'
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: 2.7.3
|
103
|
+
type: :runtime
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '2.7'
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: 2.7.3
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: mysql2
|
115
|
+
requirement: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - "~>"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0.3'
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 0.3.16
|
123
|
+
type: :runtime
|
124
|
+
prerelease: false
|
125
|
+
version_requirements: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - "~>"
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0.3'
|
130
|
+
- - ">="
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: 0.3.16
|
133
|
+
- !ruby/object:Gem::Dependency
|
134
|
+
name: pg
|
135
|
+
requirement: !ruby/object:Gem::Requirement
|
136
|
+
requirements:
|
137
|
+
- - "~>"
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: '0.17'
|
140
|
+
- - ">="
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: 0.17.1
|
143
|
+
type: :runtime
|
144
|
+
prerelease: false
|
145
|
+
version_requirements: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - "~>"
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0.17'
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.17.1
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: sqlite3
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.3'
|
160
|
+
- - ">="
|
161
|
+
- !ruby/object:Gem::Version
|
162
|
+
version: 1.3.9
|
163
|
+
type: :runtime
|
164
|
+
prerelease: false
|
165
|
+
version_requirements: !ruby/object:Gem::Requirement
|
166
|
+
requirements:
|
167
|
+
- - "~>"
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
version: '1.3'
|
170
|
+
- - ">="
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: 1.3.9
|
173
|
+
- !ruby/object:Gem::Dependency
|
174
|
+
name: activerecord
|
175
|
+
requirement: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '4.0'
|
180
|
+
- - ">="
|
181
|
+
- !ruby/object:Gem::Version
|
182
|
+
version: 4.0.2
|
183
|
+
type: :runtime
|
184
|
+
prerelease: false
|
185
|
+
version_requirements: !ruby/object:Gem::Requirement
|
186
|
+
requirements:
|
187
|
+
- - "~>"
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: '4.0'
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: 4.0.2
|
193
|
+
- !ruby/object:Gem::Dependency
|
194
|
+
name: ruby-readability
|
195
|
+
requirement: !ruby/object:Gem::Requirement
|
196
|
+
requirements:
|
197
|
+
- - "~>"
|
198
|
+
- !ruby/object:Gem::Version
|
199
|
+
version: '0.7'
|
200
|
+
- - ">="
|
201
|
+
- !ruby/object:Gem::Version
|
202
|
+
version: 0.7.0
|
203
|
+
type: :runtime
|
204
|
+
prerelease: false
|
205
|
+
version_requirements: !ruby/object:Gem::Requirement
|
206
|
+
requirements:
|
207
|
+
- - "~>"
|
208
|
+
- !ruby/object:Gem::Version
|
209
|
+
version: '0.7'
|
210
|
+
- - ">="
|
211
|
+
- !ruby/object:Gem::Version
|
212
|
+
version: 0.7.0
|
213
|
+
- !ruby/object:Gem::Dependency
|
214
|
+
name: minitest
|
215
|
+
requirement: !ruby/object:Gem::Requirement
|
216
|
+
requirements:
|
217
|
+
- - "~>"
|
218
|
+
- !ruby/object:Gem::Version
|
219
|
+
version: '5.0'
|
220
|
+
- - ">="
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: 5.4.2
|
223
|
+
type: :development
|
224
|
+
prerelease: false
|
225
|
+
version_requirements: !ruby/object:Gem::Requirement
|
226
|
+
requirements:
|
227
|
+
- - "~>"
|
228
|
+
- !ruby/object:Gem::Version
|
229
|
+
version: '5.0'
|
230
|
+
- - ">="
|
231
|
+
- !ruby/object:Gem::Version
|
232
|
+
version: 5.4.2
|
233
|
+
description: A simple news crawler. You can specify the structure of your xml or rss
|
234
|
+
feeds.
|
235
|
+
email: marxen68@gmail.com
|
236
|
+
executables: []
|
237
|
+
extensions: []
|
238
|
+
extra_rdoc_files: []
|
239
|
+
files:
|
240
|
+
- config/app_config.rb
|
241
|
+
- lib/sn_crawler.rb
|
242
|
+
- lib/sn_item.rb
|
243
|
+
homepage: http://marker68.github.io/simple-news-crawler
|
244
|
+
licenses:
|
245
|
+
- MIT
|
246
|
+
- GPL-2
|
247
|
+
metadata: {}
|
248
|
+
post_install_message:
|
249
|
+
rdoc_options: []
|
250
|
+
require_paths:
|
251
|
+
- lib
|
252
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
253
|
+
requirements:
|
254
|
+
- - ">="
|
255
|
+
- !ruby/object:Gem::Version
|
256
|
+
version: 2.0.1
|
257
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
258
|
+
requirements:
|
259
|
+
- - ">="
|
260
|
+
- !ruby/object:Gem::Version
|
261
|
+
version: '0'
|
262
|
+
requirements: []
|
263
|
+
rubyforge_project:
|
264
|
+
rubygems_version: 2.2.0
|
265
|
+
signing_key:
|
266
|
+
specification_version: 4
|
267
|
+
summary: A simple RSS/XML news crawler
|
268
|
+
test_files: []
|