webminer 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/webminer.rb +234 -0
- data/lib/webminer/constants.rb +57 -0
- data/lib/webminer/util.rb +35 -0
- metadata +47 -0
data/lib/webminer.rb
ADDED
@@ -0,0 +1,234 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'rss/1.0'
|
5
|
+
require 'rss/2.0'
|
6
|
+
require 'cgi'
|
7
|
+
require 'iconv'
|
8
|
+
|
9
|
+
#Main class, just a placeholder
|
10
|
+
|
11
|
+
class WebMiner
|
12
|
+
|
13
|
+
# Builds a Story from one RSS feed item, extracts the article text when the
# story's source has an entry in the parser dictionary, and saves the story.
#
# item  - feed item; this method reads its link, title, source and date.
# topic - value stored as the story's topic.
#
# Returns the Story (also when the save was skipped as a duplicate), or nil
# when the item has no usable "url" parameter, a story with the same link
# already exists, or no source can be derived from the link.
def self.create_story(item, topic)
  # The article address is carried in the "url" query parameter of the feed
  # link; without it there is nothing to store.
  link = (CGI.parse(item.link.to_s)["url"] || []).first
  return nil if link.nil? || link.empty?
  return nil if Story.where(:link => link).exists?

  story = Story.new
  story.title = item.title
  story.link = link

  if item.source
    story.source = item.source
  else
    # Since the <source> tag is rarely set in Google News, the domain name is
    # used to identify the source, e.g. www.nytimes.com -> nytimes.com.
    # Only http(s) links with a path can match; anything else is skipped.
    matchdata = link.match('https?://(?:w{3}\.)?(.*?)/.*')
    return nil if matchdata.nil?
    story.source = matchdata[1]
  end

  story.topic = topic
  story.date_time = item.date.rfc2822 if item.date

  puts "Title: #{story.title}"
  puts "Link: #{story.link}"
  puts "Source: #{story.source}"
  puts "Date/time:#{story.date_time}"

  css_path = self.get_parser_dictionary[story.source]
  if css_path.nil?
    puts "Skipping text extraction from non-trusted source #{story.source}:#{story.title}"
  else
    # extract_text returns nil on failure, so check before converting:
    # Iconv.conv cannot take nil.
    extracted = self.extract_text(story.link, css_path)
    if extracted.nil?
      puts "Text extraction failed"
    else
      story.content = Iconv.conv('UTF-8', 'ISO_8859-1', extracted)
    end
  end

  begin
    story.save
    puts "Saved #{story.title}"
  rescue SQLite3::ConstraintException, ActiveRecord::RecordNotUnique
    # Another thread stored the same story between the exists? check and save.
    puts "Skipped duplicated story: #{story.title}"
  end

  story
end
|
71
|
+
|
72
|
+
# Parses feed content and creates a story for every item in it.
#
# content - feed document as a String.
# topic   - topic value passed on to create_story for each item.
#
# Returns the feed's item list, or nil when the content is not a feed, has no
# item list, or cannot be parsed (the parser error is printed).
def self.parse(content, topic)
  rss = RSS::Parser.parse(content, false)
  return nil if rss.nil?

  items = rss.items
  return nil if items.nil?

  items.each { |item| self.create_story(item, topic) }
rescue RSS::Error => ex
  # A broken response body must not take down the polling thread that called
  # us; report it and let the caller try again on its next round.
  puts "Could not parse feed: #{ex.message}"
  nil
end
|
85
|
+
|
86
|
+
# Downloads the page at link and returns the cleaned-up text of the nodes
# selected by css_path.
#
# link     - address of the article page.
# css_path - CSS selector passed to the Nokogiri document's css method.
#
# Returns the result of Util.strip_story on the selected markup, or nil when
# any StandardError is raised along the way (the error is printed).
def self.extract_text(link, css_path)
  # Open through the parsed URI instead of Kernel#open: the link originates
  # from a feed, and Kernel#open would run a string starting with "|" as a
  # command. The block form also closes the handle when parsing raises.
  raw_text = URI.parse(link).open do |page|
    Nokogiri::HTML(page).css(css_path).to_s
  end
  Util.strip_story(raw_text)
rescue => ex
  puts ex.message
  puts "Error fetching link:#{link}"
  nil
end
|
100
|
+
|
101
|
+
# Returns the table that maps a source host name to the CSS selector used to
# pull the article body out of that site's pages.
#
# The table lives in WebMiner::Constants (lib/webminer/constants.rb, required
# at the bottom of this file); this method only forwards to it so the two
# cannot drift apart.
def self.get_parser_dictionary
  Constants.get_parser_dictionary
end
|
156
|
+
|
157
|
+
|
158
|
+
# Polls the Google News RSS feed of every configured topic, one thread per
# topic, and passes feed content to parse whenever it differs from the
# previously fetched content.
#
# Side effect: assigns the request headers used for the feed requests to the
# global $options.
#
# Each topic thread loops without an exit condition and this method joins all
# of them, so it does not return in normal operation.
def self.main
  prng = Random.new
  topics = {
    'Top Stories' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&output=rss',
    'U.S.' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=n&output=rss',
    'Health' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=m&output=rss',
    'Business' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=b&output=rss',
    'Technology' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=tc&output=rss',
    'World' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=w&output=rss'
  }

  $options = {
    'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
    'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'From' => 'anon@anon.net'
  }

  topic_threads = topics.map do |topic, url|
    # Wait 5-7 seconds before starting each thread so the first requests are
    # spread out.
    sleep 5 + prng.rand * 2

    Thread.new do
      print "I'm working on #{url}\n"
      last_hash = 0

      loop do
        content = begin
          URI.parse(url).open($options) { |s| s.read }
        rescue OpenURI::HTTPError
          puts "Google caught us again! #{url}"
          nil
        rescue SocketError, Timeout::Error => ex
          puts ex.message
          nil
        end

        # A failed fetch yields nil: nothing is parsed and last_hash is kept,
        # so the next successful fetch is still compared against the last
        # content that was actually received.
        unless content.nil?
          new_hash = content.hash
          print "Got content: #{content.length}, hash: #{new_hash}\n"

          if new_hash != last_hash
            print "Parsing\n"
            self.parse(content, topic)
          else
            print "Skipping\n"
          end
          last_hash = new_hash
        end

        sleep 30 + prng.rand * 2
      end
    end
  end

  topic_threads.each { |thr| thr.join }
end
|
229
|
+
|
230
|
+
end
|
231
|
+
|
232
|
+
require 'webminer/constants'
|
233
|
+
require 'webminer/util'
|
234
|
+
|
@@ -0,0 +1,57 @@
|
|
1
|
+
class WebMiner::Constants
|
2
|
+
# Returns a new Hash mapping a source host name (as stored on a story's
# source) to the CSS selector that locates the article body on that site.
#
# Sources without an entry here get no text extraction.
def self.get_parser_dictionary
  {
    "google.com" => 'div[id="hostednews-article"]',
    "cbsnews.com" => 'div[id="contentBody"]',
    "reuters.com" => 'span[id="articleText"]',
    "latimes.com" => 'div[id="story-body-text"]',
    "csmonitor.com" => 'div[id="mainColumn"]',
    "npr.org" => 'div[id="storytext"]',
    "usatoday.com" => 'div[id="mainstory"]',
    "content.usatoday.com" => 'div[id="mainstory"]',
    "guardian.co.uk" => 'div[id="article-body-blocks"]',
    "nytimes.com" => 'div[id="article"]',
    "bloomberg.com" => 'div[id="story_content"]',
    "online.wsj.com" => 'div[id="article_story_body"]',
    "asia.wsj.com" => 'div[id="article_story_body"]',
    # This key used to be listed a second time further down with the same
    # selector; a repeated key in a Hash literal is overwritten, so it is
    # listed once.
    "businessweek.com" => 'div[id="story-body"]',
    "cnn.com" => 'div[id="cnnContentContainer"]',
    # NOTE(review): "cnn_storyarea" as an element name looks unusual - confirm
    # against the site's markup.
    "edition.cnn.com" => 'cnn_storyarea[id="cnnContentContainer"]',
    "money.cnn.com" => 'div[id="storytext"]',
    "abcnews.go.com" => 'div[id="innerbody"]',
    "foxnews.com" => 'div[id="introduction"]',
    "entertainment.msnbc.msn.com" => 'div[id="vine-t"] article',
    "washingtonpost.com" => 'div[id="article_body"]',
    # "bbc.co.uk" => 'div[id="main-content"]',
    "huffingtonpost.com" => 'div[id="entry_12345"]',
    "telegraph.co.uk" => 'div[id="mainBodyArea"]',
    "chicagotribune.com" => 'div[id="story-body-text"]',
    "foxbusiness.com" => 'div[id="introduction"]',
    "thedailybeast.com" => 'div[id="main"] article',
    "economictimes.indiatimes.com" => 'div[id="storydiv"]',
    "forbes.com" => 'div[id="leftRail"]',
    "arstechnica.com" => 'div[id="story"]',
    "theregister.co.uk" => 'div[id="body"]',
    "ingame.msnbc.msn.com" => 'div[id="vine-t"] article',
    "informationweek.com" => 'span[id="articleBody"]',
    "newyorker.com" => 'div[id="articletext"]',
    "kotaku.com" => 'div[id="page"]',
    "slashgear.com" => 'span[id="intelliTxt"]',
    "pcworld.com" => 'div[id="articleText"]',
    "news.cnet.com" => 'div[id="article"]',
    "english.aljazeera.net" => 'td[id="tdTextContent"]',
    "dailymail.co.uk" => 'div[id="js-article-text"]',
    # NOTE(review): empty id - presumably a placeholder that still needs the
    # real selector; confirm.
    "rttnews.com" => 'div[id=""]',
    "ft.com" => 'div[id="storyContent"]',
    "politico.com" => 'div[id="mainContent"]',
    "boston.com" => 'div[id="page1"]',
    "sfgate.com" => 'div[id="fontprefs_bottom"]',
    "oregonlive.com" => 'div[id="article"]'
    # Candidates without a selector yet:
    #   "wired.com"=> 'div[id=""]'?
    #   http://latimesblogs.latimes.com ?
  }
end
|
57
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class WebMiner::Util
|
2
|
+
|
3
|
+
# Removes every <script ...>...</script> element, tags and content included,
# from raw_text. Matching ignores case and spans line breaks.
#
# Returns a new String; raw_text itself is not modified.
def self.strip_js(raw_text)
  script_element = /<script.*?<\/script>/im
  raw_text.gsub(script_element, '')
end
|
6
|
+
|
7
|
+
# Removes every <style ...>...</style> element, tags and content included,
# from raw_text. Matching ignores case and spans line breaks.
#
# Returns a new String; raw_text itself is not modified.
def self.strip_stylesheet(raw_text)
  style_element = /<style.*?<\/style>/im
  raw_text.gsub(style_element, '')
end
|
10
|
+
|
11
|
+
# Removes everything from a "<" up to the nearest following ">" (the markers
# included) from raw_text; the span may cross line breaks.
#
# Returns a new String; raw_text itself is not modified.
def self.strip_tags(raw_text)
  tag = /<.*?>/im
  raw_text.gsub(tag, '')
end
|
14
|
+
|
15
|
+
# Replaces each run of characters other than ASCII letters, digits and "-"
# with a single space.
#
# Returns a new String; raw_text itself is not modified.
def self.strip_specialchars(raw_text)
  disallowed_run = /[^A-Za-z0-9-]+/
  raw_text.gsub(disallowed_run, ' ')
end
|
18
|
+
|
19
|
+
# Reduces a piece of markup to plain words by applying, in this order,
# strip_js, strip_stylesheet, strip_tags and strip_specialchars.
#
# Returns the resulting String.
def self.strip_story(raw_text)
  without_markup = self.strip_tags(self.strip_stylesheet(self.strip_js(raw_text)))
  self.strip_specialchars(without_markup)
end
|
26
|
+
|
27
|
+
# Builds the sliding windows of width d + 1 over arr: row i is
# [arr[i], arr[i + 1], ..., arr[i + d]], i.e. each row holds d + 1
# consecutive entries in their original order.
#
# For an array of length N the result has N - d rows of d + 1 entries each,
# and is empty when d >= N.
#
# arr - 1-d array of values.
# d   - number of additional entries per row; must be >= 0.
#
# Raises ArgumentError when d is negative.
def self.displace(arr, d)
  raise ArgumentError, "d must be >= 0" if d < 0

  arr.each_cons(d + 1).to_a
end
|
34
|
+
|
35
|
+
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webminer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Yu Shen
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-03-25 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: I really just mine the web
|
15
|
+
email: yushen83@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/webminer.rb
|
21
|
+
- lib/webminer/util.rb
|
22
|
+
- lib/webminer/constants.rb
|
23
|
+
homepage: https://github.com/yushen
|
24
|
+
licenses: []
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ! '>='
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 1.8.19
|
44
|
+
signing_key:
|
45
|
+
specification_version: 3
|
46
|
+
summary: I mine the web
|
47
|
+
test_files: []
|