webminer 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/webminer.rb ADDED
@@ -0,0 +1,234 @@
1
+ require 'net/http'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'rss/1.0'
5
+ require 'rss/2.0'
6
+ require 'cgi'
7
+ require 'iconv'
8
+
9
+ #Main class, just a place holder
10
+
11
+ class WebMiner
12
+
13
+ def self.create_story(item, topic)
14
+
15
+ params=CGI::parse(item.link)
16
+
17
+ link = params["url"][0]
18
+
19
+ if Story.where(:link => link).exists?
20
+ return nil
21
+ end
22
+
23
+ story = Story.new
24
+ story.title=item.title
25
+ story.link = link
26
+
27
+ if item.source != nil
28
+ story.source = item.source
29
+ else if story.link != nil
30
+
31
+ matchdata = story.link.match'https?://(?:w{3}\.)?(.*?)/.*' #Since <source> tag is rarely set in Google News, we use domain name to identify source, e.g. www.nytimes.com, but still, only http(s) is going to work
32
+ story.source=matchdata[1]
33
+ else
34
+ return nil
35
+ end
36
+
37
+
38
+ end #if
39
+
40
+ story.topic=topic
41
+ story.date_time=item.date.rfc2822
42
+
43
+ puts "Title: "+story.title
44
+ puts "Link: "+story.link
45
+ puts "Source: "+story.source
46
+ puts "Date/time:"+story.date_time.to_s
47
+
48
+ # raw_text = self.read_story(story.link)
49
+ if !(self.get_parser_dictionary.keys.include? story.source)
50
+ puts "Skipping text extraction from non-trusted source "+story.source+":"+story.title
51
+ # story.raw_content = open(story.link) do |s| Iconv.conv('UTF-8','ISO_8859-1',s.read) end
52
+ else
53
+ story.content = Iconv.conv('UTF-8','ISO_8859-1',self.extract_text(story.link, self.get_parser_dictionary[story.source]))
54
+ if story.content == nil
55
+ puts "Text extraction failed"
56
+ end
57
+ end
58
+
59
+ begin
60
+ story.save
61
+ puts "Saved "+story.title
62
+ rescue SQLite3::ConstraintException,ActiveRecord::RecordNotUnique => ex
63
+ puts "Skipped duplicated story: "+story.title
64
+ end
65
+
66
+ #doc.css 'div[class="storyText"]' cbsnews.com
67
+ #'span[id="articleText"]' reuters
68
+ return story
69
+
70
+ end #def
71
+
72
+ def self.parse(content, topic)
73
+ rss = RSS::Parser::parse(content, false)
74
+ if rss != nil
75
+ items = rss.items
76
+ if items != nil
77
+ for item in items
78
+ story = self.create_story(item, topic)
79
+ #print "Created story: #{story}\n"
80
+ #story.save
81
+ end #for
82
+ end
83
+ end
84
+ end #def
85
+
86
+ def self.extract_text(link, css_path)
87
+ begin
88
+ f=open(link)
89
+ doc = Nokogiri::HTML(f)
90
+ text_nodes = doc.css(css_path)
91
+ raw_text = text_nodes.to_s
92
+ f.close
93
+ return Util.strip_story(raw_text)
94
+ rescue => ex
95
+ puts ex.message
96
+ puts "Error fetching link:"+link
97
+ return nil
98
+ end
99
+ end
100
+
101
+ def self.get_parser_dictionary
102
+ return {
103
+ "google.com" => 'div[id="hostednews-article"]',
104
+ "cbsnews.com" => 'div[id="contentBody"]',
105
+ "reuters.com" => 'span[id="articleText"]',
106
+ "latimes.com" => 'div[id="story-body-text"]',
107
+ "csmonitor.com" => 'div[id="mainColumn"]',
108
+ "npr.org" => 'div[id="storytext"]',
109
+ "usatoday.com" => 'div[id="mainstory"]',
110
+ "content.usatoday.com" => 'div[id="mainstory"]',
111
+ "guardian.co.uk" => 'div[id="article-body-blocks"]',
112
+ "nytimes.com" => 'div[id="article"]',
113
+ "bloomberg.com" => 'div[id="story_content"]',
114
+ "online.wsj.com" => 'div[id="article_story_body"]',
115
+ "asia.wsj.com" => 'div[id="article_story_body"]',
116
+ "businessweek.com" => 'div[id="story-body"]',
117
+ "cnn.com" => 'div[id="cnnContentContainer"]',
118
+ "edition.cnn.com" => 'cnn_storyarea[id="cnnContentContainer"]',
119
+ "money.cnn.com" => 'div[id="storytext"]',
120
+ "abcnews.go.com" => 'div[id="innerbody"]',
121
+ "foxnews.com" => 'div[id="introduction"]',
122
+ "businessweek.com" => 'div[id="story-body"]',
123
+ "entertainment.msnbc.msn.com" => 'div[id="vine-t"] article',
124
+ "washingtonpost.com" => 'div[id="article_body"]',
125
+ # "bbc.co.uk" => 'div[id="main-content"]',
126
+ "huffingtonpost.com" => 'div[id="entry_12345"]',
127
+ "telegraph.co.uk" => 'div[id="mainBodyArea"]',
128
+ "chicagotribune.com" => 'div[id="story-body-text"]',
129
+ "foxbusiness.com" => 'div[id="introduction"]',
130
+ "thedailybeast.com" => 'div[id="main"] article',
131
+ "economictimes.indiatimes.com" => 'div[id="storydiv"]',
132
+ "forbes.com" => 'div[id="leftRail"]',
133
+ "arstechnica.com" => 'div[id="story"]',
134
+ "theregister.co.uk"=> 'div[id="body"]',
135
+ "ingame.msnbc.msn.com"=> 'div[id="vine-t"] article',
136
+ "informationweek.com"=> 'span[id="articleBody"]',
137
+ "newyorker.com"=> 'div[id="articletext"]',
138
+ "kotaku.com"=> 'div[id="page"]',
139
+ "slashgear.com"=> 'span[id="intelliTxt"]',
140
+ "pcworld.com"=> 'div[id="articleText"]',
141
+ "news.cnet.com"=> 'div[id="article"]',
142
+ "english.aljazeera.net"=> 'td[id="tdTextContent"]',
143
+ "dailymail.co.uk"=> 'div[id="js-article-text"]',
144
+ "rttnews.com"=> 'div[id=""]',
145
+ "ft.com"=> 'div[id="storyContent"]',
146
+ "politico.com"=> 'div[id="mainContent"]',
147
+ "boston.com"=> 'div[id="page1"]',
148
+ "sfgate.com"=> 'div[id="fontprefs_bottom"]',
149
+ "oregonlive.com"=> 'div[id="article"]'
150
+ #""=> 'div[id=""]',
151
+
152
+ # "wired.com"=> 'div[id=""]'?
153
+ #http://latimesblogs.latimes.com ?
154
+ }
155
+ end
156
+
157
+
158
+ def self.main
159
+ prng = Random.new
160
+ topics = Hash.new(0)
161
+ topics['Top Stories'] = 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&output=rss'
162
+ topics['U.S.'] = 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=n&output=rss'
163
+ topics['Health'] = 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=m&output=rss'
164
+ topics['Business'] = 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=b&output=rss'
165
+ topics['Technology'] = 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=tc&output=rss'
166
+ topics['World'] = 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=w&output=rss'
167
+
168
+ topic_threads = []
169
+ $options = {}
170
+ $options['User-Agent']='Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1'
171
+ $options['Accept']='text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
172
+ $options['From']='anon@anon.net'
173
+ topics.keys.each do |topic|
174
+ url = topics[topic]
175
+ sleep 5+prng.rand*2
176
+ topic_threads << Thread.new(url) do
177
+ #uri = URI.parse(url)
178
+
179
+ #http = Net::HTTP.new(uri.host, uri.port)
180
+ #request = Net::HTTP::Get.new(uri.request_uri)
181
+ print "I'm working on #{url}\n"
182
+
183
+ hash = 0
184
+
185
+ while true
186
+ #response = http.request(request)
187
+ #code = response.code
188
+ #body = response.body
189
+ content = ""
190
+ begin
191
+ open(url,$options) do |s| content = s.read end
192
+ rescue OpenURI::HTTPError => ex
193
+ puts "Google caught us again! "+url
194
+ # rescue => ex
195
+ # puts "Unhandled exception:"+ex.message
196
+ rescue SocketError => ex
197
+ puts ex.message
198
+ rescue Timeout::Error => ex
199
+ puts ex.message
200
+ end
201
+ new_hash = content.hash
202
+
203
+ # Headers are lowercased
204
+ #response["cache-control"] # => public, max-age=2592000
205
+
206
+
207
+
208
+ #print "Got code: #{code}\n" # => 301
209
+ print "Got content: #{content.length}, hash: #{new_hash}\n" # => The body (HTML, XML, blob, whatever)
210
+
211
+ if new_hash != hash
212
+ print "Parsing\n"
213
+ self.parse(content, topic)
214
+ else
215
+ print "Skipping\n"
216
+ end
217
+ hash = new_hash
218
+ sleep 30+prng.rand*2
219
+ end #while
220
+ end #thread
221
+ end #each
222
+
223
+
224
+ topic_threads.each do |thr|
225
+ thr.join
226
+ end #each
227
+
228
+ end #main
229
+
230
+ end
231
+
232
+ require 'webminer/constants'
233
+ require 'webminer/util'
234
+
@@ -0,0 +1,57 @@
1
# Declared in nested form so the file also loads when WebMiner has not been
# defined yet; the compact "class WebMiner::Constants" form raises NameError
# in that case.
class WebMiner
  # Static lookup data for the miner.
  class Constants
    # Source domain => CSS selector of the element holding the article body.
    # Built once instead of on every call. Each domain is listed once; the
    # earlier literal repeated "businessweek.com" with the same selector.
    PARSER_DICTIONARY = {
      "google.com" => 'div[id="hostednews-article"]',
      "cbsnews.com" => 'div[id="contentBody"]',
      "reuters.com" => 'span[id="articleText"]',
      "latimes.com" => 'div[id="story-body-text"]',
      "csmonitor.com" => 'div[id="mainColumn"]',
      "npr.org" => 'div[id="storytext"]',
      "usatoday.com" => 'div[id="mainstory"]',
      "content.usatoday.com" => 'div[id="mainstory"]',
      "guardian.co.uk" => 'div[id="article-body-blocks"]',
      "nytimes.com" => 'div[id="article"]',
      "bloomberg.com" => 'div[id="story_content"]',
      "online.wsj.com" => 'div[id="article_story_body"]',
      "asia.wsj.com" => 'div[id="article_story_body"]',
      "businessweek.com" => 'div[id="story-body"]',
      "cnn.com" => 'div[id="cnnContentContainer"]',
      "edition.cnn.com" => 'cnn_storyarea[id="cnnContentContainer"]',
      "money.cnn.com" => 'div[id="storytext"]',
      "abcnews.go.com" => 'div[id="innerbody"]',
      "foxnews.com" => 'div[id="introduction"]',
      "entertainment.msnbc.msn.com" => 'div[id="vine-t"] article',
      "washingtonpost.com" => 'div[id="article_body"]',
      # "bbc.co.uk" => 'div[id="main-content"]',
      "huffingtonpost.com" => 'div[id="entry_12345"]',
      "telegraph.co.uk" => 'div[id="mainBodyArea"]',
      "chicagotribune.com" => 'div[id="story-body-text"]',
      "foxbusiness.com" => 'div[id="introduction"]',
      "thedailybeast.com" => 'div[id="main"] article',
      "economictimes.indiatimes.com" => 'div[id="storydiv"]',
      "forbes.com" => 'div[id="leftRail"]',
      "arstechnica.com" => 'div[id="story"]',
      "theregister.co.uk" => 'div[id="body"]',
      "ingame.msnbc.msn.com" => 'div[id="vine-t"] article',
      "informationweek.com" => 'span[id="articleBody"]',
      "newyorker.com" => 'div[id="articletext"]',
      "kotaku.com" => 'div[id="page"]',
      "slashgear.com" => 'span[id="intelliTxt"]',
      "pcworld.com" => 'div[id="articleText"]',
      "news.cnet.com" => 'div[id="article"]',
      "english.aljazeera.net" => 'td[id="tdTextContent"]',
      "dailymail.co.uk" => 'div[id="js-article-text"]',
      # NOTE(review): empty id looks like an unfinished placeholder - confirm.
      "rttnews.com" => 'div[id=""]',
      "ft.com" => 'div[id="storyContent"]',
      "politico.com" => 'div[id="mainContent"]',
      "boston.com" => 'div[id="page1"]',
      "sfgate.com" => 'div[id="fontprefs_bottom"]',
      "oregonlive.com" => 'div[id="article"]'
      # "" => 'div[id=""]',

      # "wired.com" => 'div[id=""]'?
      # http://latimesblogs.latimes.com ?
    }.freeze

    # Returns a Hash mapping source domain to CSS selector. A fresh copy is
    # handed out on every call, so callers may modify the result freely.
    def self.get_parser_dictionary
      PARSER_DICTIONARY.dup
    end
  end
end
@@ -0,0 +1,35 @@
1
# Declared in nested form so the file also loads when WebMiner has not been
# defined yet; the compact "class WebMiner::Util" form raises NameError in
# that case.
class WebMiner
  # Stateless text-cleaning and array helpers.
  class Util
    # Removes every <script>...</script> element, including its content.
    # Matching is case-insensitive and spans line breaks.
    def self.strip_js(raw_text)
      raw_text.gsub(/<script.*?<\/script>/im, '')
    end

    # Removes every <style>...</style> element, including its content.
    def self.strip_stylesheet(raw_text)
      raw_text.gsub(/<style.*?<\/style>/im, '')
    end

    # Removes anything that looks like a tag (<...>), keeping the text
    # between tags.
    def self.strip_tags(raw_text)
      raw_text.gsub(/<.*?>/im, '')
    end

    # Replaces each run of characters other than ASCII letters, digits and
    # "-" with a single space.
    def self.strip_specialchars(raw_text)
      raw_text.gsub(/[^A-Za-z0-9-]+/, ' ')
    end

    # Reduces an HTML fragment to plain words. Scripts and stylesheets go
    # first so their content does not survive the tag removal, then the
    # remaining tags, then special characters.
    def self.strip_story(raw_text)
      without_code = strip_stylesheet(strip_js(raw_text))
      strip_specialchars(strip_tags(without_code))
    end

    # Builds lagged rows from a 1-d sequence: row i is
    # [arr[i], arr[i+1], ..., arr[i+d]].
    #
    # arr - indexable sequence of length N.
    # d   - number of extra elements per row; intended to be >= 0.
    #
    # Returns an Array of N-d rows with d+1 elements each; empty when d >= N.
    def self.displace(arr, d)
      row_count = arr.length - d
      (0...row_count).map do |start|
        (start..start + d).map { |index| arr[index] }
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webminer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Yu Shen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-25 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: I really just mine the web
15
+ email: yushen83@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/webminer.rb
21
+ - lib/webminer/util.rb
22
+ - lib/webminer/constants.rb
23
+ homepage: https://github.com/yushen
24
+ licenses: []
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 1.8.19
44
+ signing_key:
45
+ specification_version: 3
46
+ summary: I mine the web
47
+ test_files: []