webminer 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/webminer.rb ADDED
@@ -0,0 +1,234 @@
1
+ require 'net/http'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'rss/1.0'
5
+ require 'rss/2.0'
6
+ require 'cgi'
7
+ require 'iconv'
8
+
9
+ #Main class, just a place holder
10
+
11
# Polls Google News RSS feeds and stores every linked article as a Story.
#
# The class is only a namespace for class-level methods and is never
# instantiated. Story (presumably an ActiveRecord model -- confirm), Nokogiri,
# Iconv and the RSS parser must be loaded by the surrounding application.
class WebMiner
  # CSS selector of the article-body element for every trusted source, keyed
  # by the host name that create_story derives from the article URL.
  # NOTE: WebMiner::Constants.get_parser_dictionary carries the same table;
  # keep the two in sync.
  PARSER_DICTIONARY = {
    "google.com"                   => 'div[id="hostednews-article"]',
    "cbsnews.com"                  => 'div[id="contentBody"]',
    "reuters.com"                  => 'span[id="articleText"]',
    "latimes.com"                  => 'div[id="story-body-text"]',
    "csmonitor.com"                => 'div[id="mainColumn"]',
    "npr.org"                      => 'div[id="storytext"]',
    "usatoday.com"                 => 'div[id="mainstory"]',
    "content.usatoday.com"         => 'div[id="mainstory"]',
    "guardian.co.uk"               => 'div[id="article-body-blocks"]',
    "nytimes.com"                  => 'div[id="article"]',
    "bloomberg.com"                => 'div[id="story_content"]',
    "online.wsj.com"               => 'div[id="article_story_body"]',
    "asia.wsj.com"                 => 'div[id="article_story_body"]',
    "businessweek.com"             => 'div[id="story-body"]',
    "cnn.com"                      => 'div[id="cnnContentContainer"]',
    "edition.cnn.com"              => 'cnn_storyarea[id="cnnContentContainer"]',
    "money.cnn.com"                => 'div[id="storytext"]',
    "abcnews.go.com"               => 'div[id="innerbody"]',
    "foxnews.com"                  => 'div[id="introduction"]',
    "entertainment.msnbc.msn.com"  => 'div[id="vine-t"] article',
    "washingtonpost.com"           => 'div[id="article_body"]',
    # "bbc.co.uk"                  => 'div[id="main-content"]',
    "huffingtonpost.com"           => 'div[id="entry_12345"]',
    "telegraph.co.uk"              => 'div[id="mainBodyArea"]',
    "chicagotribune.com"           => 'div[id="story-body-text"]',
    "foxbusiness.com"              => 'div[id="introduction"]',
    "thedailybeast.com"            => 'div[id="main"] article',
    "economictimes.indiatimes.com" => 'div[id="storydiv"]',
    "forbes.com"                   => 'div[id="leftRail"]',
    "arstechnica.com"              => 'div[id="story"]',
    "theregister.co.uk"            => 'div[id="body"]',
    "ingame.msnbc.msn.com"         => 'div[id="vine-t"] article',
    "informationweek.com"          => 'span[id="articleBody"]',
    "newyorker.com"                => 'div[id="articletext"]',
    "kotaku.com"                   => 'div[id="page"]',
    "slashgear.com"                => 'span[id="intelliTxt"]',
    "pcworld.com"                  => 'div[id="articleText"]',
    "news.cnet.com"                => 'div[id="article"]',
    "english.aljazeera.net"        => 'td[id="tdTextContent"]',
    "dailymail.co.uk"              => 'div[id="js-article-text"]',
    "rttnews.com"                  => 'div[id=""]',
    "ft.com"                       => 'div[id="storyContent"]',
    "politico.com"                 => 'div[id="mainContent"]',
    "boston.com"                   => 'div[id="page1"]',
    "sfgate.com"                   => 'div[id="fontprefs_bottom"]',
    "oregonlive.com"               => 'div[id="article"]'
    # Still to be figured out:
    #   "wired.com" => 'div[id=""]'?
    #   http://latimesblogs.latimes.com ?
  }.freeze

  # Google News RSS feed polled for each topic label.
  TOPIC_FEEDS = {
    'Top Stories' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&output=rss',
    'U.S.'        => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=n&output=rss',
    'Health'      => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=m&output=rss',
    'Business'    => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=b&output=rss',
    'Technology'  => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=tc&output=rss',
    'World'       => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=w&output=rss'
  }.freeze

  # HTTP request headers sent with every feed request.
  REQUEST_HEADERS = {
    'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
    'Accept'     => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'From'       => 'anon@anon.net'
  }.freeze

  # Captures the host of an http(s) URL without a leading "www.". The <source>
  # tag is rarely set in Google News, so the host name identifies the source,
  # e.g. www.nytimes.com -> nytimes.com. Anchored at the start so that only
  # genuine http(s) links are accepted.
  SOURCE_HOST_PATTERN = %r{\Ahttps?://(?:w{3}\.)?([^/?#]*)}

  # Builds a Story from one RSS item, extracts the article text when the
  # source is listed in the parser dictionary, and saves the story.
  #
  # item  - RSS item responding to link, title, source and date.
  # topic - topic label stored on the story.
  #
  # Returns the story, or nil when the item carries no usable article link,
  # no source can be determined, or a story with the same link already exists.
  def self.create_story(item, topic)
    link = article_link(item.link)
    return nil if link.nil? || link.empty?
    return nil if Story.where(:link => link).exists?

    source = item.source.nil? ? source_from_link(link) : item.source
    return nil if source.nil?

    story = Story.new
    story.title = item.title
    story.link = link
    story.source = source
    story.topic = topic
    story.date_time = item.date && item.date.rfc2822

    puts "Title: #{story.title}"
    puts "Link: #{story.link}"
    puts "Source: #{story.source}"
    puts "Date/time:#{story.date_time}"

    selector = get_parser_dictionary[story.source]
    if selector.nil?
      puts "Skipping text extraction from non-trusted source #{story.source}:#{story.title}"
    else
      text = extract_text(story.link, selector)
      if text.nil?
        puts "Text extraction failed"
      else
        story.content = Iconv.conv('UTF-8', 'ISO_8859-1', text)
      end
    end

    begin
      # save reports validation failures through its return value.
      if story.save
        puts "Saved #{story.title}"
      else
        puts "Could not save #{story.title}"
      end
    rescue SQLite3::ConstraintException, ActiveRecord::RecordNotUnique
      puts "Skipped duplicated story: #{story.title}"
    end

    story
  end

  # Parses RSS content and creates a story for every item in it.
  #
  # content - RSS document as a string.
  # topic   - topic label passed on to create_story.
  #
  # Returns the feed's items, or nil when the feed or its item list is missing.
  def self.parse(content, topic)
    rss = RSS::Parser.parse(content, false)
    items = rss && rss.items
    return nil if items.nil?

    items.each { |item| create_story(item, topic) }
  end

  # Downloads an article and returns the stripped text of the elements
  # matching css_path.
  #
  # link     - article URL; only http and https URLs are fetched, because the
  #            link comes from a feed and must never reach Kernel#open.
  # css_path - CSS selector of the article body.
  #
  # Returns the extracted text, or nil (after printing the error) when the
  # link is not an http(s) URL or anything goes wrong while fetching/parsing.
  def self.extract_text(link, css_path)
    uri = URI.parse(link)
    raise ArgumentError, "unsupported URL scheme" unless uri.is_a?(URI::HTTP)

    # The block form closes the connection even if parsing raises.
    doc = uri.open { |page| Nokogiri::HTML(page) }
    Util.strip_story(doc.css(css_path).to_s)
  rescue => ex
    puts ex.message
    puts "Error fetching link:#{link}"
    nil
  end

  # Returns the frozen table mapping trusted source hosts to the CSS selector
  # of their article body.
  def self.get_parser_dictionary
    PARSER_DICTIONARY
  end

  # Starts one polling thread per topic feed and waits for them. Each worker
  # loops forever, so this call does not return under normal operation.
  def self.main
    prng = Random.new
    # Still published as a global, as before, in case other code reads it.
    $options = REQUEST_HEADERS.dup

    workers = TOPIC_FEEDS.map do |topic, url|
      sleep 5 + prng.rand * 2 # stagger the start of the workers
      Thread.new(topic, url) { |feed_topic, feed_url| poll_feed(feed_topic, feed_url, prng) }
    end

    workers.each(&:join)
  end

  # Extracts the article URL from the "url" query parameter of a feed link.
  # Only the query string is inspected, so the parameter is found even when
  # it is the first one. Returns nil when there is no such parameter.
  def self.article_link(feed_link)
    query = feed_link.to_s.split('?', 2)[1]
    return nil if query.nil?

    query.split(/[&;]/).each do |pair|
      key, value = pair.split('=', 2)
      return CGI.unescape(value) if key == 'url' && value
    end
    nil
  end

  # Derives the source name (host without "www.") from an article link.
  # Returns nil when the link is not an http(s) URL with a host.
  def self.source_from_link(link)
    host = link[SOURCE_HOST_PATTERN, 1]
    host unless host.nil? || host.empty?
  end

  # Polls one feed forever, parsing it whenever its content changes.
  def self.poll_feed(topic, url, prng)
    print "I'm working on #{url}\n"
    last_hash = 0

    loop do
      content = fetch_feed(url, $options)
      unless content.nil?
        new_hash = content.hash
        print "Got content: #{content.length}, hash: #{new_hash}\n"

        if new_hash != last_hash
          print "Parsing\n"
          begin
            parse(content, topic)
          rescue StandardError => ex
            # One bad feed or story must not end the polling thread.
            puts "Error processing #{url}: #{ex.message}"
          end
        else
          print "Skipping\n"
        end
        last_hash = new_hash
      end
      sleep 30 + prng.rand * 2
    end
  end

  # Fetches a feed. Returns its body, or nil (after printing the reason) when
  # the request fails, so that failed requests are never parsed.
  def self.fetch_feed(url, headers)
    URI.parse(url).open(headers) { |stream| stream.read }
  rescue OpenURI::HTTPError
    puts "Google caught us again! #{url}"
    nil
  rescue SocketError, SystemCallError, Timeout::Error => ex
    puts ex.message
    nil
  end

  private_class_method :article_link, :source_from_link, :poll_feed, :fetch_feed
end
231
+
232
+ require 'webminer/constants'
233
+ require 'webminer/util'
234
+
@@ -0,0 +1,57 @@
1
# Reopened in nested form (instead of `class WebMiner::Constants`) so that
# this file also loads when WebMiner has not been defined yet.
class WebMiner
  # Lookup tables used by WebMiner.
  class Constants
    # CSS selector of the article-body element for every trusted source,
    # keyed by source host name (without a leading "www.").
    PARSER_DICTIONARY = {
      "google.com"                   => 'div[id="hostednews-article"]',
      "cbsnews.com"                  => 'div[id="contentBody"]',
      "reuters.com"                  => 'span[id="articleText"]',
      "latimes.com"                  => 'div[id="story-body-text"]',
      "csmonitor.com"                => 'div[id="mainColumn"]',
      "npr.org"                      => 'div[id="storytext"]',
      "usatoday.com"                 => 'div[id="mainstory"]',
      "content.usatoday.com"         => 'div[id="mainstory"]',
      "guardian.co.uk"               => 'div[id="article-body-blocks"]',
      "nytimes.com"                  => 'div[id="article"]',
      "bloomberg.com"                => 'div[id="story_content"]',
      "online.wsj.com"               => 'div[id="article_story_body"]',
      "asia.wsj.com"                 => 'div[id="article_story_body"]',
      # Was listed twice with the same selector; one entry is enough.
      "businessweek.com"             => 'div[id="story-body"]',
      "cnn.com"                      => 'div[id="cnnContentContainer"]',
      "edition.cnn.com"              => 'cnn_storyarea[id="cnnContentContainer"]',
      "money.cnn.com"                => 'div[id="storytext"]',
      "abcnews.go.com"               => 'div[id="innerbody"]',
      "foxnews.com"                  => 'div[id="introduction"]',
      "entertainment.msnbc.msn.com"  => 'div[id="vine-t"] article',
      "washingtonpost.com"           => 'div[id="article_body"]',
      # "bbc.co.uk"                  => 'div[id="main-content"]',
      "huffingtonpost.com"           => 'div[id="entry_12345"]',
      "telegraph.co.uk"              => 'div[id="mainBodyArea"]',
      "chicagotribune.com"           => 'div[id="story-body-text"]',
      "foxbusiness.com"              => 'div[id="introduction"]',
      "thedailybeast.com"            => 'div[id="main"] article',
      "economictimes.indiatimes.com" => 'div[id="storydiv"]',
      "forbes.com"                   => 'div[id="leftRail"]',
      "arstechnica.com"              => 'div[id="story"]',
      "theregister.co.uk"            => 'div[id="body"]',
      "ingame.msnbc.msn.com"         => 'div[id="vine-t"] article',
      "informationweek.com"          => 'span[id="articleBody"]',
      "newyorker.com"                => 'div[id="articletext"]',
      "kotaku.com"                   => 'div[id="page"]',
      "slashgear.com"                => 'span[id="intelliTxt"]',
      "pcworld.com"                  => 'div[id="articleText"]',
      "news.cnet.com"                => 'div[id="article"]',
      "english.aljazeera.net"        => 'td[id="tdTextContent"]',
      "dailymail.co.uk"              => 'div[id="js-article-text"]',
      "rttnews.com"                  => 'div[id=""]',
      "ft.com"                       => 'div[id="storyContent"]',
      "politico.com"                 => 'div[id="mainContent"]',
      "boston.com"                   => 'div[id="page1"]',
      "sfgate.com"                   => 'div[id="fontprefs_bottom"]',
      "oregonlive.com"               => 'div[id="article"]'
      # Still to be figured out:
      #   "wired.com" => 'div[id=""]'?
      #   http://latimesblogs.latimes.com ?
    }.freeze

    # Returns the frozen source-host => CSS-selector table. The table is
    # built once at load time instead of on every call.
    def self.get_parser_dictionary
      PARSER_DICTIONARY
    end
  end
end
@@ -0,0 +1,35 @@
1
# Reopened in nested form (instead of `class WebMiner::Util`) so that this
# file also loads when WebMiner has not been defined yet.
class WebMiner
  # Stateless helpers for cleaning extracted HTML and reshaping arrays.
  class Util
    # Removes every <script>...</script> element, including its content.
    def self.strip_js(raw_text)
      raw_text.gsub(/<script.*?<\/script>/im, '')
    end

    # Removes every <style>...</style> element, including its content.
    def self.strip_stylesheet(raw_text)
      raw_text.gsub(/<style.*?<\/style>/im, '')
    end

    # Removes all remaining tags, keeping the text between them.
    def self.strip_tags(raw_text)
      raw_text.gsub(/<.*?>/im, '')
    end

    # Replaces each run of characters other than ASCII letters, digits and
    # "-" with a single space.
    def self.strip_specialchars(raw_text)
      raw_text.gsub(/[^A-Za-z0-9-]+/, ' ')
    end

    # Reduces an HTML fragment to plain words. Scripts and stylesheets are
    # removed first so their content does not survive the tag stripping.
    def self.strip_story(raw_text)
      without_scripts = strip_js(raw_text)
      without_styles = strip_stylesheet(without_scripts)
      strip_specialchars(strip_tags(without_styles))
    end

    # Builds the sliding windows of d + 1 consecutive elements of arr: row i
    # is [arr[i], arr[i + 1], ..., arr[i + d]]. For an array of N elements
    # the result has N - d rows (none when d >= N) of d + 1 columns.
    #
    # arr - one-dimensional array.
    # d   - number of following elements appended to each row; must be >= 0.
    #
    # Raises ArgumentError when d is negative.
    def self.displace(arr, d)
      arr.each_cons(d + 1).to_a
    end
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webminer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Yu Shen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-25 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: I really just mine the web
15
+ email: yushen83@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/webminer.rb
21
+ - lib/webminer/util.rb
22
+ - lib/webminer/constants.rb
23
+ homepage: https://github.com/yushen
24
+ licenses: []
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 1.8.19
44
+ signing_key:
45
+ specification_version: 3
46
+ summary: I mine the web
47
+ test_files: []