webminer 0.0.0 → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/webminer.rb +0 -1
- metadata +23 -4
- data/lib/webminer/constants.rb +0 -57
data/lib/webminer.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webminer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,8 +10,28 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
date: 2012-03-25 00:00:00.000000000Z
|
13
|
-
dependencies:
|
14
|
-
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name:
|
16
|
+
- mongo
|
17
|
+
- mongo_mapper
|
18
|
+
- nokogiri
|
19
|
+
requirement: !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ! '>='
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: '0'
|
25
|
+
type: :runtime
|
26
|
+
prerelease: false
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
description: use in conjunction with https://github.com/yushen/tflogr, run rails r
|
34
|
+
script/webminer_script.rb
|
15
35
|
email: yushen83@gmail.com
|
16
36
|
executables: []
|
17
37
|
extensions: []
|
@@ -19,7 +39,6 @@ extra_rdoc_files: []
|
|
19
39
|
files:
|
20
40
|
- lib/webminer.rb
|
21
41
|
- lib/webminer/util.rb
|
22
|
-
- lib/webminer/constants.rb
|
23
42
|
homepage: https://github.com/yushen
|
24
43
|
licenses: []
|
25
44
|
post_install_message:
|
data/lib/webminer/constants.rb
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
class WebMiner::Constants
|
2
|
-
def self.get_parser_dictionary
|
3
|
-
return {
|
4
|
-
"google.com" => 'div[id="hostednews-article"]',
|
5
|
-
"cbsnews.com" => 'div[id="contentBody"]',
|
6
|
-
"reuters.com" => 'span[id="articleText"]',
|
7
|
-
"latimes.com" => 'div[id="story-body-text"]',
|
8
|
-
"csmonitor.com" => 'div[id="mainColumn"]',
|
9
|
-
"npr.org" => 'div[id="storytext"]',
|
10
|
-
"usatoday.com" => 'div[id="mainstory"]',
|
11
|
-
"content.usatoday.com" => 'div[id="mainstory"]',
|
12
|
-
"guardian.co.uk" => 'div[id="article-body-blocks"]',
|
13
|
-
"nytimes.com" => 'div[id="article"]',
|
14
|
-
"bloomberg.com" => 'div[id="story_content"]',
|
15
|
-
"online.wsj.com" => 'div[id="article_story_body"]',
|
16
|
-
"asia.wsj.com" => 'div[id="article_story_body"]',
|
17
|
-
"businessweek.com" => 'div[id="story-body"]',
|
18
|
-
"cnn.com" => 'div[id="cnnContentContainer"]',
|
19
|
-
"edition.cnn.com" => 'cnn_storyarea[id="cnnContentContainer"]',
|
20
|
-
"money.cnn.com" => 'div[id="storytext"]',
|
21
|
-
"abcnews.go.com" => 'div[id="innerbody"]',
|
22
|
-
"foxnews.com" => 'div[id="introduction"]',
|
23
|
-
"businessweek.com" => 'div[id="story-body"]',
|
24
|
-
"entertainment.msnbc.msn.com" => 'div[id="vine-t"] article',
|
25
|
-
"washingtonpost.com" => 'div[id="article_body"]',
|
26
|
-
# "bbc.co.uk" => 'div[id="main-content"]',
|
27
|
-
"huffingtonpost.com" => 'div[id="entry_12345"]',
|
28
|
-
"telegraph.co.uk" => 'div[id="mainBodyArea"]',
|
29
|
-
"chicagotribune.com" => 'div[id="story-body-text"]',
|
30
|
-
"foxbusiness.com" => 'div[id="introduction"]',
|
31
|
-
"thedailybeast.com" => 'div[id="main"] article',
|
32
|
-
"economictimes.indiatimes.com" => 'div[id="storydiv"]',
|
33
|
-
"forbes.com" => 'div[id="leftRail"]',
|
34
|
-
"arstechnica.com" => 'div[id="story"]',
|
35
|
-
"theregister.co.uk"=> 'div[id="body"]',
|
36
|
-
"ingame.msnbc.msn.com"=> 'div[id="vine-t"] article',
|
37
|
-
"informationweek.com"=> 'span[id="articleBody"]',
|
38
|
-
"newyorker.com"=> 'div[id="articletext"]',
|
39
|
-
"kotaku.com"=> 'div[id="page"]',
|
40
|
-
"slashgear.com"=> 'span[id="intelliTxt"]',
|
41
|
-
"pcworld.com"=> 'div[id="articleText"]',
|
42
|
-
"news.cnet.com"=> 'div[id="article"]',
|
43
|
-
"english.aljazeera.net"=> 'td[id="tdTextContent"]',
|
44
|
-
"dailymail.co.uk"=> 'div[id="js-article-text"]',
|
45
|
-
"rttnews.com"=> 'div[id=""]',
|
46
|
-
"ft.com"=> 'div[id="storyContent"]',
|
47
|
-
"politico.com"=> 'div[id="mainContent"]',
|
48
|
-
"boston.com"=> 'div[id="page1"]',
|
49
|
-
"sfgate.com"=> 'div[id="fontprefs_bottom"]',
|
50
|
-
"oregonlive.com"=> 'div[id="article"]'
|
51
|
-
#""=> 'div[id=""]',
|
52
|
-
|
53
|
-
# "wired.com"=> 'div[id=""]'?
|
54
|
-
#http://latimesblogs.latimes.com ?
|
55
|
-
}
|
56
|
-
end
|
57
|
-
end
|