webminer 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/webminer.rb +0 -1
  2. metadata +23 -4
  3. data/lib/webminer/constants.rb +0 -57
@@ -229,6 +229,5 @@ class WebMiner
229
229
 
230
230
  end
231
231
 
232
- require 'webminer/constants'
233
232
  require 'webminer/util'
234
233
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webminer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,8 +10,28 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
  date: 2012-03-25 00:00:00.000000000Z
13
- dependencies: []
14
- description: I really just mine the web
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name:
16
+ - mongo
17
+ - mongo_mapper
18
+ - nokogiri
19
+ requirement: !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ! '>='
23
+ - !ruby/object:Gem::Version
24
+ version: '0'
25
+ type: :runtime
26
+ prerelease: false
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ description: use in conjunction with https://github.com/yushen/tflogr, run rails r
34
+ script/webminer_script.rb
15
35
  email: yushen83@gmail.com
16
36
  executables: []
17
37
  extensions: []
@@ -19,7 +39,6 @@ extra_rdoc_files: []
19
39
  files:
20
40
  - lib/webminer.rb
21
41
  - lib/webminer/util.rb
22
- - lib/webminer/constants.rb
23
42
  homepage: https://github.com/yushen
24
43
  licenses: []
25
44
  post_install_message:
@@ -1,57 +0,0 @@
1
- class WebMiner::Constants
2
- def self.get_parser_dictionary
3
- return {
4
- "google.com" => 'div[id="hostednews-article"]',
5
- "cbsnews.com" => 'div[id="contentBody"]',
6
- "reuters.com" => 'span[id="articleText"]',
7
- "latimes.com" => 'div[id="story-body-text"]',
8
- "csmonitor.com" => 'div[id="mainColumn"]',
9
- "npr.org" => 'div[id="storytext"]',
10
- "usatoday.com" => 'div[id="mainstory"]',
11
- "content.usatoday.com" => 'div[id="mainstory"]',
12
- "guardian.co.uk" => 'div[id="article-body-blocks"]',
13
- "nytimes.com" => 'div[id="article"]',
14
- "bloomberg.com" => 'div[id="story_content"]',
15
- "online.wsj.com" => 'div[id="article_story_body"]',
16
- "asia.wsj.com" => 'div[id="article_story_body"]',
17
- "businessweek.com" => 'div[id="story-body"]',
18
- "cnn.com" => 'div[id="cnnContentContainer"]',
19
- "edition.cnn.com" => 'cnn_storyarea[id="cnnContentContainer"]',
20
- "money.cnn.com" => 'div[id="storytext"]',
21
- "abcnews.go.com" => 'div[id="innerbody"]',
22
- "foxnews.com" => 'div[id="introduction"]',
23
- "businessweek.com" => 'div[id="story-body"]',
24
- "entertainment.msnbc.msn.com" => 'div[id="vine-t"] article',
25
- "washingtonpost.com" => 'div[id="article_body"]',
26
- # "bbc.co.uk" => 'div[id="main-content"]',
27
- "huffingtonpost.com" => 'div[id="entry_12345"]',
28
- "telegraph.co.uk" => 'div[id="mainBodyArea"]',
29
- "chicagotribune.com" => 'div[id="story-body-text"]',
30
- "foxbusiness.com" => 'div[id="introduction"]',
31
- "thedailybeast.com" => 'div[id="main"] article',
32
- "economictimes.indiatimes.com" => 'div[id="storydiv"]',
33
- "forbes.com" => 'div[id="leftRail"]',
34
- "arstechnica.com" => 'div[id="story"]',
35
- "theregister.co.uk"=> 'div[id="body"]',
36
- "ingame.msnbc.msn.com"=> 'div[id="vine-t"] article',
37
- "informationweek.com"=> 'span[id="articleBody"]',
38
- "newyorker.com"=> 'div[id="articletext"]',
39
- "kotaku.com"=> 'div[id="page"]',
40
- "slashgear.com"=> 'span[id="intelliTxt"]',
41
- "pcworld.com"=> 'div[id="articleText"]',
42
- "news.cnet.com"=> 'div[id="article"]',
43
- "english.aljazeera.net"=> 'td[id="tdTextContent"]',
44
- "dailymail.co.uk"=> 'div[id="js-article-text"]',
45
- "rttnews.com"=> 'div[id=""]',
46
- "ft.com"=> 'div[id="storyContent"]',
47
- "politico.com"=> 'div[id="mainContent"]',
48
- "boston.com"=> 'div[id="page1"]',
49
- "sfgate.com"=> 'div[id="fontprefs_bottom"]',
50
- "oregonlive.com"=> 'div[id="article"]'
51
- #""=> 'div[id=""]',
52
-
53
- # "wired.com"=> 'div[id=""]'?
54
- #http://latimesblogs.latimes.com ?
55
- }
56
- end
57
- end