webminer 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. data/lib/webminer.rb +0 -1
  2. metadata +23 -4
  3. data/lib/webminer/constants.rb +0 -57
@@ -229,6 +229,5 @@ class WebMiner
229
229
 
230
230
  end
231
231
 
232
- require 'webminer/constants'
233
232
  require 'webminer/util'
234
233
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webminer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,8 +10,28 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
  date: 2012-03-25 00:00:00.000000000Z
13
- dependencies: []
14
- description: I really just mine the web
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name:
16
+ - mongo
17
+ - mongo_mapper
18
+ - nokogiri
19
+ requirement: !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ! '>='
23
+ - !ruby/object:Gem::Version
24
+ version: '0'
25
+ type: :runtime
26
+ prerelease: false
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ description: use in conjunction with https://github.com/yushen/tflogr, run rails r
34
+ script/webminer_script.rb
15
35
  email: yushen83@gmail.com
16
36
  executables: []
17
37
  extensions: []
@@ -19,7 +39,6 @@ extra_rdoc_files: []
19
39
  files:
20
40
  - lib/webminer.rb
21
41
  - lib/webminer/util.rb
22
- - lib/webminer/constants.rb
23
42
  homepage: https://github.com/yushen
24
43
  licenses: []
25
44
  post_install_message:
@@ -1,57 +0,0 @@
1
- class WebMiner::Constants
2
- def self.get_parser_dictionary
3
- return {
4
- "google.com" => 'div[id="hostednews-article"]',
5
- "cbsnews.com" => 'div[id="contentBody"]',
6
- "reuters.com" => 'span[id="articleText"]',
7
- "latimes.com" => 'div[id="story-body-text"]',
8
- "csmonitor.com" => 'div[id="mainColumn"]',
9
- "npr.org" => 'div[id="storytext"]',
10
- "usatoday.com" => 'div[id="mainstory"]',
11
- "content.usatoday.com" => 'div[id="mainstory"]',
12
- "guardian.co.uk" => 'div[id="article-body-blocks"]',
13
- "nytimes.com" => 'div[id="article"]',
14
- "bloomberg.com" => 'div[id="story_content"]',
15
- "online.wsj.com" => 'div[id="article_story_body"]',
16
- "asia.wsj.com" => 'div[id="article_story_body"]',
17
- "businessweek.com" => 'div[id="story-body"]',
18
- "cnn.com" => 'div[id="cnnContentContainer"]',
19
- "edition.cnn.com" => 'cnn_storyarea[id="cnnContentContainer"]',
20
- "money.cnn.com" => 'div[id="storytext"]',
21
- "abcnews.go.com" => 'div[id="innerbody"]',
22
- "foxnews.com" => 'div[id="introduction"]',
23
- "businessweek.com" => 'div[id="story-body"]',
24
- "entertainment.msnbc.msn.com" => 'div[id="vine-t"] article',
25
- "washingtonpost.com" => 'div[id="article_body"]',
26
- # "bbc.co.uk" => 'div[id="main-content"]',
27
- "huffingtonpost.com" => 'div[id="entry_12345"]',
28
- "telegraph.co.uk" => 'div[id="mainBodyArea"]',
29
- "chicagotribune.com" => 'div[id="story-body-text"]',
30
- "foxbusiness.com" => 'div[id="introduction"]',
31
- "thedailybeast.com" => 'div[id="main"] article',
32
- "economictimes.indiatimes.com" => 'div[id="storydiv"]',
33
- "forbes.com" => 'div[id="leftRail"]',
34
- "arstechnica.com" => 'div[id="story"]',
35
- "theregister.co.uk"=> 'div[id="body"]',
36
- "ingame.msnbc.msn.com"=> 'div[id="vine-t"] article',
37
- "informationweek.com"=> 'span[id="articleBody"]',
38
- "newyorker.com"=> 'div[id="articletext"]',
39
- "kotaku.com"=> 'div[id="page"]',
40
- "slashgear.com"=> 'span[id="intelliTxt"]',
41
- "pcworld.com"=> 'div[id="articleText"]',
42
- "news.cnet.com"=> 'div[id="article"]',
43
- "english.aljazeera.net"=> 'td[id="tdTextContent"]',
44
- "dailymail.co.uk"=> 'div[id="js-article-text"]',
45
- "rttnews.com"=> 'div[id=""]',
46
- "ft.com"=> 'div[id="storyContent"]',
47
- "politico.com"=> 'div[id="mainContent"]',
48
- "boston.com"=> 'div[id="page1"]',
49
- "sfgate.com"=> 'div[id="fontprefs_bottom"]',
50
- "oregonlive.com"=> 'div[id="article"]'
51
- #""=> 'div[id=""]',
52
-
53
- # "wired.com"=> 'div[id=""]'?
54
- #http://latimesblogs.latimes.com ?
55
- }
56
- end
57
- end