upton 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/test/test_upton.rb CHANGED
@@ -1,9 +1,11 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'test/unit'
2
4
  require 'rack'
3
5
  require 'thin'
4
6
  require 'nokogiri'
5
7
  require 'restclient'
6
- require 'upton'
8
+ require './lib/upton'
7
9
  require 'fileutils'
8
10
 
9
11
  module Upton
@@ -34,22 +36,79 @@ module Upton
34
36
  start_test_server()
35
37
 
36
38
  headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
37
- "Discussion: Military Lending and Debt",
39
+ "",
38
40
  "A Prosecutor, a Wrongful Conviction and a Question of Justice",
39
41
  "Six Facts Lost in the IRS Scandal"]
40
42
 
41
43
  propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
42
- propubscraper.verbose = false
43
- propubscraper.debug = false
44
+ propubscraper.debug = true
45
+ propubscraper.verbose = true
44
46
 
45
- propubscraper.scrape do |article_str|
47
+ heds = propubscraper.scrape do |article_str|
46
48
  doc = Nokogiri::HTML(article_str)
47
49
  hed = doc.css('h1.article-title').text
48
- assert_equal(hed, headlines.shift)
49
50
  end
51
+ assert_equal(heds, headlines)
50
52
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
51
53
  end
52
54
 
55
+ def test_encodings
56
+ skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
57
+ end
58
+
59
+ def test_stashing
60
+ skip "should test stashing, make sure we never send too many requests"
61
+ end
62
+
63
+ def test_scrape_list
64
+ #this doesn't test stashing.
65
+ #TODO: needs a website that has links to a multi-page list (or table)
66
+ start_test_server()
67
+
68
+ most_commented_heds = [["Six Facts Lost in the IRS Scandal",
69
+ "How the IRS’s Nonprofit Division Got So Dysfunctional",
70
+ "Sound, Fury and the IRS Mess",
71
+ "The Most Important #Muckreads on Rape in the Military",
72
+ "Congressmen to Hagel: Where Are the Missing War Records?",
73
+ "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
74
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
75
+ "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
76
+ "The Story Behind Our Hospital Interactive",
77
+ "irs-test-charts-for-embedding"]]
78
+
79
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
80
+ propubscraper.debug = true
81
+ propubscraper.verbose = true
82
+ list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
83
+
84
+ assert_equal(list, most_commented_heds)
85
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
86
+ end
87
+
88
+ def test_scrape_table
89
+ #this doesn't test stashing.
90
+ start_test_server()
91
+
92
+ east_timor_prime_ministers = [[
93
+ ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
94
+ "1", "2", "3", "4",],
95
+ [],
96
+ ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
97
+ ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
98
+ ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
99
+ ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
100
+ ]]
101
+
102
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
103
+ propubscraper.debug = true
104
+ propubscraper.verbose = true
105
+ table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
106
+ assert_equal(table, east_timor_prime_ministers)
107
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
108
+ end
109
+
110
+
111
+
53
112
  private
54
113
  def start_test_server
55
114
  @server_thread = Thread.new do
@@ -67,12 +126,12 @@ module Upton
67
126
  @root = File.expand_path(File.dirname(__FILE__))
68
127
  path = Rack::Utils.unescape(env['PATH_INFO'])
69
128
  path += 'index.html' if path == '/'
70
- file = @root + "#{path}"
129
+ file = File.join(@root, "data", path)
71
130
 
72
131
  params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
73
132
 
74
133
  if File.exists?(file)
75
- [ 200, {"Content-Type" => "text/html"}, File.read(file) ]
134
+ [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
76
135
  else
77
136
  [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
78
137
  end
metadata CHANGED
@@ -1,114 +1,158 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: upton
3
- version: !ruby/object:Gem::Version
4
- version: 0.1.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ prerelease:
5
6
  platform: ruby
6
- authors:
7
+ authors:
7
8
  - Jeremy B. Merrill
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
-
12
- date: 2013-05-29 00:00:00 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
12
+ date: 2013-06-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
15
  name: rack
16
- prerelease: false
17
- requirement: &id001 !ruby/object:Gem::Requirement
18
- requirements:
19
- - &id002
20
- - ">="
21
- - !ruby/object:Gem::Version
22
- version: "0"
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
23
22
  type: :development
24
- version_requirements: *id001
25
- - !ruby/object:Gem::Dependency
26
- name: thin
27
23
  prerelease: false
28
- requirement: &id003 !ruby/object:Gem::Requirement
29
- requirements:
30
- - *id002
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: thin
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
31
38
  type: :development
32
- version_requirements: *id003
33
- - !ruby/object:Gem::Dependency
34
- name: nokogiri
35
39
  prerelease: false
36
- requirement: &id004 !ruby/object:Gem::Requirement
37
- requirements:
38
- - *id002
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
39
54
  type: :development
40
- version_requirements: *id004
41
- - !ruby/object:Gem::Dependency
42
- name: yard
43
55
  prerelease: false
44
- requirement: &id005 !ruby/object:Gem::Requirement
45
- requirements:
46
- - *id002
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
47
70
  type: :development
48
- version_requirements: *id005
49
- - !ruby/object:Gem::Dependency
50
- name: rest-client
51
71
  prerelease: false
52
- requirement: &id006 !ruby/object:Gem::Requirement
53
- requirements:
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rest-client
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
54
83
  - - ~>
55
- - !ruby/object:Gem::Version
84
+ - !ruby/object:Gem::Version
56
85
  version: 1.6.7
57
86
  type: :runtime
58
- version_requirements: *id006
59
- - !ruby/object:Gem::Dependency
60
- name: nokogiri
61
87
  prerelease: false
62
- requirement: &id007 !ruby/object:Gem::Requirement
63
- requirements:
64
- - *id002
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.6.7
94
+ - !ruby/object:Gem::Dependency
95
+ name: nokogiri
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
65
102
  type: :runtime
66
- version_requirements: *id007
67
- description: Don't re-write web scrapers every time. Skrapojan gives you a scraper template that's easy to use for debugging and doesn't hammer servers by default
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Don't re-write web scrapers every time. Upton gives you a scraper template
111
+ that's easy to use for debugging and doesn't hammer servers by default
68
112
  email: jeremy.merrill@propublica.org
69
113
  executables: []
70
-
71
114
  extensions: []
72
-
73
115
  extra_rdoc_files: []
74
-
75
- files:
116
+ files:
76
117
  - lib/upton.rb
77
118
  - test/data/discussion.html
119
+ - test/data/easttimor.html
78
120
  - test/data/propublica.html
79
121
  - test/data/prosecutor.html
80
122
  - test/data/sixfacts.html
81
123
  - test/data/webinar.html
82
124
  - test/test_upton.rb
83
125
  homepage: http://github.org/propublica/upton
84
- licenses:
126
+ licenses:
85
127
  - MIT
86
- metadata: {}
87
-
88
128
  post_install_message:
89
129
  rdoc_options: []
90
-
91
- require_paths:
130
+ require_paths:
92
131
  - lib
93
- required_ruby_version: !ruby/object:Gem::Requirement
94
- requirements:
95
- - - ">="
96
- - !ruby/object:Gem::Version
132
+ required_ruby_version: !ruby/object:Gem::Requirement
133
+ none: false
134
+ requirements:
135
+ - - ! '>='
136
+ - !ruby/object:Gem::Version
97
137
  version: 1.8.7
98
- required_rubygems_version: !ruby/object:Gem::Requirement
99
- requirements:
100
- - *id002
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ none: false
140
+ requirements:
141
+ - - ! '>='
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
101
144
  requirements: []
102
-
103
145
  rubyforge_project:
104
- rubygems_version: 2.0.3
146
+ rubygems_version: 1.8.23
105
147
  signing_key:
106
- specification_version: 4
148
+ specification_version: 3
107
149
  summary: A simple web-scraping framework
108
- test_files:
150
+ test_files:
109
151
  - test/data/discussion.html
152
+ - test/data/easttimor.html
110
153
  - test/data/propublica.html
111
154
  - test/data/prosecutor.html
112
155
  - test/data/sixfacts.html
113
156
  - test/data/webinar.html
114
157
  - test/test_upton.rb
158
+ has_rdoc: true
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA512:
3
- data.tar.gz: 0c50b13aca2d3f11f8ebbef72420dfc3276169fb0c1eafadd437ae7d1a83f2d9467511ddc8e529321be677e26bcbd1af02c6c2c615d14e1ea09de0cc9b6d9762
4
- metadata.gz: e795b2a9ffe25373b419ccded0db1a7786f7b4789dbf8fc7da28cd53764232ddcad30f398b569e78ae5404690d96661577ea84340b4cf4a4e40798dbd82779e0
5
- SHA1:
6
- data.tar.gz: 13e674057073d5cd0fc1dff400450717a0613bc4
7
- metadata.gz: 1629ba3d63b260994bcebf99f1f5d79a499bb707