upton 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/test_upton.rb CHANGED
@@ -1,9 +1,11 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'test/unit'
2
4
  require 'rack'
3
5
  require 'thin'
4
6
  require 'nokogiri'
5
7
  require 'restclient'
6
- require 'upton'
8
+ require './lib/upton'
7
9
  require 'fileutils'
8
10
 
9
11
  module Upton
@@ -34,22 +36,79 @@ module Upton
34
36
  start_test_server()
35
37
 
36
38
  headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
37
- "Discussion: Military Lending and Debt",
39
+ "",
38
40
  "A Prosecutor, a Wrongful Conviction and a Question of Justice",
39
41
  "Six Facts Lost in the IRS Scandal"]
40
42
 
41
43
  propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
42
- propubscraper.verbose = false
43
- propubscraper.debug = false
44
+ propubscraper.debug = true
45
+ propubscraper.verbose = true
44
46
 
45
- propubscraper.scrape do |article_str|
47
+ heds = propubscraper.scrape do |article_str|
46
48
  doc = Nokogiri::HTML(article_str)
47
49
  hed = doc.css('h1.article-title').text
48
- assert_equal(hed, headlines.shift)
49
50
  end
51
+ assert_equal(heds, headlines)
50
52
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
51
53
  end
52
54
 
55
+ def test_encodings
56
+ skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
57
+ end
58
+
59
+ def test_stashing
60
+ skip "should test stashing, make sure we never send too many requests"
61
+ end
62
+
63
+ def test_scrape_list
64
+ #this doesn't test stashing.
65
+ #TODO: needs a website that has links to a multi-page list (or table)
66
+ start_test_server()
67
+
68
+ most_commented_heds = [["Six Facts Lost in the IRS Scandal",
69
+ "How the IRS’s Nonprofit Division Got So Dysfunctional",
70
+ "Sound, Fury and the IRS Mess",
71
+ "The Most Important #Muckreads on Rape in the Military",
72
+ "Congressmen to Hagel: Where Are the Missing War Records?",
73
+ "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
74
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
75
+ "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
76
+ "The Story Behind Our Hospital Interactive",
77
+ "irs-test-charts-for-embedding"]]
78
+
79
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
80
+ propubscraper.debug = true
81
+ propubscraper.verbose = true
82
+ list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
83
+
84
+ assert_equal(list, most_commented_heds)
85
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
86
+ end
87
+
88
+ def test_scrape_table
89
+ #this doesn't test stashing.
90
+ start_test_server()
91
+
92
+ east_timor_prime_ministers = [[
93
+ ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
94
+ "1", "2", "3", "4",],
95
+ [],
96
+ ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
97
+ ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
98
+ ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
99
+ ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
100
+ ]]
101
+
102
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
103
+ propubscraper.debug = true
104
+ propubscraper.verbose = true
105
+ table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
106
+ assert_equal(table, east_timor_prime_ministers)
107
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
108
+ end
109
+
110
+
111
+
53
112
  private
54
113
  def start_test_server
55
114
  @server_thread = Thread.new do
@@ -67,12 +126,12 @@ module Upton
67
126
  @root = File.expand_path(File.dirname(__FILE__))
68
127
  path = Rack::Utils.unescape(env['PATH_INFO'])
69
128
  path += 'index.html' if path == '/'
70
- file = @root + "#{path}"
129
+ file = File.join(@root, "data", path)
71
130
 
72
131
  params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
73
132
 
74
133
  if File.exists?(file)
75
- [ 200, {"Content-Type" => "text/html"}, File.read(file) ]
134
+ [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
76
135
  else
77
136
  [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
78
137
  end
metadata CHANGED
@@ -1,114 +1,158 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: upton
3
- version: !ruby/object:Gem::Version
4
- version: 0.1.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ prerelease:
5
6
  platform: ruby
6
- authors:
7
+ authors:
7
8
  - Jeremy B. Merrill
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
-
12
- date: 2013-05-29 00:00:00 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
12
+ date: 2013-06-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
15
  name: rack
16
- prerelease: false
17
- requirement: &id001 !ruby/object:Gem::Requirement
18
- requirements:
19
- - &id002
20
- - ">="
21
- - !ruby/object:Gem::Version
22
- version: "0"
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
23
22
  type: :development
24
- version_requirements: *id001
25
- - !ruby/object:Gem::Dependency
26
- name: thin
27
23
  prerelease: false
28
- requirement: &id003 !ruby/object:Gem::Requirement
29
- requirements:
30
- - *id002
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: thin
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
31
38
  type: :development
32
- version_requirements: *id003
33
- - !ruby/object:Gem::Dependency
34
- name: nokogiri
35
39
  prerelease: false
36
- requirement: &id004 !ruby/object:Gem::Requirement
37
- requirements:
38
- - *id002
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
39
54
  type: :development
40
- version_requirements: *id004
41
- - !ruby/object:Gem::Dependency
42
- name: yard
43
55
  prerelease: false
44
- requirement: &id005 !ruby/object:Gem::Requirement
45
- requirements:
46
- - *id002
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
47
70
  type: :development
48
- version_requirements: *id005
49
- - !ruby/object:Gem::Dependency
50
- name: rest-client
51
71
  prerelease: false
52
- requirement: &id006 !ruby/object:Gem::Requirement
53
- requirements:
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rest-client
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
54
83
  - - ~>
55
- - !ruby/object:Gem::Version
84
+ - !ruby/object:Gem::Version
56
85
  version: 1.6.7
57
86
  type: :runtime
58
- version_requirements: *id006
59
- - !ruby/object:Gem::Dependency
60
- name: nokogiri
61
87
  prerelease: false
62
- requirement: &id007 !ruby/object:Gem::Requirement
63
- requirements:
64
- - *id002
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.6.7
94
+ - !ruby/object:Gem::Dependency
95
+ name: nokogiri
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
65
102
  type: :runtime
66
- version_requirements: *id007
67
- description: Don't re-write web scrapers every time. Skrapojan gives you a scraper template that's easy to use for debugging and doesn't hammer servers by default
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Don't re-write web scrapers every time. Upton gives you a scraper template
111
+ that's easy to use for debugging and doesn't hammer servers by default
68
112
  email: jeremy.merrill@propublica.org
69
113
  executables: []
70
-
71
114
  extensions: []
72
-
73
115
  extra_rdoc_files: []
74
-
75
- files:
116
+ files:
76
117
  - lib/upton.rb
77
118
  - test/data/discussion.html
119
+ - test/data/easttimor.html
78
120
  - test/data/propublica.html
79
121
  - test/data/prosecutor.html
80
122
  - test/data/sixfacts.html
81
123
  - test/data/webinar.html
82
124
  - test/test_upton.rb
83
125
  homepage: http://github.org/propublica/upton
84
- licenses:
126
+ licenses:
85
127
  - MIT
86
- metadata: {}
87
-
88
128
  post_install_message:
89
129
  rdoc_options: []
90
-
91
- require_paths:
130
+ require_paths:
92
131
  - lib
93
- required_ruby_version: !ruby/object:Gem::Requirement
94
- requirements:
95
- - - ">="
96
- - !ruby/object:Gem::Version
132
+ required_ruby_version: !ruby/object:Gem::Requirement
133
+ none: false
134
+ requirements:
135
+ - - ! '>='
136
+ - !ruby/object:Gem::Version
97
137
  version: 1.8.7
98
- required_rubygems_version: !ruby/object:Gem::Requirement
99
- requirements:
100
- - *id002
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ none: false
140
+ requirements:
141
+ - - ! '>='
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
101
144
  requirements: []
102
-
103
145
  rubyforge_project:
104
- rubygems_version: 2.0.3
146
+ rubygems_version: 1.8.23
105
147
  signing_key:
106
- specification_version: 4
148
+ specification_version: 3
107
149
  summary: A simple web-scraping framework
108
- test_files:
150
+ test_files:
109
151
  - test/data/discussion.html
152
+ - test/data/easttimor.html
110
153
  - test/data/propublica.html
111
154
  - test/data/prosecutor.html
112
155
  - test/data/sixfacts.html
113
156
  - test/data/webinar.html
114
157
  - test/test_upton.rb
158
+ has_rdoc: true
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA512:
3
- data.tar.gz: 0c50b13aca2d3f11f8ebbef72420dfc3276169fb0c1eafadd437ae7d1a83f2d9467511ddc8e529321be677e26bcbd1af02c6c2c615d14e1ea09de0cc9b6d9762
4
- metadata.gz: e795b2a9ffe25373b419ccded0db1a7786f7b4789dbf8fc7da28cd53764232ddcad30f398b569e78ae5404690d96661577ea84340b4cf4a4e40798dbd82779e0
5
- SHA1:
6
- data.tar.gz: 13e674057073d5cd0fc1dff400450717a0613bc4
7
- metadata.gz: 1629ba3d63b260994bcebf99f1f5d79a499bb707