upton 0.3.1 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/upton.rb +12 -9
- data/lib/upton/downloader.rb +1 -1
- data/lib/upton/version.rb +1 -1
- data/spec/upton_spec.rb +4 -0
- metadata +4 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ff34d4a9c4a6356b1018dd6a754ceba17c42cf7d
|
4
|
+
data.tar.gz: d8f78bf65235f590795c9ad88ad6327ea096a7d4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf4fa0283c1902ddf7cdb73762580ce4cb7477d1232e3d71ea22d1262c68bb019c064b98a8fd3d3e3d1cd61cf891c018ad66f9123ff0e8f9350e96b3a90217ac
|
7
|
+
data.tar.gz: ee02d8805a78d7332aac844d7ce486083debab43443ac702eabccf4349018907ce8001d2c755c1cbf23ea6048f631d0d9f18728b47e09fddb5435bb2230c136e
|
data/lib/upton.rb
CHANGED
@@ -46,6 +46,7 @@ module Upton
|
|
46
46
|
##
|
47
47
|
def scrape(&blk)
|
48
48
|
self.url_array = self.get_index unless self.url_array
|
49
|
+
blk = Proc.new{|x| x} if blk.nil?
|
49
50
|
self.scrape_from_list(self.url_array, blk)
|
50
51
|
end
|
51
52
|
|
@@ -146,7 +147,7 @@ module Upton
|
|
146
147
|
#
|
147
148
|
##
|
148
149
|
def next_index_page_url(url, pagination_index)
|
149
|
-
return
|
150
|
+
return url unless @paginated
|
150
151
|
|
151
152
|
if pagination_index > @pagination_max_pages
|
152
153
|
puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
|
@@ -291,17 +292,19 @@ module Upton
|
|
291
292
|
# Returns the concatenated output of each member of a paginated index,
|
292
293
|
# e.g. a site listing links with 2+ pages.
|
293
294
|
##
|
294
|
-
def get_index_pages(
|
295
|
-
resps = [
|
296
|
-
prev_url =
|
297
|
-
while !resps.last.empty?
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
295
|
+
def get_index_pages(original_url, pagination_index, pagination_interval, options={})
|
296
|
+
resps = []
|
297
|
+
prev_url = nil
|
298
|
+
while resps.empty? || !resps.last.empty?
|
299
|
+
next_url = self.next_index_page_url(original_url, pagination_index)
|
300
|
+
break if next_url.empty?
|
301
|
+
|
302
|
+
next_url = resolve_url(next_url, original_url)
|
303
|
+
break if next_url == prev_url
|
302
304
|
|
303
305
|
next_resp = self.get_page(next_url, @index_debug, options).to_s
|
304
306
|
prev_url = next_url
|
307
|
+
pagination_index += pagination_interval
|
305
308
|
resps << next_resp
|
306
309
|
end
|
307
310
|
resps
|
data/lib/upton/downloader.rb
CHANGED
@@ -103,7 +103,7 @@ module Upton
|
|
103
103
|
msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
|
104
104
|
resp_html = Nokogiri::HTML(resp)
|
105
105
|
comment = Nokogiri::XML::Comment.new(resp_html, msg)
|
106
|
-
if resp_html.root.nil?
|
106
|
+
if resp_html.root.nil? || !resp_html.include?("<html")
|
107
107
|
return resp
|
108
108
|
elsif resp_html.root.children.empty?
|
109
109
|
resp_html.root.add_child(comment)
|
data/lib/upton/version.rb
CHANGED
data/spec/upton_spec.rb
CHANGED
@@ -139,6 +139,8 @@ describe Upton do
|
|
139
139
|
it "should scrape paginated pages" do
|
140
140
|
stub_request(:get, "www.example.com/propublica_search.html").
|
141
141
|
to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
|
142
|
+
stub_request(:get, "www.example.com/propublica_search.html?p=1").
|
143
|
+
to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
|
142
144
|
stub_request(:get, "www.example.com/propublica_search.html?p=2").
|
143
145
|
to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
|
144
146
|
stub_request(:get, "www.example.com/propublica_search.html?p=3").
|
@@ -197,6 +199,8 @@ describe Upton do
|
|
197
199
|
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
198
200
|
stub_request(:get, "www.example.com/propublica_search.html").
|
199
201
|
to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
|
202
|
+
stub_request(:get, "www.example.com/propublica_search.html?p=1").
|
203
|
+
to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
|
200
204
|
stub_request(:get, "www.example.com/propublica_search.html?p=2").
|
201
205
|
to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
|
202
206
|
stub_request(:get, "www.example.com/propublica_search.html?p=3").
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rack
|
@@ -66,20 +66,6 @@ dependencies:
|
|
66
66
|
- - '>='
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: nokogiri
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - '>='
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 1.5.1
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - '>='
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: 1.5.1
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: yard
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -114,28 +100,14 @@ dependencies:
|
|
114
100
|
requirements:
|
115
101
|
- - '>='
|
116
102
|
- !ruby/object:Gem::Version
|
117
|
-
version:
|
118
|
-
type: :runtime
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
requirements:
|
122
|
-
- - '>='
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '0'
|
125
|
-
- !ruby/object:Gem::Dependency
|
126
|
-
name: mechanize
|
127
|
-
requirement: !ruby/object:Gem::Requirement
|
128
|
-
requirements:
|
129
|
-
- - '>='
|
130
|
-
- !ruby/object:Gem::Version
|
131
|
-
version: '0'
|
103
|
+
version: 1.5.1
|
132
104
|
type: :runtime
|
133
105
|
prerelease: false
|
134
106
|
version_requirements: !ruby/object:Gem::Requirement
|
135
107
|
requirements:
|
136
108
|
- - '>='
|
137
109
|
- !ruby/object:Gem::Version
|
138
|
-
version:
|
110
|
+
version: 1.5.1
|
139
111
|
description: Don't re-write web scrapers every time. Upton gives you a scraper template
|
140
112
|
that's easy to use for debugging and doesn't hammer servers by default.
|
141
113
|
email: jeremybmerrill@jeremybmerrill.com
|