upton 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 892592f6c890ecd94fb1bdf3b8cc500e813ebfa3
4
- data.tar.gz: 95d10ea4c37aaec611c76dc98c45dd449a1ac35d
3
+ metadata.gz: ff34d4a9c4a6356b1018dd6a754ceba17c42cf7d
4
+ data.tar.gz: d8f78bf65235f590795c9ad88ad6327ea096a7d4
5
5
  SHA512:
6
- metadata.gz: f112a48ed90264ac5e111e48b45e6b67468793059f613385faa87bd6ab5122a7f11c532358daf7382c689e42b66db022bfce24ffd0b32ffe15619de0a026df77
7
- data.tar.gz: 8fcbd1276ea6e284481d0395de5a1f73c07a2acb159edab6775c875d3dbf76e2b3c16c6b46b2aa48aec345fbd4950a5af179a6e560a1af5923e5087a8c6a648b
6
+ metadata.gz: bf4fa0283c1902ddf7cdb73762580ce4cb7477d1232e3d71ea22d1262c68bb019c064b98a8fd3d3e3d1cd61cf891c018ad66f9123ff0e8f9350e96b3a90217ac
7
+ data.tar.gz: ee02d8805a78d7332aac844d7ce486083debab43443ac702eabccf4349018907ce8001d2c755c1cbf23ea6048f631d0d9f18728b47e09fddb5435bb2230c136e
@@ -46,6 +46,7 @@ module Upton
46
46
  ##
47
47
  def scrape(&blk)
48
48
  self.url_array = self.get_index unless self.url_array
49
+ blk = Proc.new{|x| x} if blk.nil?
49
50
  self.scrape_from_list(self.url_array, blk)
50
51
  end
51
52
 
@@ -146,7 +147,7 @@ module Upton
146
147
  #
147
148
  ##
148
149
  def next_index_page_url(url, pagination_index)
149
- return EMPTY_STRING unless @paginated
150
+ return url unless @paginated
150
151
 
151
152
  if pagination_index > @pagination_max_pages
152
153
  puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
@@ -291,17 +292,19 @@ module Upton
291
292
  # Returns the concatenated output of each member of a paginated index,
292
293
  # e.g. a site listing links with 2+ pages.
293
294
  ##
294
- def get_index_pages(url, pagination_index, pagination_interval, options={})
295
- resps = [self.get_page(url, @index_debug, options)]
296
- prev_url = url
297
- while !resps.last.empty?
298
- pagination_index += pagination_interval
299
- next_url = self.next_index_page_url(url, pagination_index)
300
- next_url = resolve_url(next_url, url)
301
- break if next_url == prev_url || next_url.empty?
295
+ def get_index_pages(original_url, pagination_index, pagination_interval, options={})
296
+ resps = []
297
+ prev_url = nil
298
+ while resps.empty? || !resps.last.empty?
299
+ next_url = self.next_index_page_url(original_url, pagination_index)
300
+ break if next_url.empty?
301
+
302
+ next_url = resolve_url(next_url, original_url)
303
+ break if next_url == prev_url
302
304
 
303
305
  next_resp = self.get_page(next_url, @index_debug, options).to_s
304
306
  prev_url = next_url
307
+ pagination_index += pagination_interval
305
308
  resps << next_resp
306
309
  end
307
310
  resps
@@ -103,7 +103,7 @@ module Upton
103
103
  msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
104
104
  resp_html = Nokogiri::HTML(resp)
105
105
  comment = Nokogiri::XML::Comment.new(resp_html, msg)
106
- if resp_html.root.nil?
106
+ if resp_html.root.nil? || !resp_html.include?("<html")
107
107
  return resp
108
108
  elsif resp_html.root.children.empty?
109
109
  resp_html.root.add_child(comment)
@@ -1,3 +1,3 @@
1
1
  module Upton # :nodoc:
2
- VERSION = '0.3.1'
2
+ VERSION = '0.3.3'
3
3
  end
@@ -139,6 +139,8 @@ describe Upton do
139
139
  it "should scrape paginated pages" do
140
140
  stub_request(:get, "www.example.com/propublica_search.html").
141
141
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
142
+ stub_request(:get, "www.example.com/propublica_search.html?p=1").
143
+ to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
142
144
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
143
145
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
144
146
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -197,6 +199,8 @@ describe Upton do
197
199
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
198
200
  stub_request(:get, "www.example.com/propublica_search.html").
199
201
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
202
+ stub_request(:get, "www.example.com/propublica_search.html?p=1").
203
+ to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
200
204
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
201
205
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
202
206
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-16 00:00:00.000000000 Z
11
+ date: 2014-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - '>='
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: nokogiri
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - '>='
74
- - !ruby/object:Gem::Version
75
- version: 1.5.1
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - '>='
81
- - !ruby/object:Gem::Version
82
- version: 1.5.1
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: yard
85
71
  requirement: !ruby/object:Gem::Requirement
@@ -114,28 +100,14 @@ dependencies:
114
100
  requirements:
115
101
  - - '>='
116
102
  - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :runtime
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - '>='
123
- - !ruby/object:Gem::Version
124
- version: '0'
125
- - !ruby/object:Gem::Dependency
126
- name: mechanize
127
- requirement: !ruby/object:Gem::Requirement
128
- requirements:
129
- - - '>='
130
- - !ruby/object:Gem::Version
131
- version: '0'
103
+ version: 1.5.1
132
104
  type: :runtime
133
105
  prerelease: false
134
106
  version_requirements: !ruby/object:Gem::Requirement
135
107
  requirements:
136
108
  - - '>='
137
109
  - !ruby/object:Gem::Version
138
- version: '0'
110
+ version: 1.5.1
139
111
  description: Don't re-write web scrapers every time. Upton gives you a scraper template
140
112
  that's easy to use for debugging and doesn't hammer servers by default.
141
113
  email: jeremybmerrill@jeremybmerrill.com