upton 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 892592f6c890ecd94fb1bdf3b8cc500e813ebfa3
4
- data.tar.gz: 95d10ea4c37aaec611c76dc98c45dd449a1ac35d
3
+ metadata.gz: ff34d4a9c4a6356b1018dd6a754ceba17c42cf7d
4
+ data.tar.gz: d8f78bf65235f590795c9ad88ad6327ea096a7d4
5
5
  SHA512:
6
- metadata.gz: f112a48ed90264ac5e111e48b45e6b67468793059f613385faa87bd6ab5122a7f11c532358daf7382c689e42b66db022bfce24ffd0b32ffe15619de0a026df77
7
- data.tar.gz: 8fcbd1276ea6e284481d0395de5a1f73c07a2acb159edab6775c875d3dbf76e2b3c16c6b46b2aa48aec345fbd4950a5af179a6e560a1af5923e5087a8c6a648b
6
+ metadata.gz: bf4fa0283c1902ddf7cdb73762580ce4cb7477d1232e3d71ea22d1262c68bb019c064b98a8fd3d3e3d1cd61cf891c018ad66f9123ff0e8f9350e96b3a90217ac
7
+ data.tar.gz: ee02d8805a78d7332aac844d7ce486083debab43443ac702eabccf4349018907ce8001d2c755c1cbf23ea6048f631d0d9f18728b47e09fddb5435bb2230c136e
@@ -46,6 +46,7 @@ module Upton
46
46
  ##
47
47
  def scrape(&blk)
48
48
  self.url_array = self.get_index unless self.url_array
49
+ blk = Proc.new{|x| x} if blk.nil?
49
50
  self.scrape_from_list(self.url_array, blk)
50
51
  end
51
52
 
@@ -146,7 +147,7 @@ module Upton
146
147
  #
147
148
  ##
148
149
  def next_index_page_url(url, pagination_index)
149
- return EMPTY_STRING unless @paginated
150
+ return url unless @paginated
150
151
 
151
152
  if pagination_index > @pagination_max_pages
152
153
  puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
@@ -291,17 +292,19 @@ module Upton
291
292
  # Returns the concatenated output of each member of a paginated index,
292
293
  # e.g. a site listing links with 2+ pages.
293
294
  ##
294
- def get_index_pages(url, pagination_index, pagination_interval, options={})
295
- resps = [self.get_page(url, @index_debug, options)]
296
- prev_url = url
297
- while !resps.last.empty?
298
- pagination_index += pagination_interval
299
- next_url = self.next_index_page_url(url, pagination_index)
300
- next_url = resolve_url(next_url, url)
301
- break if next_url == prev_url || next_url.empty?
295
+ def get_index_pages(original_url, pagination_index, pagination_interval, options={})
296
+ resps = []
297
+ prev_url = nil
298
+ while resps.empty? || !resps.last.empty?
299
+ next_url = self.next_index_page_url(original_url, pagination_index)
300
+ break if next_url.empty?
301
+
302
+ next_url = resolve_url(next_url, original_url)
303
+ break if next_url == prev_url
302
304
 
303
305
  next_resp = self.get_page(next_url, @index_debug, options).to_s
304
306
  prev_url = next_url
307
+ pagination_index += pagination_interval
305
308
  resps << next_resp
306
309
  end
307
310
  resps
@@ -103,7 +103,7 @@ module Upton
103
103
  msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
104
104
  resp_html = Nokogiri::HTML(resp)
105
105
  comment = Nokogiri::XML::Comment.new(resp_html, msg)
106
- if resp_html.root.nil?
106
+ if resp_html.root.nil? || !resp_html.include?("<html")
107
107
  return resp
108
108
  elsif resp_html.root.children.empty?
109
109
  resp_html.root.add_child(comment)
@@ -1,3 +1,3 @@
1
1
  module Upton # :nodoc:
2
- VERSION = '0.3.1'
2
+ VERSION = '0.3.3'
3
3
  end
@@ -139,6 +139,8 @@ describe Upton do
139
139
  it "should scrape paginated pages" do
140
140
  stub_request(:get, "www.example.com/propublica_search.html").
141
141
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
142
+ stub_request(:get, "www.example.com/propublica_search.html?p=1").
143
+ to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
142
144
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
143
145
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
144
146
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -197,6 +199,8 @@ describe Upton do
197
199
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
198
200
  stub_request(:get, "www.example.com/propublica_search.html").
199
201
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
202
+ stub_request(:get, "www.example.com/propublica_search.html?p=1").
203
+ to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
200
204
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
201
205
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
202
206
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-16 00:00:00.000000000 Z
11
+ date: 2014-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - '>='
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: nokogiri
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - '>='
74
- - !ruby/object:Gem::Version
75
- version: 1.5.1
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - '>='
81
- - !ruby/object:Gem::Version
82
- version: 1.5.1
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: yard
85
71
  requirement: !ruby/object:Gem::Requirement
@@ -114,28 +100,14 @@ dependencies:
114
100
  requirements:
115
101
  - - '>='
116
102
  - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :runtime
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - '>='
123
- - !ruby/object:Gem::Version
124
- version: '0'
125
- - !ruby/object:Gem::Dependency
126
- name: mechanize
127
- requirement: !ruby/object:Gem::Requirement
128
- requirements:
129
- - - '>='
130
- - !ruby/object:Gem::Version
131
- version: '0'
103
+ version: 1.5.1
132
104
  type: :runtime
133
105
  prerelease: false
134
106
  version_requirements: !ruby/object:Gem::Requirement
135
107
  requirements:
136
108
  - - '>='
137
109
  - !ruby/object:Gem::Version
138
- version: '0'
110
+ version: 1.5.1
139
111
  description: Don't re-write web scrapers every time. Upton gives you a scraper template
140
112
  that's easy to use for debugging and doesn't hammer servers by default.
141
113
  email: jeremybmerrill@jeremybmerrill.com