upton 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +15 -0
  2. data/lib/upton.rb +24 -15
  3. metadata +5 -19
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ Y2M0NjVlMWNlN2ZkNTEwMzIzYmQ2N2MxYjBjZjExZmQ1ZTYyZjAzNw==
5
+ data.tar.gz: !binary |-
6
+ NTg4NTc3NDQ3OWQ1OGRiY2Y0ZmFkYjI5OWU2NmRkNjVlNDNlYzRkZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NzY5YWNlMTIwNWM5YTNiM2YwMDkwNDNhMTlhNjM2ZTQyMWQ1YjRhMTUwNThm
10
+ YThkYzU4ZjI2ZWU0YzA1MjRjMDgxMmEwOGU1MGFlYjMyZDZjYTNjOGU3NDAy
11
+ OTQ0Yzc5NTNiNWU0MjE0YTZmYWZkZTQxZjQ1NjliZGEyOGYyZWI=
12
+ data.tar.gz: !binary |-
13
+ Zjg4ODk0ZGZjMDVkZGE5MGEzMTk1Mjg0MzQ1ODllZjUzZjc2MWFjOTU3YzAz
14
+ YzE5ZDAyZGZmNDdmZDU3ZWViZTEzNjNhZGNmMGZmMDgwY2YxYTgwZjkxNzFj
15
+ NTMwODU4ODg2Nzg2NGYzOGU2NmJhNDBmNmI1OWQwZDVkN2UzZmM=
data/lib/upton.rb CHANGED
@@ -11,8 +11,11 @@
11
11
  # site's search page or a newspaper's homepage.
12
12
  # 2. Instance pages, which represent the goal of your scraping, e.g.
13
13
  # job listings or news articles.
14
+ #
14
15
 
15
16
  require 'nokogiri'
17
+ require 'uri'
18
+ require 'restclient'
16
19
  require './lib/utils'
17
20
 
18
21
  module Upton
@@ -21,7 +24,7 @@ module Upton
21
24
  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
22
25
  class Scraper
23
26
 
24
- attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder, :url_array
27
+ attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
25
28
 
26
29
  # == Basic use-case methods.
27
30
 
@@ -36,16 +39,16 @@ module Upton
36
39
  self.scrape_from_list(self.url_array, blk)
37
40
  end
38
41
 
39
-
40
- # == Configuration Options
41
-
42
- # +index_url+: The URL of the page containing the list of instances.
42
+ # +index_url_or_array+: A list of string URLs, OR
43
+ # the URL of the page containing the list of instances.
43
44
  # +selector+: The XPath or CSS that specifies the anchor elements within
44
- # the page.
45
+ # the page, if a url is specified for the previous argument.
45
46
  # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
46
47
  #
47
48
  # These options are a shortcut. If you plan to override +get_index+, you
48
49
  # do not need to set them.
50
+ # If you don't specify a selector, the first argument will be treated as a
51
+ # list of URLs.
49
52
  def initialize(index_url_or_array, selector="", selector_method=:xpath)
50
53
 
51
54
  #if first arg is a valid URL, do already-written stuff;
@@ -55,7 +58,7 @@ module Upton
55
58
  #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
56
59
  if selector.empty?
57
60
  @url_array = index_url_or_array
58
- elsif index_url_or_array =~ URI::ABS_URI
61
+ elsif index_url_or_array =~ ::URI::ABS_URI
59
62
  @index_url = index_url_or_array
60
63
  @index_selector = selector
61
64
  @index_selector_method = selector_method
@@ -79,7 +82,7 @@ module Upton
79
82
 
80
83
  # In order to not hammer servers, Upton waits for, by default, 30
81
84
  # seconds between requests to the remote server.
82
- @nice_sleep_time = 30 #seconds
85
+ @sleep_time_between_requests = 30 #seconds
83
86
 
84
87
  # Folder name for stashes, if you want them to be stored somewhere else,
85
88
  # e.g. under /tmp.
@@ -90,13 +93,15 @@ module Upton
90
93
  end
91
94
 
92
95
 
96
+ # == Configuration Options
93
97
 
94
98
  # If instance pages are paginated, <b>you must override</b>
95
99
  # this method to return the next URL, given the current URL and its index.
96
100
  #
97
101
  # If instance pages aren't paginated, there's no need to override this.
98
102
  #
99
- # Return URLs that are empty strings are ignored (and recursion stops.)
103
+ # Recursion stops if fetching the URL returns an empty string or an error.
104
+ #
100
105
  # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
101
106
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
102
107
  def next_instance_page_url(url, index)
@@ -108,7 +113,8 @@ module Upton
108
113
  #
109
114
  # If index pages aren't paginated, there's no need to override this.
110
115
  #
111
- # Return URLs that are empty strings are ignored (and recursion stops.)
116
+ # Recursion stops if fetching the URL returns an empty string or an error.
117
+ #
112
118
  # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
113
119
  # ought to return "http://whatever.com/articles?page=2"
114
120
  def next_index_page_url(url, index)
@@ -121,7 +127,7 @@ module Upton
121
127
  self.url_array = self.get_index
122
128
  end
123
129
  CSV.open filename, 'wb' do |csv|
124
- self.scrape_from_list(self.url_array, blk).each{|document| document.each{|line| csv << line }}
130
+ self.scrape_from_list(self.url_array, blk).each{|document| csv << document }
125
131
  end
126
132
  end
127
133
 
@@ -139,7 +145,7 @@ module Upton
139
145
  else
140
146
  begin
141
147
  puts "getting " + url if @verbose
142
- sleep @nice_sleep_time
148
+ sleep @sleep_time_between_requests
143
149
  resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
144
150
 
145
151
  #this is silly, but rest-client needs to get on their game.
@@ -152,7 +158,6 @@ module Upton
152
158
  elsif content_type.split('/').first == 'text'
153
159
  'iso-8859-1'
154
160
  end
155
- puts charset
156
161
  resp.force_encoding(charset) if charset
157
162
  end
158
163
 
@@ -201,8 +206,12 @@ module Upton
201
206
  resp
202
207
  end
203
208
 
204
- # Returns the concatenated output of each member of a paginated instance,
205
- # e.g. a news article with 2 pages.
209
+ # Returns the article at `url`.
210
+ #
211
+ # If the page is stashed, returns that, otherwise, fetches it from the web.
212
+ #
213
+ # If an instance is paginated, returns the concatenated output of each
214
+ # page, e.g. if a news article has two pages.
206
215
  def get_instance(url, index=0)
207
216
  resp = self.get_page(url, @debug)
208
217
  if !resp.empty?
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
5
- prerelease:
4
+ version: 0.2.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Jeremy B. Merrill
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-06-25 00:00:00.000000000 Z
11
+ date: 2013-07-17 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rack
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: thin
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: nokogiri
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: yard
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -78,7 +69,6 @@ dependencies:
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: rest-client
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
73
  - - ~>
84
74
  - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
86
76
  type: :runtime
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
80
  - - ~>
92
81
  - !ruby/object:Gem::Version
@@ -94,7 +83,6 @@ dependencies:
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: nokogiri
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
87
  - - ! '>='
100
88
  - !ruby/object:Gem::Version
@@ -102,7 +90,6 @@ dependencies:
102
90
  type: :runtime
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
94
  - - ! '>='
108
95
  - !ruby/object:Gem::Version
@@ -125,27 +112,26 @@ files:
125
112
  homepage: http://github.org/propublica/upton
126
113
  licenses:
127
114
  - MIT
115
+ metadata: {}
128
116
  post_install_message:
129
117
  rdoc_options: []
130
118
  require_paths:
131
119
  - lib
132
120
  required_ruby_version: !ruby/object:Gem::Requirement
133
- none: false
134
121
  requirements:
135
122
  - - ! '>='
136
123
  - !ruby/object:Gem::Version
137
124
  version: 1.8.7
138
125
  required_rubygems_version: !ruby/object:Gem::Requirement
139
- none: false
140
126
  requirements:
141
127
  - - ! '>='
142
128
  - !ruby/object:Gem::Version
143
129
  version: '0'
144
130
  requirements: []
145
131
  rubyforge_project:
146
- rubygems_version: 1.8.23
132
+ rubygems_version: 2.0.5
147
133
  signing_key:
148
- specification_version: 3
134
+ specification_version: 4
149
135
  summary: A simple web-scraping framework
150
136
  test_files:
151
137
  - test/data/discussion.html