upton 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +15 -0
  2. data/lib/upton.rb +24 -15
  3. metadata +5 -19
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ Y2M0NjVlMWNlN2ZkNTEwMzIzYmQ2N2MxYjBjZjExZmQ1ZTYyZjAzNw==
5
+ data.tar.gz: !binary |-
6
+ NTg4NTc3NDQ3OWQ1OGRiY2Y0ZmFkYjI5OWU2NmRkNjVlNDNlYzRkZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NzY5YWNlMTIwNWM5YTNiM2YwMDkwNDNhMTlhNjM2ZTQyMWQ1YjRhMTUwNThm
10
+ YThkYzU4ZjI2ZWU0YzA1MjRjMDgxMmEwOGU1MGFlYjMyZDZjYTNjOGU3NDAy
11
+ OTQ0Yzc5NTNiNWU0MjE0YTZmYWZkZTQxZjQ1NjliZGEyOGYyZWI=
12
+ data.tar.gz: !binary |-
13
+ Zjg4ODk0ZGZjMDVkZGE5MGEzMTk1Mjg0MzQ1ODllZjUzZjc2MWFjOTU3YzAz
14
+ YzE5ZDAyZGZmNDdmZDU3ZWViZTEzNjNhZGNmMGZmMDgwY2YxYTgwZjkxNzFj
15
+ NTMwODU4ODg2Nzg2NGYzOGU2NmJhNDBmNmI1OWQwZDVkN2UzZmM=
data/lib/upton.rb CHANGED
@@ -11,8 +11,11 @@
11
11
  # site's search page or a newspaper's homepage.
12
12
  # 2. Instance pages, which represent the goal of your scraping, e.g.
13
13
  # job listings or news articles.
14
+ #
14
15
 
15
16
  require 'nokogiri'
17
+ require 'uri'
18
+ require 'restclient'
16
19
  require './lib/utils'
17
20
 
18
21
  module Upton
@@ -21,7 +24,7 @@ module Upton
21
24
  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
22
25
  class Scraper
23
26
 
24
- attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder, :url_array
27
+ attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
25
28
 
26
29
  # == Basic use-case methods.
27
30
 
@@ -36,16 +39,16 @@ module Upton
36
39
  self.scrape_from_list(self.url_array, blk)
37
40
  end
38
41
 
39
-
40
- # == Configuration Options
41
-
42
- # +index_url+: The URL of the page containing the list of instances.
42
+ # +index_url_or_array+: A list of string URLs, OR
43
+ # the URL of the page containing the list of instances.
43
44
  # +selector+: The XPath or CSS that specifies the anchor elements within
44
- # the page.
45
+ # the page, if a url is specified for the previous argument.
45
46
  # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
46
47
  #
47
48
  # These options are a shortcut. If you plan to override +get_index+, you
48
49
  # do not need to set them.
50
+ # If you don't specify a selector, the first argument will be treated as a
51
+ # list of URLs.
49
52
  def initialize(index_url_or_array, selector="", selector_method=:xpath)
50
53
 
51
54
  #if first arg is a valid URL, do already-written stuff;
@@ -55,7 +58,7 @@ module Upton
55
58
  #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensible division of how these arguments work)
56
59
  if selector.empty?
57
60
  @url_array = index_url_or_array
58
- elsif index_url_or_array =~ URI::ABS_URI
61
+ elsif index_url_or_array =~ ::URI::ABS_URI
59
62
  @index_url = index_url_or_array
60
63
  @index_selector = selector
61
64
  @index_selector_method = selector_method
@@ -79,7 +82,7 @@ module Upton
79
82
 
80
83
  # In order to not hammer servers, Upton waits for, by default, 30
81
84
  # seconds between requests to the remote server.
82
- @nice_sleep_time = 30 #seconds
85
+ @sleep_time_between_requests = 30 #seconds
83
86
 
84
87
  # Folder name for stashes, if you want them to be stored somewhere else,
85
88
  # e.g. under /tmp.
@@ -90,13 +93,15 @@ module Upton
90
93
  end
91
94
 
92
95
 
96
+ # == Configuration Options
93
97
 
94
98
  # If instance pages are paginated, <b>you must override</b>
95
99
  # this method to return the next URL, given the current URL and its index.
96
100
  #
97
101
  # If instance pages aren't paginated, there's no need to override this.
98
102
  #
99
- # Return URLs that are empty strings are ignored (and recursion stops.)
103
+ # Recursion stops if fetching the URL returns an empty string or an error.
104
+ #
100
105
  # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
101
106
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
102
107
  def next_instance_page_url(url, index)
@@ -108,7 +113,8 @@ module Upton
108
113
  #
109
114
  # If index pages aren't paginated, there's no need to override this.
110
115
  #
111
- # Return URLs that are empty strings are ignored (and recursion stops.)
116
+ # Recursion stops if fetching the URL returns an empty string or an error.
117
+ #
112
118
  # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
113
119
  # ought to return "http://whatever.com/articles?page=2"
114
120
  def next_index_page_url(url, index)
@@ -121,7 +127,7 @@ module Upton
121
127
  self.url_array = self.get_index
122
128
  end
123
129
  CSV.open filename, 'wb' do |csv|
124
- self.scrape_from_list(self.url_array, blk).each{|document| document.each{|line| csv << line }}
130
+ self.scrape_from_list(self.url_array, blk).each{|document| csv << document }
125
131
  end
126
132
  end
127
133
 
@@ -139,7 +145,7 @@ module Upton
139
145
  else
140
146
  begin
141
147
  puts "getting " + url if @verbose
142
- sleep @nice_sleep_time
148
+ sleep @sleep_time_between_requests
143
149
  resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
144
150
 
145
151
  #this is silly, but rest-client needs to get on their game.
@@ -152,7 +158,6 @@ module Upton
152
158
  elsif content_type.split('/').first == 'text'
153
159
  'iso-8859-1'
154
160
  end
155
- puts charset
156
161
  resp.force_encoding(charset) if charset
157
162
  end
158
163
 
@@ -201,8 +206,12 @@ module Upton
201
206
  resp
202
207
  end
203
208
 
204
- # Returns the concatenated output of each member of a paginated instance,
205
- # e.g. a news article with 2 pages.
209
+ # Returns the article at `url`.
210
+ #
211
+ # If the page is stashed, returns that; otherwise, fetches it from the web.
212
+ #
213
+ # If an instance is paginated, returns the concatenated output of each
214
+ # page, e.g. if a news article has two pages.
206
215
  def get_instance(url, index=0)
207
216
  resp = self.get_page(url, @debug)
208
217
  if !resp.empty?
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
5
- prerelease:
4
+ version: 0.2.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Jeremy B. Merrill
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-06-25 00:00:00.000000000 Z
11
+ date: 2013-07-17 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rack
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: thin
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: nokogiri
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: yard
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -78,7 +69,6 @@ dependencies:
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: rest-client
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
73
  - - ~>
84
74
  - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
86
76
  type: :runtime
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
80
  - - ~>
92
81
  - !ruby/object:Gem::Version
@@ -94,7 +83,6 @@ dependencies:
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: nokogiri
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
87
  - - ! '>='
100
88
  - !ruby/object:Gem::Version
@@ -102,7 +90,6 @@ dependencies:
102
90
  type: :runtime
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
94
  - - ! '>='
108
95
  - !ruby/object:Gem::Version
@@ -125,27 +112,26 @@ files:
125
112
  homepage: http://github.org/propublica/upton
126
113
  licenses:
127
114
  - MIT
115
+ metadata: {}
128
116
  post_install_message:
129
117
  rdoc_options: []
130
118
  require_paths:
131
119
  - lib
132
120
  required_ruby_version: !ruby/object:Gem::Requirement
133
- none: false
134
121
  requirements:
135
122
  - - ! '>='
136
123
  - !ruby/object:Gem::Version
137
124
  version: 1.8.7
138
125
  required_rubygems_version: !ruby/object:Gem::Requirement
139
- none: false
140
126
  requirements:
141
127
  - - ! '>='
142
128
  - !ruby/object:Gem::Version
143
129
  version: '0'
144
130
  requirements: []
145
131
  rubyforge_project:
146
- rubygems_version: 1.8.23
132
+ rubygems_version: 2.0.5
147
133
  signing_key:
148
- specification_version: 3
134
+ specification_version: 4
149
135
  summary: A simple web-scraping framework
150
136
  test_files:
151
137
  - test/data/discussion.html