upton 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/upton.rb +24 -15
- metadata +5 -19
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
Y2M0NjVlMWNlN2ZkNTEwMzIzYmQ2N2MxYjBjZjExZmQ1ZTYyZjAzNw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NTg4NTc3NDQ3OWQ1OGRiY2Y0ZmFkYjI5OWU2NmRkNjVlNDNlYzRkZQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NzY5YWNlMTIwNWM5YTNiM2YwMDkwNDNhMTlhNjM2ZTQyMWQ1YjRhMTUwNThm
|
10
|
+
YThkYzU4ZjI2ZWU0YzA1MjRjMDgxMmEwOGU1MGFlYjMyZDZjYTNjOGU3NDAy
|
11
|
+
OTQ0Yzc5NTNiNWU0MjE0YTZmYWZkZTQxZjQ1NjliZGEyOGYyZWI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
Zjg4ODk0ZGZjMDVkZGE5MGEzMTk1Mjg0MzQ1ODllZjUzZjc2MWFjOTU3YzAz
|
14
|
+
YzE5ZDAyZGZmNDdmZDU3ZWViZTEzNjNhZGNmMGZmMDgwY2YxYTgwZjkxNzFj
|
15
|
+
NTMwODU4ODg2Nzg2NGYzOGU2NmJhNDBmNmI1OWQwZDVkN2UzZmM=
|
data/lib/upton.rb
CHANGED
@@ -11,8 +11,11 @@
|
|
11
11
|
# site's search page or a newspaper's homepage.
|
12
12
|
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
13
13
|
# job listings or news articles.
|
14
|
+
#
|
14
15
|
|
15
16
|
require 'nokogiri'
|
17
|
+
require 'uri'
|
18
|
+
require 'restclient'
|
16
19
|
require './lib/utils'
|
17
20
|
|
18
21
|
module Upton
|
@@ -21,7 +24,7 @@ module Upton
|
|
21
24
|
# in more complicated cases; e.g. +MyScraper < Upton::Scraper+
|
22
25
|
class Scraper
|
23
26
|
|
24
|
-
attr_accessor :verbose, :debug, :
|
27
|
+
attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
|
25
28
|
|
26
29
|
# == Basic use-case methods.
|
27
30
|
|
@@ -36,16 +39,16 @@ module Upton
|
|
36
39
|
self.scrape_from_list(self.url_array, blk)
|
37
40
|
end
|
38
41
|
|
39
|
-
|
40
|
-
#
|
41
|
-
|
42
|
-
# +index_url+: The URL of the page containing the list of instances.
|
42
|
+
# +index_url_or_array+: A list of string URLs, OR
|
43
|
+
# the URL of the page containing the list of instances.
|
43
44
|
# +selector+: The XPath or CSS that specifies the anchor elements within
|
44
|
-
# the
|
45
|
+
# the page, if a url is specified for the previous argument.
|
45
46
|
# +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
|
46
47
|
#
|
47
48
|
# These options are a shortcut. If you plan to override +get_index+, you
|
48
49
|
# do not need to set them.
|
50
|
+
# If you don't specify a selector, the first argument will be treated as a
|
51
|
+
# list of URLs.
|
49
52
|
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
50
53
|
|
51
54
|
#if first arg is a valid URL, do already-written stuff;
|
@@ -55,7 +58,7 @@ module Upton
|
|
55
58
|
#TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
|
56
59
|
if selector.empty?
|
57
60
|
@url_array = index_url_or_array
|
58
|
-
elsif index_url_or_array =~ URI::ABS_URI
|
61
|
+
elsif index_url_or_array =~ ::URI::ABS_URI
|
59
62
|
@index_url = index_url_or_array
|
60
63
|
@index_selector = selector
|
61
64
|
@index_selector_method = selector_method
|
@@ -79,7 +82,7 @@ module Upton
|
|
79
82
|
|
80
83
|
# In order to not hammer servers, Upton waits for, by default, 30
|
81
84
|
# seconds between requests to the remote server.
|
82
|
-
@
|
85
|
+
@sleep_time_between_requests = 30 #seconds
|
83
86
|
|
84
87
|
# Folder name for stashes, if you want them to be stored somewhere else,
|
85
88
|
# e.g. under /tmp.
|
@@ -90,13 +93,15 @@ module Upton
|
|
90
93
|
end
|
91
94
|
|
92
95
|
|
96
|
+
# == Configuration Options
|
93
97
|
|
94
98
|
# If instance pages are paginated, <b>you must override</b>
|
95
99
|
# this method to return the next URL, given the current URL and its index.
|
96
100
|
#
|
97
101
|
# If instance pages aren't paginated, there's no need to override this.
|
98
102
|
#
|
99
|
-
#
|
103
|
+
# Recursion stops if fetching the URL returns an empty string or an error.
|
104
|
+
#
|
100
105
|
# e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
|
101
106
|
# ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
|
102
107
|
def next_instance_page_url(url, index)
|
@@ -108,7 +113,8 @@ module Upton
|
|
108
113
|
#
|
109
114
|
# If index pages aren't paginated, there's no need to override this.
|
110
115
|
#
|
111
|
-
#
|
116
|
+
# Recursion stops if fetching the URL returns an empty string or an error.
|
117
|
+
#
|
112
118
|
# e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
|
113
119
|
# ought to return "http://whatever.com/articles?page=2"
|
114
120
|
def next_index_page_url(url, index)
|
@@ -121,7 +127,7 @@ module Upton
|
|
121
127
|
self.url_array = self.get_index
|
122
128
|
end
|
123
129
|
CSV.open filename, 'wb' do |csv|
|
124
|
-
self.scrape_from_list(self.url_array, blk).each{|document|
|
130
|
+
self.scrape_from_list(self.url_array, blk).each{|document| csv << document }
|
125
131
|
end
|
126
132
|
end
|
127
133
|
|
@@ -139,7 +145,7 @@ module Upton
|
|
139
145
|
else
|
140
146
|
begin
|
141
147
|
puts "getting " + url if @verbose
|
142
|
-
sleep @
|
148
|
+
sleep @sleep_time_between_requests
|
143
149
|
resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
|
144
150
|
|
145
151
|
#this is silly, but rest-client needs to get on their game.
|
@@ -152,7 +158,6 @@ module Upton
|
|
152
158
|
elsif content_type.split('/').first == 'text'
|
153
159
|
'iso-8859-1'
|
154
160
|
end
|
155
|
-
puts charset
|
156
161
|
resp.force_encoding(charset) if charset
|
157
162
|
end
|
158
163
|
|
@@ -201,8 +206,12 @@ module Upton
|
|
201
206
|
resp
|
202
207
|
end
|
203
208
|
|
204
|
-
# Returns the
|
205
|
-
#
|
209
|
+
# Returns the article at `url`.
|
210
|
+
#
|
211
|
+
# If the page is stashed, returns that, otherwise, fetches it from the web.
|
212
|
+
#
|
213
|
+
# If an instance is paginated, returns the concatenated output of each
|
214
|
+
# page, e.g. if a news article has two pages.
|
206
215
|
def get_instance(url, index=0)
|
207
216
|
resp = self.get_page(url, @debug)
|
208
217
|
if !resp.empty?
|
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Jeremy B. Merrill
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-07-17 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rack
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,7 +27,6 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: thin
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
31
|
- - ! '>='
|
36
32
|
- !ruby/object:Gem::Version
|
@@ -38,7 +34,6 @@ dependencies:
|
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
38
|
- - ! '>='
|
44
39
|
- !ruby/object:Gem::Version
|
@@ -46,7 +41,6 @@ dependencies:
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: nokogiri
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
45
|
- - ! '>='
|
52
46
|
- !ruby/object:Gem::Version
|
@@ -54,7 +48,6 @@ dependencies:
|
|
54
48
|
type: :development
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
52
|
- - ! '>='
|
60
53
|
- !ruby/object:Gem::Version
|
@@ -62,7 +55,6 @@ dependencies:
|
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: yard
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
59
|
- - ! '>='
|
68
60
|
- !ruby/object:Gem::Version
|
@@ -70,7 +62,6 @@ dependencies:
|
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
66
|
- - ! '>='
|
76
67
|
- !ruby/object:Gem::Version
|
@@ -78,7 +69,6 @@ dependencies:
|
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: rest-client
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
73
|
- - ~>
|
84
74
|
- !ruby/object:Gem::Version
|
@@ -86,7 +76,6 @@ dependencies:
|
|
86
76
|
type: :runtime
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
80
|
- - ~>
|
92
81
|
- !ruby/object:Gem::Version
|
@@ -94,7 +83,6 @@ dependencies:
|
|
94
83
|
- !ruby/object:Gem::Dependency
|
95
84
|
name: nokogiri
|
96
85
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
86
|
requirements:
|
99
87
|
- - ! '>='
|
100
88
|
- !ruby/object:Gem::Version
|
@@ -102,7 +90,6 @@ dependencies:
|
|
102
90
|
type: :runtime
|
103
91
|
prerelease: false
|
104
92
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
93
|
requirements:
|
107
94
|
- - ! '>='
|
108
95
|
- !ruby/object:Gem::Version
|
@@ -125,27 +112,26 @@ files:
|
|
125
112
|
homepage: http://github.org/propublica/upton
|
126
113
|
licenses:
|
127
114
|
- MIT
|
115
|
+
metadata: {}
|
128
116
|
post_install_message:
|
129
117
|
rdoc_options: []
|
130
118
|
require_paths:
|
131
119
|
- lib
|
132
120
|
required_ruby_version: !ruby/object:Gem::Requirement
|
133
|
-
none: false
|
134
121
|
requirements:
|
135
122
|
- - ! '>='
|
136
123
|
- !ruby/object:Gem::Version
|
137
124
|
version: 1.8.7
|
138
125
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
-
none: false
|
140
126
|
requirements:
|
141
127
|
- - ! '>='
|
142
128
|
- !ruby/object:Gem::Version
|
143
129
|
version: '0'
|
144
130
|
requirements: []
|
145
131
|
rubyforge_project:
|
146
|
-
rubygems_version:
|
132
|
+
rubygems_version: 2.0.5
|
147
133
|
signing_key:
|
148
|
-
specification_version:
|
134
|
+
specification_version: 4
|
149
135
|
summary: A simple web-scraping framework
|
150
136
|
test_files:
|
151
137
|
- test/data/discussion.html
|