upton 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/lib/upton.rb +24 -15
- metadata +5 -19
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
Y2M0NjVlMWNlN2ZkNTEwMzIzYmQ2N2MxYjBjZjExZmQ1ZTYyZjAzNw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NTg4NTc3NDQ3OWQ1OGRiY2Y0ZmFkYjI5OWU2NmRkNjVlNDNlYzRkZQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NzY5YWNlMTIwNWM5YTNiM2YwMDkwNDNhMTlhNjM2ZTQyMWQ1YjRhMTUwNThm
|
10
|
+
YThkYzU4ZjI2ZWU0YzA1MjRjMDgxMmEwOGU1MGFlYjMyZDZjYTNjOGU3NDAy
|
11
|
+
OTQ0Yzc5NTNiNWU0MjE0YTZmYWZkZTQxZjQ1NjliZGEyOGYyZWI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
Zjg4ODk0ZGZjMDVkZGE5MGEzMTk1Mjg0MzQ1ODllZjUzZjc2MWFjOTU3YzAz
|
14
|
+
YzE5ZDAyZGZmNDdmZDU3ZWViZTEzNjNhZGNmMGZmMDgwY2YxYTgwZjkxNzFj
|
15
|
+
NTMwODU4ODg2Nzg2NGYzOGU2NmJhNDBmNmI1OWQwZDVkN2UzZmM=
|
data/lib/upton.rb
CHANGED
@@ -11,8 +11,11 @@
|
|
11
11
|
# site's search page or a newspaper's homepage.
|
12
12
|
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
13
13
|
# job listings or news articles.
|
14
|
+
#
|
14
15
|
|
15
16
|
require 'nokogiri'
|
17
|
+
require 'uri'
|
18
|
+
require 'restclient'
|
16
19
|
require './lib/utils'
|
17
20
|
|
18
21
|
module Upton
|
@@ -21,7 +24,7 @@ module Upton
|
|
21
24
|
# in more complicated cases; e.g. +MyScraper < Upton::Scraper+
|
22
25
|
class Scraper
|
23
26
|
|
24
|
-
attr_accessor :verbose, :debug, :
|
27
|
+
attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
|
25
28
|
|
26
29
|
# == Basic use-case methods.
|
27
30
|
|
@@ -36,16 +39,16 @@ module Upton
|
|
36
39
|
self.scrape_from_list(self.url_array, blk)
|
37
40
|
end
|
38
41
|
|
39
|
-
|
40
|
-
#
|
41
|
-
|
42
|
-
# +index_url+: The URL of the page containing the list of instances.
|
42
|
+
# +index_url_or_array+: A list of string URLs, OR
|
43
|
+
# the URL of the page containing the list of instances.
|
43
44
|
# +selector+: The XPath or CSS that specifies the anchor elements within
|
44
|
-
# the
|
45
|
+
# the page, if a url is specified for the previous argument.
|
45
46
|
# +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
|
46
47
|
#
|
47
48
|
# These options are a shortcut. If you plan to override +get_index+, you
|
48
49
|
# do not need to set them.
|
50
|
+
# If you don't specify a selector, the first argument will be treated as a
|
51
|
+
# list of URLs.
|
49
52
|
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
50
53
|
|
51
54
|
#if first arg is a valid URL, do already-written stuff;
|
@@ -55,7 +58,7 @@ module Upton
|
|
55
58
|
#TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
|
56
59
|
if selector.empty?
|
57
60
|
@url_array = index_url_or_array
|
58
|
-
elsif index_url_or_array =~ URI::ABS_URI
|
61
|
+
elsif index_url_or_array =~ ::URI::ABS_URI
|
59
62
|
@index_url = index_url_or_array
|
60
63
|
@index_selector = selector
|
61
64
|
@index_selector_method = selector_method
|
@@ -79,7 +82,7 @@ module Upton
|
|
79
82
|
|
80
83
|
# In order to not hammer servers, Upton waits for, by default, 30
|
81
84
|
# seconds between requests to the remote server.
|
82
|
-
@
|
85
|
+
@sleep_time_between_requests = 30 #seconds
|
83
86
|
|
84
87
|
# Folder name for stashes, if you want them to be stored somewhere else,
|
85
88
|
# e.g. under /tmp.
|
@@ -90,13 +93,15 @@ module Upton
|
|
90
93
|
end
|
91
94
|
|
92
95
|
|
96
|
+
# == Configuration Options
|
93
97
|
|
94
98
|
# If instance pages are paginated, <b>you must override</b>
|
95
99
|
# this method to return the next URL, given the current URL and its index.
|
96
100
|
#
|
97
101
|
# If instance pages aren't paginated, there's no need to override this.
|
98
102
|
#
|
99
|
-
#
|
103
|
+
# Recursion stops if fetching the URL returns an empty string or an error.
|
104
|
+
#
|
100
105
|
# e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
|
101
106
|
# ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
|
102
107
|
def next_instance_page_url(url, index)
|
@@ -108,7 +113,8 @@ module Upton
|
|
108
113
|
#
|
109
114
|
# If index pages aren't paginated, there's no need to override this.
|
110
115
|
#
|
111
|
-
#
|
116
|
+
# Recursion stops if fetching the URL returns an empty string or an error.
|
117
|
+
#
|
112
118
|
# e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
|
113
119
|
# ought to return "http://whatever.com/articles?page=2"
|
114
120
|
def next_index_page_url(url, index)
|
@@ -121,7 +127,7 @@ module Upton
|
|
121
127
|
self.url_array = self.get_index
|
122
128
|
end
|
123
129
|
CSV.open filename, 'wb' do |csv|
|
124
|
-
self.scrape_from_list(self.url_array, blk).each{|document|
|
130
|
+
self.scrape_from_list(self.url_array, blk).each{|document| csv << document }
|
125
131
|
end
|
126
132
|
end
|
127
133
|
|
@@ -139,7 +145,7 @@ module Upton
|
|
139
145
|
else
|
140
146
|
begin
|
141
147
|
puts "getting " + url if @verbose
|
142
|
-
sleep @
|
148
|
+
sleep @sleep_time_between_requests
|
143
149
|
resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
|
144
150
|
|
145
151
|
#this is silly, but rest-client needs to get on their game.
|
@@ -152,7 +158,6 @@ module Upton
|
|
152
158
|
elsif content_type.split('/').first == 'text'
|
153
159
|
'iso-8859-1'
|
154
160
|
end
|
155
|
-
puts charset
|
156
161
|
resp.force_encoding(charset) if charset
|
157
162
|
end
|
158
163
|
|
@@ -201,8 +206,12 @@ module Upton
|
|
201
206
|
resp
|
202
207
|
end
|
203
208
|
|
204
|
-
# Returns the
|
205
|
-
#
|
209
|
+
# Returns the article at `url`.
|
210
|
+
#
|
211
|
+
# If the page is stashed, returns that, otherwise, fetches it from the web.
|
212
|
+
#
|
213
|
+
# If an instance is paginated, returns the concatenated output of each
|
214
|
+
# page, e.g. if a news article has two pages.
|
206
215
|
def get_instance(url, index=0)
|
207
216
|
resp = self.get_page(url, @debug)
|
208
217
|
if !resp.empty?
|
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Jeremy B. Merrill
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-07-17 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rack
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,7 +27,6 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: thin
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
31
|
- - ! '>='
|
36
32
|
- !ruby/object:Gem::Version
|
@@ -38,7 +34,6 @@ dependencies:
|
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
38
|
- - ! '>='
|
44
39
|
- !ruby/object:Gem::Version
|
@@ -46,7 +41,6 @@ dependencies:
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: nokogiri
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
45
|
- - ! '>='
|
52
46
|
- !ruby/object:Gem::Version
|
@@ -54,7 +48,6 @@ dependencies:
|
|
54
48
|
type: :development
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
52
|
- - ! '>='
|
60
53
|
- !ruby/object:Gem::Version
|
@@ -62,7 +55,6 @@ dependencies:
|
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: yard
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
59
|
- - ! '>='
|
68
60
|
- !ruby/object:Gem::Version
|
@@ -70,7 +62,6 @@ dependencies:
|
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
66
|
- - ! '>='
|
76
67
|
- !ruby/object:Gem::Version
|
@@ -78,7 +69,6 @@ dependencies:
|
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: rest-client
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
73
|
- - ~>
|
84
74
|
- !ruby/object:Gem::Version
|
@@ -86,7 +76,6 @@ dependencies:
|
|
86
76
|
type: :runtime
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
80
|
- - ~>
|
92
81
|
- !ruby/object:Gem::Version
|
@@ -94,7 +83,6 @@ dependencies:
|
|
94
83
|
- !ruby/object:Gem::Dependency
|
95
84
|
name: nokogiri
|
96
85
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
86
|
requirements:
|
99
87
|
- - ! '>='
|
100
88
|
- !ruby/object:Gem::Version
|
@@ -102,7 +90,6 @@ dependencies:
|
|
102
90
|
type: :runtime
|
103
91
|
prerelease: false
|
104
92
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
93
|
requirements:
|
107
94
|
- - ! '>='
|
108
95
|
- !ruby/object:Gem::Version
|
@@ -125,27 +112,26 @@ files:
|
|
125
112
|
homepage: http://github.org/propublica/upton
|
126
113
|
licenses:
|
127
114
|
- MIT
|
115
|
+
metadata: {}
|
128
116
|
post_install_message:
|
129
117
|
rdoc_options: []
|
130
118
|
require_paths:
|
131
119
|
- lib
|
132
120
|
required_ruby_version: !ruby/object:Gem::Requirement
|
133
|
-
none: false
|
134
121
|
requirements:
|
135
122
|
- - ! '>='
|
136
123
|
- !ruby/object:Gem::Version
|
137
124
|
version: 1.8.7
|
138
125
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
-
none: false
|
140
126
|
requirements:
|
141
127
|
- - ! '>='
|
142
128
|
- !ruby/object:Gem::Version
|
143
129
|
version: '0'
|
144
130
|
requirements: []
|
145
131
|
rubyforge_project:
|
146
|
-
rubygems_version:
|
132
|
+
rubygems_version: 2.0.5
|
147
133
|
signing_key:
|
148
|
-
specification_version:
|
134
|
+
specification_version: 4
|
149
135
|
summary: A simple web-scraping framework
|
150
136
|
test_files:
|
151
137
|
- test/data/discussion.html
|