upton 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/upton.rb +47 -24
- data/lib/utils.rb +15 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZWYwNGQyM2ZkYWVhYWViZmU1ZDczNTA4OGZjMzhhN2FkMWIwNzc5Nw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MmVmMjdkZGFlYzBjZGJjNjNiOTg4MDJlNjhmZDA2YzI4MzdlY2E5Mw==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjgwMjAyYTM0YTNhODA5ZmExNjBkYzcwNDgzYjdjM2M2ZGJiODM0NmUzMDE0
|
10
|
+
YmFlNjhmZjNhMDgwMTRkMWFmMWYzODgxMzI1MTZhZWNmYTY0MTEzN2QzYzE4
|
11
|
+
MWM3YjE5YzAxNmU5NjViZGQyNjZkYzBkZjgyZTEzMWUzNjc3N2U=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZDA2MGI0YzllM2UyNmFhYTljNjZiNDM3NmVhNWJhNjljOGQwNWJlZTViZDQw
|
14
|
+
ZmZkZjBiNjlhNDQ3MzFlNjRkZWE5MzZjMjViMjQ2N2QxYWRjYzg1MGJmMmFk
|
15
|
+
ZDE1NmJhNjliNmViZDE5ZDY1NjRiNDg3OTIwYTU5NTA0OTJhM2U=
|
data/lib/upton.rb
CHANGED
@@ -1,17 +1,5 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
|
-
# *Upton* is a framework for easy web-scraping with a useful debug mode
|
4
|
-
# that doesn't hammer your target's servers. It does the repetitive parts of
|
5
|
-
# writing scrapers, so you only have to write the unique parts for each site.
|
6
|
-
#
|
7
|
-
# Upton operates on the theory that, for most scraping projects, you need to
|
8
|
-
# scrape two types of pages:
|
9
|
-
#
|
10
|
-
# 1. Index pages, which list instance pages. For example, a job search
|
11
|
-
# site's search page or a newspaper's homepage.
|
12
|
-
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
13
|
-
# job listings or news articles.
|
14
|
-
#
|
15
3
|
|
16
4
|
require 'nokogiri'
|
17
5
|
require 'uri'
|
@@ -19,19 +7,37 @@ require 'restclient'
|
|
19
7
|
require './lib/utils'
|
20
8
|
|
21
9
|
module Upton
|
22
|
-
|
23
|
-
# Upton
|
24
|
-
#
|
10
|
+
##
|
11
|
+
# *Upton* is a framework for easy web-scraping with a useful debug mode
|
12
|
+
# that doesn't hammer your target's servers. It does the repetitive parts of
|
13
|
+
# writing scrapers, so you only have to write the unique parts for each site.
|
14
|
+
#
|
15
|
+
# Upton operates on the theory that, for most scraping projects, you need to
|
16
|
+
# scrape two types of pages:
|
17
|
+
#
|
18
|
+
# 1. Index pages, which list instance pages. For example, a job search
|
19
|
+
# site's search page or a newspaper's homepage.
|
20
|
+
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
21
|
+
# job listings or news articles.
|
22
|
+
#
|
23
|
+
# Upton::Scraper can be used as-is for basic use-cases by:
|
24
|
+
# 1. specifying the pages to be scraped in `new` as an index page
|
25
|
+
# or as an Array of URLs.
|
26
|
+
# 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
|
27
|
+
# block from Upton::Utils.
|
28
|
+
# For more complicated cases, subclass Upton::Scraper
|
29
|
+
# e.g. +MyScraper < Upton::Scraper+ and override various methods.
|
30
|
+
##
|
25
31
|
class Scraper
|
26
32
|
|
27
33
|
attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
|
28
34
|
|
29
|
-
|
30
|
-
|
35
|
+
##
|
31
36
|
# This is the main user-facing method for a basic scraper.
|
32
37
|
# Call +scrape+ with a block; this block will be called on
|
33
38
|
# the text of each instance page, (and optionally, its URL and its index
|
34
39
|
# in the list of instance URLs returned by +get_index+).
|
40
|
+
##
|
35
41
|
def scrape &blk
|
36
42
|
unless self.url_array
|
37
43
|
self.url_array = self.get_index
|
@@ -39,6 +45,7 @@ module Upton
|
|
39
45
|
self.scrape_from_list(self.url_array, blk)
|
40
46
|
end
|
41
47
|
|
48
|
+
##
|
42
49
|
# +index_url_or_array+: A list of string URLs, OR
|
43
50
|
# the URL of the page containing the list of instances.
|
44
51
|
# +selector+: The XPath or CSS that specifies the anchor elements within
|
@@ -49,6 +56,7 @@ module Upton
|
|
49
56
|
# do not need to set them.
|
50
57
|
# If you don't specify a selector, the first argument will be treated as a
|
51
58
|
# list of URLs.
|
59
|
+
##
|
52
60
|
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
53
61
|
|
54
62
|
#if first arg is a valid URL, do already-written stuff;
|
@@ -92,9 +100,7 @@ module Upton
|
|
92
100
|
end
|
93
101
|
end
|
94
102
|
|
95
|
-
|
96
|
-
# == Configuration Options
|
97
|
-
|
103
|
+
##
|
98
104
|
# If instance pages are paginated, <b>you must override</b>
|
99
105
|
# this method to return the next URL, given the current URL and its index.
|
100
106
|
#
|
@@ -104,10 +110,12 @@ module Upton
|
|
104
110
|
#
|
105
111
|
# e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
|
106
112
|
# ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
|
113
|
+
##
|
107
114
|
def next_instance_page_url(url, index)
|
108
115
|
""
|
109
116
|
end
|
110
117
|
|
118
|
+
##
|
111
119
|
# If index pages are paginated, <b>you must override</b>
|
112
120
|
# this method to return the next URL, given the current URL and its index.
|
113
121
|
#
|
@@ -117,10 +125,14 @@ module Upton
|
|
117
125
|
#
|
118
126
|
# e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
|
119
127
|
# ought to return "http://whatever.com/articles?page=2"
|
128
|
+
##
|
120
129
|
def next_index_page_url(url, index)
|
121
130
|
""
|
122
131
|
end
|
123
132
|
|
133
|
+
##
|
134
|
+
# Writes the scraped result to a CSV at the given filename.
|
135
|
+
##
|
124
136
|
def scrape_to_csv filename, &blk
|
125
137
|
require 'csv'
|
126
138
|
unless self.url_array
|
@@ -133,8 +145,11 @@ module Upton
|
|
133
145
|
|
134
146
|
protected
|
135
147
|
|
136
|
-
|
137
|
-
#Handles getting pages with RestClient or getting them from the local stash
|
148
|
+
##
|
149
|
+
# Handles getting pages with RestClient or getting them from the local stash.
|
150
|
+
#
|
151
|
+
# Uses a kludge (because rest-client is outdated) to handle encoding.
|
152
|
+
##
|
138
153
|
def get_page(url, stash=false)
|
139
154
|
return "" if url.empty?
|
140
155
|
|
@@ -179,21 +194,27 @@ module Upton
|
|
179
194
|
resp
|
180
195
|
end
|
181
196
|
|
197
|
+
##
|
182
198
|
# Return a list of URLs for the instances you want to scrape.
|
183
199
|
# This can optionally be overridden if, for example, the list of instances
|
184
200
|
# comes from an API.
|
201
|
+
##
|
185
202
|
def get_index
|
186
203
|
parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
|
187
204
|
end
|
188
205
|
|
189
|
-
|
190
|
-
# the
|
206
|
+
##
|
207
|
+
# Using the XPath expression or CSS selector and selector_method that
|
208
|
+
# uniquely identifies the links in the index, return those links as strings.
|
209
|
+
##
|
191
210
|
def parse_index(text, selector, selector_method=:xpath)
|
192
211
|
Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
|
193
212
|
end
|
194
213
|
|
214
|
+
##
|
195
215
|
# Returns the concatenated output of each member of a paginated index,
|
196
216
|
# e.g. a site listing links with 2+ pages.
|
217
|
+
##
|
197
218
|
def get_index_pages(url, index)
|
198
219
|
resp = self.get_page(url, @index_debug)
|
199
220
|
if !resp.empty?
|
@@ -206,12 +227,14 @@ module Upton
|
|
206
227
|
resp
|
207
228
|
end
|
208
229
|
|
230
|
+
##
|
209
231
|
# Returns the article at `url`.
|
210
232
|
#
|
211
233
|
# If the page is stashed, returns that, otherwise, fetches it from the web.
|
212
234
|
#
|
213
235
|
# If an instance is paginated, returns the concatenated output of each
|
214
236
|
# page, e.g. if a news article has two pages.
|
237
|
+
##
|
215
238
|
def get_instance(url, index=0)
|
216
239
|
resp = self.get_page(url, @debug)
|
217
240
|
if !resp.empty?
|
data/lib/utils.rb
CHANGED
@@ -1,8 +1,19 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module Upton
|
4
|
+
|
5
|
+
##
|
6
|
+
# This module contains a collection of helpers for Upton
|
7
|
+
#
|
8
|
+
# Each method returns a Proc that (with an & ) can be used as the final
|
9
|
+
# argument to Upton's `scrape` and `scrape_to_csv`
|
10
|
+
##
|
4
11
|
module Utils
|
5
|
-
|
12
|
+
|
13
|
+
##
|
14
|
+
# Scrapes an HTML <table> element into an Array of Arrays. The header, if
|
15
|
+
# present, is returned as the first row.
|
16
|
+
##
|
6
17
|
def self.table(table_selector, selector_method=:xpath)
|
7
18
|
require 'csv'
|
8
19
|
return Proc.new do |instance_html|
|
@@ -16,6 +27,9 @@ module Upton
|
|
16
27
|
end
|
17
28
|
end
|
18
29
|
|
30
|
+
##
|
31
|
+
# Scrapes any set of HTML elements into an Array.
|
32
|
+
##
|
19
33
|
def self.list(list_selector, selector_method=:xpath)
|
20
34
|
require 'csv'
|
21
35
|
return Proc.new do |instance_html|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rack
|