upton 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/upton.rb +47 -24
- data/lib/utils.rb +15 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZWYwNGQyM2ZkYWVhYWViZmU1ZDczNTA4OGZjMzhhN2FkMWIwNzc5Nw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MmVmMjdkZGFlYzBjZGJjNjNiOTg4MDJlNjhmZDA2YzI4MzdlY2E5Mw==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjgwMjAyYTM0YTNhODA5ZmExNjBkYzcwNDgzYjdjM2M2ZGJiODM0NmUzMDE0
|
10
|
+
YmFlNjhmZjNhMDgwMTRkMWFmMWYzODgxMzI1MTZhZWNmYTY0MTEzN2QzYzE4
|
11
|
+
MWM3YjE5YzAxNmU5NjViZGQyNjZkYzBkZjgyZTEzMWUzNjc3N2U=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZDA2MGI0YzllM2UyNmFhYTljNjZiNDM3NmVhNWJhNjljOGQwNWJlZTViZDQw
|
14
|
+
ZmZkZjBiNjlhNDQ3MzFlNjRkZWE5MzZjMjViMjQ2N2QxYWRjYzg1MGJmMmFk
|
15
|
+
ZDE1NmJhNjliNmViZDE5ZDY1NjRiNDg3OTIwYTU5NTA0OTJhM2U=
|
data/lib/upton.rb
CHANGED
@@ -1,17 +1,5 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
|
-
# *Upton* is a framework for easy web-scraping with a useful debug mode
|
4
|
-
# that doesn't hammer your target's servers. It does the repetitive parts of
|
5
|
-
# writing scrapers, so you only have to write the unique parts for each site.
|
6
|
-
#
|
7
|
-
# Upton operates on the theory that, for most scraping projects, you need to
|
8
|
-
# scrape two types of pages:
|
9
|
-
#
|
10
|
-
# 1. Index pages, which list instance pages. For example, a job search
|
11
|
-
# site's search page or a newspaper's homepage.
|
12
|
-
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
13
|
-
# job listings or news articles.
|
14
|
-
#
|
15
3
|
|
16
4
|
require 'nokogiri'
|
17
5
|
require 'uri'
|
@@ -19,19 +7,37 @@ require 'restclient'
|
|
19
7
|
require './lib/utils'
|
20
8
|
|
21
9
|
module Upton
|
22
|
-
|
23
|
-
# Upton
|
24
|
-
#
|
10
|
+
##
|
11
|
+
# *Upton* is a framework for easy web-scraping with a useful debug mode
|
12
|
+
# that doesn't hammer your target's servers. It does the repetitive parts of
|
13
|
+
# writing scrapers, so you only have to write the unique parts for each site.
|
14
|
+
#
|
15
|
+
# Upton operates on the theory that, for most scraping projects, you need to
|
16
|
+
# scrape two types of pages:
|
17
|
+
#
|
18
|
+
# 1. Index pages, which list instance pages. For example, a job search
|
19
|
+
# site's search page or a newspaper's homepage.
|
20
|
+
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
21
|
+
# job listings or news articles.
|
22
|
+
#
|
23
|
+
# Upton::Scraper can be used as-is for basic use-cases by:
|
24
|
+
# 1. specifying the pages to be scraped in `new` as an index page
|
25
|
+
# or as an Array of URLs.
|
26
|
+
# 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
|
27
|
+
# block from Upton::Utils.
|
28
|
+
# For more complicated cases; subclass Upton::Scraper
|
29
|
+
# e.g. +MyScraper < Upton::Scraper+ and overrdie various methods.
|
30
|
+
##
|
25
31
|
class Scraper
|
26
32
|
|
27
33
|
attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
|
28
34
|
|
29
|
-
|
30
|
-
|
35
|
+
##
|
31
36
|
# This is the main user-facing method for a basic scraper.
|
32
37
|
# Call +scrape+ with a block; this block will be called on
|
33
38
|
# the text of each instance page, (and optionally, its URL and its index
|
34
39
|
# in the list of instance URLs returned by +get_index+).
|
40
|
+
##
|
35
41
|
def scrape &blk
|
36
42
|
unless self.url_array
|
37
43
|
self.url_array = self.get_index
|
@@ -39,6 +45,7 @@ module Upton
|
|
39
45
|
self.scrape_from_list(self.url_array, blk)
|
40
46
|
end
|
41
47
|
|
48
|
+
##
|
42
49
|
# +index_url_or_array+: A list of string URLs, OR
|
43
50
|
# the URL of the page containing the list of instances.
|
44
51
|
# +selector+: The XPath or CSS that specifies the anchor elements within
|
@@ -49,6 +56,7 @@ module Upton
|
|
49
56
|
# do not need to set them.
|
50
57
|
# If you don't specify a selector, the first argument will be treated as a
|
51
58
|
# list of URLs.
|
59
|
+
##
|
52
60
|
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
53
61
|
|
54
62
|
#if first arg is a valid URL, do already-written stuff;
|
@@ -92,9 +100,7 @@ module Upton
|
|
92
100
|
end
|
93
101
|
end
|
94
102
|
|
95
|
-
|
96
|
-
# == Configuration Options
|
97
|
-
|
103
|
+
##
|
98
104
|
# If instance pages are paginated, <b>you must override</b>
|
99
105
|
# this method to return the next URL, given the current URL and its index.
|
100
106
|
#
|
@@ -104,10 +110,12 @@ module Upton
|
|
104
110
|
#
|
105
111
|
# e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
|
106
112
|
# ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
|
113
|
+
##
|
107
114
|
def next_instance_page_url(url, index)
|
108
115
|
""
|
109
116
|
end
|
110
117
|
|
118
|
+
##
|
111
119
|
# If index pages are paginated, <b>you must override</b>
|
112
120
|
# this method to return the next URL, given the current URL and its index.
|
113
121
|
#
|
@@ -117,10 +125,14 @@ module Upton
|
|
117
125
|
#
|
118
126
|
# e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
|
119
127
|
# ought to return "http://whatever.com/articles?page=2"
|
128
|
+
##
|
120
129
|
def next_index_page_url(url, index)
|
121
130
|
""
|
122
131
|
end
|
123
132
|
|
133
|
+
##
|
134
|
+
# Writes the scraped result to a CSV at the given filename.
|
135
|
+
##
|
124
136
|
def scrape_to_csv filename, &blk
|
125
137
|
require 'csv'
|
126
138
|
unless self.url_array
|
@@ -133,8 +145,11 @@ module Upton
|
|
133
145
|
|
134
146
|
protected
|
135
147
|
|
136
|
-
|
137
|
-
#Handles getting pages with RestClient or getting them from the local stash
|
148
|
+
##
|
149
|
+
# Handles getting pages with RestClient or getting them from the local stash.
|
150
|
+
#
|
151
|
+
# Uses a kludge (because rest-client is outdated) to handle encoding.
|
152
|
+
##
|
138
153
|
def get_page(url, stash=false)
|
139
154
|
return "" if url.empty?
|
140
155
|
|
@@ -179,21 +194,27 @@ module Upton
|
|
179
194
|
resp
|
180
195
|
end
|
181
196
|
|
197
|
+
##
|
182
198
|
# Return a list of URLs for the instances you want to scrape.
|
183
199
|
# This can optionally be overridden if, for example, the list of instances
|
184
200
|
# comes from an API.
|
201
|
+
##
|
185
202
|
def get_index
|
186
203
|
parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
|
187
204
|
end
|
188
205
|
|
189
|
-
|
190
|
-
# the
|
206
|
+
##
|
207
|
+
# Using the XPath expression or CSS selector and selector_method that
|
208
|
+
# uniquely identifies the links in the index, return those links as strings.
|
209
|
+
##
|
191
210
|
def parse_index(text, selector, selector_method=:xpath)
|
192
211
|
Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
|
193
212
|
end
|
194
213
|
|
214
|
+
##
|
195
215
|
# Returns the concatenated output of each member of a paginated index,
|
196
216
|
# e.g. a site listing links with 2+ pages.
|
217
|
+
##
|
197
218
|
def get_index_pages(url, index)
|
198
219
|
resp = self.get_page(url, @index_debug)
|
199
220
|
if !resp.empty?
|
@@ -206,12 +227,14 @@ module Upton
|
|
206
227
|
resp
|
207
228
|
end
|
208
229
|
|
230
|
+
##
|
209
231
|
# Returns the article at `url`.
|
210
232
|
#
|
211
233
|
# If the page is stashed, returns that, otherwise, fetches it from the web.
|
212
234
|
#
|
213
235
|
# If an instance is paginated, returns the concatenated output of each
|
214
236
|
# page, e.g. if a news article has two pages.
|
237
|
+
##
|
215
238
|
def get_instance(url, index=0)
|
216
239
|
resp = self.get_page(url, @debug)
|
217
240
|
if !resp.empty?
|
data/lib/utils.rb
CHANGED
@@ -1,8 +1,19 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module Upton
|
4
|
+
|
5
|
+
##
|
6
|
+
# This class contains a collection of helpers for Upton
|
7
|
+
#
|
8
|
+
# Each method returns a Proc that (with an & ) can be used as the final
|
9
|
+
# argument to Upton's `scrape` and `scrape_to_csv`
|
10
|
+
##
|
4
11
|
module Utils
|
5
|
-
|
12
|
+
|
13
|
+
##
|
14
|
+
# Scrapes an HTML <table> element into an Array of Arrays. The header, if
|
15
|
+
# present, is returned as the first row.
|
16
|
+
##
|
6
17
|
def self.table(table_selector, selector_method=:xpath)
|
7
18
|
require 'csv'
|
8
19
|
return Proc.new do |instance_html|
|
@@ -16,6 +27,9 @@ module Upton
|
|
16
27
|
end
|
17
28
|
end
|
18
29
|
|
30
|
+
##
|
31
|
+
# Scrapes any set of HTML elements into an Array.
|
32
|
+
##
|
19
33
|
def self.list(list_selector, selector_method=:xpath)
|
20
34
|
require 'csv'
|
21
35
|
return Proc.new do |instance_html|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rack
|