upton 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/upton.rb +186 -0
- data/test/data/discussion.html +620 -0
- data/test/data/propublica.html +1554 -0
- data/test/data/prosecutor.html +2223 -0
- data/test/data/sixfacts.html +2234 -0
- data/test/data/webinar.html +881 -0
- data/test/test_upton.rb +82 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA512:
+  data.tar.gz: 0c50b13aca2d3f11f8ebbef72420dfc3276169fb0c1eafadd437ae7d1a83f2d9467511ddc8e529321be677e26bcbd1af02c6c2c615d14e1ea09de0cc9b6d9762
+  metadata.gz: e795b2a9ffe25373b419ccded0db1a7786f7b4789dbf8fc7da28cd53764232ddcad30f398b569e78ae5404690d96661577ea84340b4cf4a4e40798dbd82779e0
+SHA1:
+  data.tar.gz: 13e674057073d5cd0fc1dff400450717a0613bc4
+  metadata.gz: 1629ba3d63b260994bcebf99f1f5d79a499bb707
data/lib/upton.rb
ADDED
@@ -0,0 +1,186 @@
+# encoding: UTF-8
+
+# External dependencies: Nokogiri for HTML parsing, RestClient for HTTP.
+require 'nokogiri'
+require 'restclient'
+
+# *Upton* is a framework for easy web-scraping with a useful debug mode
+# that doesn't hammer your target's servers. It does the repetitive parts of
+# writing scrapers, so you only have to write the unique parts for each site.
+#
+# Upton operates on the theory that, for most scraping projects, you need to
+# scrape two types of pages:
+#
+# 1. Index pages, which list instance pages. For example, a job search
+#    site's search page or a newspaper's homepage.
+# 2. Instance pages, which represent the goal of your scraping, e.g.
+#    job listings or news articles.
+
+module Upton
+
+  # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
+  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+.
+  class Scraper
+
+    attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder
+
+    # == Basic use-case methods.
+
+    # This is the main user-facing method for a basic scraper.
+    # Call +scrape+ with a block; this block will be called on
+    # the text of each instance page (and, optionally, its URL and its index
+    # in the list of instance URLs returned by +get_index+).
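+    #
+    # A minimal usage sketch (the URL and selector are hypothetical, and the
+    # index's links are assumed to be absolute, since Upton does not resolve
+    # relative hrefs):
+    #
+    #   scraper = Upton::Scraper.new("http://example.com/news", "//a[@class='story-link']")
+    #   scraper.scrape do |html, url, index|
+    #     puts "#{index}: #{url} (#{html.length} bytes)"
+    #   end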
+    def scrape &blk
+      self.scrape_from_list(self.get_index, blk)
+    end
+
+
+    # == Configuration Options
+
+    # +index_url+: The URL of the page containing the list of instances.
+    # +selector+: The XPath or CSS selector that specifies the anchor elements
+    # within the page.
+    # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
+    #
+    # These options are a shortcut. If you plan to override +get_index+, you
+    # do not need to set them.
+    def initialize(index_url="", selector="", selector_method=:xpath)
+      @index_url = index_url
+      @index_selector = selector
+      @index_selector_method = selector_method
+
+      # If true, Upton prints information about when it gets
+      # files from the internet and when it gets them from its stash.
+      @verbose = false
+
+      # If true, Upton fetches each instance page only once; future requests
+      # for that page are served from the locally stashed version.
+      # You may want to set @debug to false for production (but maybe not).
+      # You can also control stashing behavior on a per-call basis with the
+      # optional second argument to get_page if, for instance, you want to
+      # stash only certain instance pages, e.g. based on their modification date.
+      @debug = true
+      # Index debug does the same, but for index pages.
+      @index_debug = false
+
+      # In order to not hammer servers, Upton waits (by default) 30 seconds
+      # between requests to the remote server.
+      @nice_sleep_time = 30 # seconds
+
+      # Folder name for stashes, if you want them to be stored somewhere else,
+      # e.g. under /tmp.
+      @stash_folder = "stashes"
+      unless Dir.exist?(@stash_folder)
+        Dir.mkdir(@stash_folder)
+      end
+    end
+
+
+    # If instance pages are paginated, <b>you must override</b>
+    # this method to return the next URL, given the current URL and its index.
+    #
+    # If instance pages aren't paginated, there's no need to override this.
+    #
+    # Returned URLs that are empty strings are ignored (and recursion stops).
+    # e.g. +next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)+
+    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
+    def next_instance_page_url(url, index)
+      ""
+    end
+
+    # If index pages are paginated, <b>you must override</b>
+    # this method to return the next URL, given the current URL and its index.
+    #
+    # If index pages aren't paginated, there's no need to override this.
+    #
+    # Returned URLs that are empty strings are ignored (and recursion stops).
+    # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
+    # ought to return "http://whatever.com/articles?page=2"
+    def next_index_page_url(url, index)
+      ""
+    end
+
+
+    protected
+
+
+    # Handles getting pages with RestClient, or getting them from the local stash.
+    def get_page(url, stash=false)
+      return "" if url.empty?
+
+      # The filename for each stashed version is a cleaned version of the URL.
+      if stash && File.exist?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")) )
+        puts "using a stashed copy of " + url if @verbose
+        resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read
+      else
+        begin
+          puts "getting " + url if @verbose
+          sleep @nice_sleep_time
+          resp = RestClient.get(url, {:accept => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
+        rescue RestClient::ResourceNotFound
+          resp = ""
+        rescue RestClient::InternalServerError
+          resp = ""
+        end
+        if stash
+          puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
+          open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace))}
+        end
+      end
+      resp
+    end
+
+    # Return a list of URLs for the instances you want to scrape.
+    # This can optionally be overridden if, for example, the list of instances
+    # comes from an API.
+    def get_index
+      parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
+    end
+
+    # Using the XPath or CSS selector and selector_method that uniquely locates
+    # the links in the index, return those links as strings.
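+    #
+    # For instance, with a throwaway HTML snippet:
+    #
+    #   parse_index('<a href="/a1">one</a> <a href="/a2">two</a>', "//a")
+    #   # => ["/a1", "/a2"]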
+    def parse_index(text, selector, selector_method=:xpath)
+      Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
+    end
+
+    # Returns the concatenated output of each member of a paginated index,
+    # e.g. a site listing links with 2+ pages.
+    def get_index_pages(url, index)
+      resp = self.get_page(url, @index_debug)
+      if !resp.empty?
+        next_url = self.next_index_page_url(url, index + 1)
+        unless next_url == url
+          next_resp = self.get_index_pages(next_url, index + 1).to_s
+          resp += next_resp
+        end
+      end
+      resp
+    end
+
+    # Returns the concatenated output of each member of a paginated instance,
+    # e.g. a news article with 2 pages.
+    def get_instance(url, index=0)
+      resp = self.get_page(url, @debug)
+      if !resp.empty?
+        next_url = self.next_instance_page_url(url, index + 1)
+        unless next_url == url
+          next_resp = self.get_instance(next_url, index + 1).to_s
+          resp += next_resp
+        end
+      end
+      resp
+    end
+
+    # Just a helper for +scrape+.
+    def scrape_from_list(list, blk)
+      puts "Scraping #{list.size} instances" if @verbose
+      list.each_with_index.map do |instance_url, index|
+        blk.call(get_instance(instance_url), instance_url, index)
+      end
+    end
+
+    # It's often useful to have this +slug+ method for (almost certainly)
+    # uniquely identifying pages.
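+    #
+    # e.g. +slug("http://example.com/articles/my-story.html?src=rss")+
+    # returns "my-story" (a hypothetical URL).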
+    def slug(url)
+      url.split("/")[-1].gsub(/\?.*/, "").gsub(/\.html.*/, "")
+    end
+
+  end
+end
|