upton 0.1.0
- checksums.yaml +7 -0
- data/lib/upton.rb +186 -0
- data/test/data/discussion.html +620 -0
- data/test/data/propublica.html +1554 -0
- data/test/data/prosecutor.html +2223 -0
- data/test/data/sixfacts.html +2234 -0
- data/test/data/webinar.html +881 -0
- data/test/test_upton.rb +82 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA512:
  data.tar.gz: 0c50b13aca2d3f11f8ebbef72420dfc3276169fb0c1eafadd437ae7d1a83f2d9467511ddc8e529321be677e26bcbd1af02c6c2c615d14e1ea09de0cc9b6d9762
  metadata.gz: e795b2a9ffe25373b419ccded0db1a7786f7b4789dbf8fc7da28cd53764232ddcad30f398b569e78ae5404690d96661577ea84340b4cf4a4e40798dbd82779e0
SHA1:
  data.tar.gz: 13e674057073d5cd0fc1dff400450717a0613bc4
  metadata.gz: 1629ba3d63b260994bcebf99f1f5d79a499bb707
data/lib/upton.rb
ADDED
@@ -0,0 +1,186 @@
# encoding: UTF-8

# These libraries are used by the code below.
require 'nokogiri'
require 'restclient'

# *Upton* is a framework for easy web-scraping with a useful debug mode
# that doesn't hammer your target's servers. It does the repetitive parts of
# writing scrapers, so you only have to write the unique parts for each site.
#
# Upton operates on the theory that, for most scraping projects, you need to
# scrape two types of pages:
#
# 1. Index pages, which list instance pages. For example, a job search
#    site's search page or a newspaper's homepage.
# 2. Instance pages, which represent the goal of your scraping, e.g.
#    job listings or news articles.

module Upton

  # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
  class Scraper

    attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder

    # == Basic use-case methods.

    # This is the main user-facing method for a basic scraper.
    # Call +scrape+ with a block; this block will be called on
    # the text of each instance page (and, optionally, its URL and its index
    # in the list of instance URLs returned by +get_index+).
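    # A hypothetical example (the URL and selector here are made up):
    #   u = Upton::Scraper.new("http://www.example.com/news", "//a[@class='article-link']")
    #   u.scrape do |article_html, url, index|
    #     # parse each article's HTML here, e.g. with Nokogiri
    #   end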
    def scrape &blk
      self.scrape_from_list(self.get_index, blk)
    end


    # == Configuration Options

    # +index_url+: The URL of the page containing the list of instances.
    # +selector+: The XPath or CSS that specifies the anchor elements within
    # the page.
    # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
    #
    # These options are a shortcut. If you plan to override +get_index+, you
    # do not need to set them.
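    # e.g. +Upton::Scraper.new("http://www.example.com/news", "a.article-link", :css)+
    # (a hypothetical URL and CSS selector).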
    def initialize(index_url="", selector="", selector_method=:xpath)
      @index_url = index_url
      @index_selector = selector
      @index_selector_method = selector_method

      # If true, then Upton prints information about when it gets
      # files from the internet and when it gets them from its stash.
      @verbose = false

      # If true, then Upton fetches each instance page only once;
      # future requests for that file are responded to with the locally
      # stashed version.
      # You may want to set @debug to false for production (but maybe not).
      # You can also control stashing behavior on a per-call basis with the
      # optional second argument to get_page if, for instance, you want to
      # stash certain instance pages, e.g. based on their modification date.
      @debug = true
      # Index debug does the same, but for index pages.
      @index_debug = false

      # In order to not hammer servers, Upton waits for 30 seconds (by
      # default) between requests to the remote server.
      @nice_sleep_time = 30 # seconds

      # Folder name for stashes, if you want them to be stored somewhere else,
      # e.g. under /tmp.
      @stash_folder = "stashes"
      unless Dir.exists?(@stash_folder)
        Dir.mkdir(@stash_folder)
      end
    end


    # If instance pages are paginated, <b>you must override</b>
    # this method to return the next URL, given the current URL and its index.
    #
    # If instance pages aren't paginated, there's no need to override this.
    #
    # Returned URLs that are empty strings are ignored (and recursion stops).
    # e.g. +next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)+
    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
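    # A minimal override might look like this, assuming the hypothetical
    # ?page=N URL scheme above:
    #   def next_instance_page_url(url, index)
    #     url.gsub(/page=\d+/, "page=#{index}")
    #   end
    # (A request for a page past the last one should 404, which get_page
    # treats as an empty string, stopping the recursion.)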
    def next_instance_page_url(url, index)
      ""
    end

    # If index pages are paginated, <b>you must override</b>
    # this method to return the next URL, given the current URL and its index.
    #
    # If index pages aren't paginated, there's no need to override this.
    #
    # Returned URLs that are empty strings are ignored (and recursion stops).
    # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
    # ought to return "http://whatever.com/articles?page=2"
    def next_index_page_url(url, index)
      ""
    end


    protected


    # Handles getting pages with RestClient, or getting them from the local stash.
    def get_page(url, stash=false)
      return "" if url.empty?

      # The filename for each stashed version is a cleaned version of the URL.
      if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
        puts "usin' a stashed copy of " + url if @verbose
        resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read
      else
        begin
          puts "getting " + url if @verbose
          sleep @nice_sleep_time
          resp = RestClient.get(url, {:accept => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
        rescue RestClient::ResourceNotFound
          resp = ""
        rescue RestClient::InternalServerError
          resp = ""
        end
        if stash
          puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
          open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ))}
        end
      end
      resp
    end

    # Return a list of URLs for the instances you want to scrape.
    # This can optionally be overridden if, for example, the list of instances
    # comes from an API.
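    # e.g. a hypothetical override that reads instance URLs from a JSON API
    # (requires the +json+ library):
    #   def get_index
    #     JSON.parse(get_page("http://api.example.com/articles.json"))["urls"]
    #   end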
    def get_index
      parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
    end

    # Using the XPath or CSS selector and selector_method that uniquely locates
    # the links in the index, return those links as strings.
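    # For instance, with hypothetical input:
    #   parse_index('<a class="headline" href="/a1">One</a>', "//a[@class='headline']")
    #   # => ["/a1"]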
    def parse_index(text, selector, selector_method=:xpath)
      Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
    end

    # Returns the concatenated output of each member of a paginated index,
    # e.g. a site listing links with 2+ pages.
    def get_index_pages(url, index)
      resp = self.get_page(url, @index_debug)
      if !resp.empty?
        next_url = self.next_index_page_url(url, index + 1)
        unless next_url == url
          next_resp = self.get_index_pages(next_url, index + 1).to_s
          resp += next_resp
        end
      end
      resp
    end

    # Returns the concatenated output of each member of a paginated instance,
    # e.g. a news article with 2 pages.
    def get_instance(url, index=0)
      resp = self.get_page(url, @debug)
      if !resp.empty?
        next_url = self.next_instance_page_url(url, index + 1)
        unless next_url == url
          next_resp = self.get_instance(next_url, index + 1).to_s
          resp += next_resp
        end
      end
      resp
    end

    # Just a helper for +scrape+.
    def scrape_from_list(list, blk)
      puts "Scraping #{list.size} instances" if @verbose
      list.each_with_index.map do |instance_url, index|
        blk.call(get_instance(instance_url), instance_url, index)
      end
    end

    # It's often useful to have this slug method for (almost certainly)
    # uniquely identifying pages.
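    # e.g. +slug("http://whatever.com/article/the-jungle.html?page=2")+
    # returns "the-jungle".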
    def slug(url)
      url.split("/")[-1].gsub(/\?.*/, "").gsub(/\.html.*/, "")
    end

  end
end
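
A minimal usage sketch of Upton::Scraper as defined above; the site URL,
selector, sleep time, and block body are hypothetical, not part of the gem:

require 'upton'

u = Upton::Scraper.new("http://www.example.com/news",
                       "//a[@class='article-link']")
u.verbose = true
u.nice_sleep_time = 5
u.scrape do |article_html, url, index|
  # e.g. print each article's headline from its instance page
  puts Nokogiri::HTML(article_html).css("h1").text
end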
|