upton 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA512:
3
+ data.tar.gz: 0c50b13aca2d3f11f8ebbef72420dfc3276169fb0c1eafadd437ae7d1a83f2d9467511ddc8e529321be677e26bcbd1af02c6c2c615d14e1ea09de0cc9b6d9762
4
+ metadata.gz: e795b2a9ffe25373b419ccded0db1a7786f7b4789dbf8fc7da28cd53764232ddcad30f398b569e78ae5404690d96661577ea84340b4cf4a4e40798dbd82779e0
5
+ SHA1:
6
+ data.tar.gz: 13e674057073d5cd0fc1dff400450717a0613bc4
7
+ metadata.gz: 1629ba3d63b260994bcebf99f1f5d79a499bb707
data/lib/upton.rb ADDED
@@ -0,0 +1,186 @@
1
+ # encoding: UTF-8
2
+
3
+ # *Upton* is a framework for easy web-scraping with a useful debug mode
4
+ # that doesn't hammer your target's servers. It does the repetitive parts of
5
+ # writing scrapers, so you only have to write the unique parts for each site.
6
+ #
7
+ # Upton operates on the theory that, for most scraping projects, you need to
8
+ # scrape two types of pages:
9
+ #
10
+ # 1. Index pages, which list instance pages. For example, a job search
11
+ # site's search page or a newspaper's homepage.
12
+ # 2. Instance pages, which represent the goal of your scraping, e.g.
13
+ # job listings or news articles.
14
+
15
module Upton

  # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
  #
  # NOTE(review): this class calls RestClient and Nokogiri but does not
  # require them itself — the host application must require those gems.
  class Scraper

    attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder

    # == Basic use-case methods.

    # This is the main user-facing method for a basic scraper.
    # Call +scrape+ with a block; this block will be called on
    # the text of each instance page, (and optionally, its URL and its index
    # in the list of instance URLs returned by +get_index+).
    def scrape &blk
      self.scrape_from_list(self.get_index, blk)
    end

    # == Configuration Options

    # +index_url+: The URL of the page containing the list of instances.
    # +selector+: The XPath or CSS that specifies the anchor elements within
    # the page.
    # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
    #
    # These options are a shortcut. If you plan to override +get_index+, you
    # do not need to set them.
    def initialize(index_url="", selector="", selector_method=:xpath)
      @index_url = index_url
      @index_selector = selector
      @index_selector_method = selector_method

      # If true, then Upton prints information about when it gets
      # files from the internet and when it gets them from its stash.
      @verbose = false

      # If true, then Upton fetches each instance page only once;
      # future requests for that file are responded to with the locally stashed
      # version.
      # You may want to set @debug to false for production (but maybe not).
      # You can also control stashing behavior on a per-call basis with the
      # optional second argument to get_page, if, for instance, you want to
      # stash certain instance pages, e.g. based on their modification date.
      @debug = true
      # Index debug does the same, but for index pages.
      @index_debug = false

      # In order to not hammer servers, Upton waits for, by default, 30
      # seconds between requests to the remote server.
      @nice_sleep_time = 30 #seconds

      # Folder name for stashes, if you want them to be stored somewhere else,
      # e.g. under /tmp.
      @stash_folder = "stashes"
      # Dir.exists? was a deprecated alias removed in Ruby 3.2; Dir.exist? is
      # the supported spelling.
      Dir.mkdir(@stash_folder) unless Dir.exist?(@stash_folder)
    end


    # If instance pages are paginated, <b>you must override</b>
    # this method to return the next URL, given the current URL and its index.
    #
    # If instance pages aren't paginated, there's no need to override this.
    #
    # Returned URLs that are empty strings are ignored (and recursion stops).
    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
    def next_instance_page_url(url, index)
      ""
    end

    # If index pages are paginated, <b>you must override</b>
    # this method to return the next URL, given the current URL and its index.
    #
    # If index pages aren't paginated, there's no need to override this.
    #
    # Returned URLs that are empty strings are ignored (and recursion stops).
    # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
    # ought to return "http://whatever.com/articles?page=2"
    def next_index_page_url(url, index)
      ""
    end


    protected


    # Maps a URL to its on-disk stash path: the URL with every character
    # outside [A-Za-z0-9-] stripped, joined under @stash_folder.
    # NOTE(review): distinct URLs can collide after stripping — acceptable
    # for the stated "almost certainly unique" use-case, but worth knowing.
    def stash_path(url)
      File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, ""))
    end

    # Handles getting pages with RestClient or getting them from the local
    # stash. Returns "" for an empty URL or on 404/500 responses.
    def get_page(url, stash=false)
      return "" if url.empty?

      # The filename for each stashed version is a cleaned version of the URL.
      if stash && File.exist?(stash_path(url))
        puts "usin' a stashed copy of " + url if @verbose
        # File.read closes the handle; the old Kernel#open(...).read leaked it.
        resp = File.read(stash_path(url), mode: 'r:UTF-8')
      else
        begin
          puts "getting " + url if @verbose
          # Be nice to the remote server: wait between requests.
          sleep @nice_sleep_time
          resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
        rescue RestClient::ResourceNotFound
          resp = ""
        rescue RestClient::InternalServerError
          resp = ""
        end
        if stash
          puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
          # Normalize to UTF-8, replacing any bytes that can't be represented.
          File.open(stash_path(url), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ))}
        end
      end
      resp
    end

    # Return a list of URLs for the instances you want to scrape.
    # This can optionally be overridden if, for example, the list of instances
    # comes from an API.
    def get_index
      parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
    end

    # Using the XPath or CSS selector and selector_method that uniquely locates
    # the links in the index, return those links as strings.
    def parse_index(text, selector, selector_method=:xpath)
      Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
    end

    # Returns the concatenated output of each member of a paginated index,
    # e.g. a site listing links with 2+ pages. Recursion stops when
    # +next_index_page_url+ returns the current URL (or "", since
    # +get_page("")+ yields an empty response).
    def get_index_pages(url, index)
      resp = self.get_page(url, @index_debug)
      if !resp.empty?
        next_url = self.next_index_page_url(url, index + 1)
        unless next_url == url
          next_resp = self.get_index_pages(next_url, index + 1).to_s
          resp += next_resp
        end
      end
      resp
    end

    # Returns the concatenated output of each member of a paginated instance,
    # e.g. a news article with 2 pages. Recursion stops when
    # +next_instance_page_url+ returns the current URL (or "").
    def get_instance(url, index=0)
      resp = self.get_page(url, @debug)
      if !resp.empty?
        next_url = self.next_instance_page_url(url, index + 1)
        unless next_url == url
          next_resp = self.get_instance(next_url, index + 1).to_s
          resp += next_resp
        end
      end
      resp
    end

    # Just a helper for +scrape+: calls +blk+ with (page text, url, index)
    # for each instance URL in +list+, returning the mapped results.
    def scrape_from_list(list, blk)
      puts "Scraping #{list.size} instances" if @verbose
      list.each_with_index.map do |instance_url, index|
        blk.call(get_instance(instance_url), instance_url, index)
      end
    end

    # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
    def slug(url)
      url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
    end

  end
end