upton 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA512:
3
+ data.tar.gz: 0c50b13aca2d3f11f8ebbef72420dfc3276169fb0c1eafadd437ae7d1a83f2d9467511ddc8e529321be677e26bcbd1af02c6c2c615d14e1ea09de0cc9b6d9762
4
+ metadata.gz: e795b2a9ffe25373b419ccded0db1a7786f7b4789dbf8fc7da28cd53764232ddcad30f398b569e78ae5404690d96661577ea84340b4cf4a4e40798dbd82779e0
5
+ SHA1:
6
+ data.tar.gz: 13e674057073d5cd0fc1dff400450717a0613bc4
7
+ metadata.gz: 1629ba3d63b260994bcebf99f1f5d79a499bb707
data/lib/upton.rb ADDED
@@ -0,0 +1,186 @@
1
# encoding: UTF-8

# *Upton* is a framework for easy web-scraping with a useful debug mode
# that doesn't hammer your target's servers. It does the repetitive parts of
# writing scrapers, so you only have to write the unique parts for each site.
#
# Upton operates on the theory that, for most scraping projects, you need to
# scrape two types of pages:
#
# 1. Index pages, which list instance pages. For example, a job search
#     site's search page or a newspaper's homepage.
# 2. Instance pages, which represent the goal of your scraping, e.g.
#     job listings or news articles.

module Upton

  # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
  class Scraper

    attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder

    # == Basic use-case methods.

    # This is the main user-facing method for a basic scraper.
    # Call +scrape+ with a block; this block will be called on
    # the text of each instance page, (and optionally, its URL and its index
    # in the list of instance URLs returned by +get_index+).
    def scrape &blk
      self.scrape_from_list(self.get_index, blk)
    end

    # == Configuration Options
    #
    # +index_url+: The URL of the page containing the list of instances.
    # +selector+: The XPath or CSS that specifies the anchor elements within
    # the page.
    # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
    #
    # These options are a shortcut. If you plan to override +get_index+, you
    # do not need to set them.
    def initialize(index_url="", selector="", selector_method=:xpath)
      @index_url = index_url
      @index_selector = selector
      @index_selector_method = selector_method

      # If true, then Upton prints information about when it gets
      # files from the internet and when it gets them from its stash.
      @verbose = false

      # If true, then Upton fetches each instance page only once;
      # future requests for that file are responded to with the locally stashed
      # version.
      # You may want to set @debug to false for production (but maybe not).
      # You can also control stashing behavior on a per-call basis with the
      # optional second argument to get_page, if, for instance, you want to
      # stash certain instance pages, e.g. based on their modification date.
      @debug = true
      # Index debug does the same, but for index pages.
      @index_debug = false

      # In order to not hammer servers, Upton waits for, by default, 30
      # seconds between requests to the remote server.
      @nice_sleep_time = 30 #seconds

      # Folder name for stashes, if you want them to be stored somewhere else,
      # e.g. under /tmp.
      @stash_folder = "stashes"
      # Dir.exist? — not Dir.exists?, which was deprecated and removed in
      # Ruby 3.2 — so this keeps working on modern Rubies.
      Dir.mkdir(@stash_folder) unless Dir.exist?(@stash_folder)
    end



    # If instance pages are paginated, <b>you must override</b>
    # this method to return the next URL, given the current URL and its index.
    #
    # If instance pages aren't paginated, there's no need to override this.
    #
    # Return URLs that are empty strings are ignored (and recursion stops.)
    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
    def next_instance_page_url(url, index)
      ""
    end

    # If index pages are paginated, <b>you must override</b>
    # this method to return the next URL, given the current URL and its index.
    #
    # If index pages aren't paginated, there's no need to override this.
    #
    # Return URLs that are empty strings are ignored (and recursion stops.)
    # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
    # ought to return "http://whatever.com/articles?page=2"
    def next_index_page_url(url, index)
      ""
    end


    protected


    # Handles getting pages with RestClient or getting them from the local
    # stash.
    #
    # +url+: the remote resource; an empty URL short-circuits to "".
    # +stash+: when true, the response body is written to @stash_folder on
    # first fetch and reused on every later call for the same URL.
    #
    # Returns the response body as a String ("" on 404/500).
    def get_page(url, stash=false)
      return "" if url.empty?

      # The filename for each stashed version is a cleaned version of the URL;
      # see +stash_file_path+.
      if stash && File.exist?(stash_file_path(url))
        puts "usin' a stashed copy of " + url if @verbose
        # File.read closes the file for us; the old open(...).read leaked the
        # file handle until GC.
        resp = File.read(stash_file_path(url), mode: 'r:UTF-8')
      else
        begin
          puts "getting " + url if @verbose
          # Be polite: wait between remote requests so we don't hammer servers.
          sleep @nice_sleep_time
          resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
        rescue RestClient::ResourceNotFound
          resp = ""
        rescue RestClient::InternalServerError
          resp = ""
        end
        if stash
          puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
          File.open(stash_file_path(url), 'w:UTF-8') do |f|
            f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace))
          end
        end
      end
      resp
    end

    # Maps a URL to the path of its stashed copy: everything except ASCII
    # letters, digits and "-" is stripped from the URL to form the filename.
    # NOTE(review): distinct URLs that differ only in stripped characters
    # (e.g. "?" vs "/") collide on the same stash file.
    def stash_file_path(url)
      File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, ""))
    end

    # Return a list of URLs for the instances you want to scrape.
    # This can optionally be overridden if, for example, the list of instances
    # comes from an API.
    def get_index
      parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
    end

    # Using the XPath or CSS selector and selector_method that uniquely locates
    # the links in the index, return those links as strings (href values).
    def parse_index(text, selector, selector_method=:xpath)
      Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
    end

    # Returns the concatenated output of each member of a paginated index,
    # e.g. a site listing links with 2+ pages. Recursion stops when
    # +next_index_page_url+ yields the same URL, or an empty URL (which
    # +get_page+ turns into "").
    def get_index_pages(url, index)
      resp = self.get_page(url, @index_debug)
      if !resp.empty?
        next_url = self.next_index_page_url(url, index + 1)
        unless next_url == url
          next_resp = self.get_index_pages(next_url, index + 1).to_s
          resp += next_resp
        end
      end
      resp
    end

    # Returns the concatenated output of each member of a paginated instance,
    # e.g. a news article with 2 pages. Same termination rules as
    # +get_index_pages+, driven by +next_instance_page_url+.
    def get_instance(url, index=0)
      resp = self.get_page(url, @debug)
      if !resp.empty?
        next_url = self.next_instance_page_url(url, index + 1)
        unless next_url == url
          next_resp = self.get_instance(next_url, index + 1).to_s
          resp += next_resp
        end
      end
      resp
    end

    # Just a helper for +scrape+: calls +blk+ with (page text, url, index)
    # for each instance URL in +list+, returning the mapped results.
    def scrape_from_list(list, blk)
      puts "Scraping #{list.size} instances" if @verbose
      list.each_with_index.map do |instance_url, index|
        blk.call(get_instance(instance_url), instance_url, index)
      end
    end

    # It's often useful to have this slug method for uniquely (almost
    # certainly) identifying pages: takes the URL's last path segment and
    # strips the query string and a trailing ".html..." extension.
    def slug(url)
      # \.html with the dot escaped: the original /.html.*/ let the dot match
      # any character, so e.g. "ahtml-post" was truncated to "".
      url.split("/")[-1].gsub(/\?.*/, "").gsub(/\.html.*/, "")
    end

  end
end