upton 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +8 -8
  2. data/lib/upton.rb +47 -24
  3. data/lib/utils.rb +15 -1
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NWMxNTc5MGRjODQzYjNmNWVkODVkNDg1NmM0ODJkMmI1YWU1YmZlYw==
4
+ ZWYwNGQyM2ZkYWVhYWViZmU1ZDczNTA4OGZjMzhhN2FkMWIwNzc5Nw==
5
5
  data.tar.gz: !binary |-
6
- MGI1YTQ1MjM5OGMwZTU2NGVjYWE4OWY5NzY5YjE3OGE4Y2E5ZjdhMQ==
6
+ MmVmMjdkZGFlYzBjZGJjNjNiOTg4MDJlNjhmZDA2YzI4MzdlY2E5Mw==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- NTUwZDkyZDcxMDJiMjBhYzE3NTZjNjQ3NGRiNTdjZjFlOGY3OGI1MTZkZjk3
10
- ZmQyYzk0YTYzZWI4NzAzMWUyZmNkNmUyZmMxNWI2ZGU2Zjg4NGM2MmY1MmJk
11
- NWU0MmRjN2EyMjA3MjM1NWMyZTE0YTNjMDc0MDEyZGU3NGM4ZmY=
9
+ YjgwMjAyYTM0YTNhODA5ZmExNjBkYzcwNDgzYjdjM2M2ZGJiODM0NmUzMDE0
10
+ YmFlNjhmZjNhMDgwMTRkMWFmMWYzODgxMzI1MTZhZWNmYTY0MTEzN2QzYzE4
11
+ MWM3YjE5YzAxNmU5NjViZGQyNjZkYzBkZjgyZTEzMWUzNjc3N2U=
12
12
  data.tar.gz: !binary |-
13
- ODNjYzc3ODEzYzM5ZjA4OWI3NDA3YmRkODYwMWI0NTk4OTY1NzI3ZGM2OWMx
14
- ZmUwZDk3ZTA0MThmNDFkOTM3NjRlMTA0MTM5MTk5ODlmYzc3MzFkM2IyZmY0
15
- YWFmNmEzNzRjOGFiZDY2Njc5ZDEzMzQzMjgwZTZhYjIyYmYxMzQ=
13
+ ZDA2MGI0YzllM2UyNmFhYTljNjZiNDM3NmVhNWJhNjljOGQwNWJlZTViZDQw
14
+ ZmZkZjBiNjlhNDQ3MzFlNjRkZWE5MzZjMjViMjQ2N2QxYWRjYzg1MGJmMmFk
15
+ ZDE1NmJhNjliNmViZDE5ZDY1NjRiNDg3OTIwYTU5NTA0OTJhM2U=
data/lib/upton.rb CHANGED
@@ -1,17 +1,5 @@
1
1
  # encoding: UTF-8
2
2
 
3
- # *Upton* is a framework for easy web-scraping with a useful debug mode
4
- # that doesn't hammer your target's servers. It does the repetitive parts of
5
- # writing scrapers, so you only have to write the unique parts for each site.
6
- #
7
- # Upton operates on the theory that, for most scraping projects, you need to
8
- # scrape two types of pages:
9
- #
10
- # 1. Index pages, which list instance pages. For example, a job search
11
- # site's search page or a newspaper's homepage.
12
- # 2. Instance pages, which represent the goal of your scraping, e.g.
13
- # job listings or news articles.
14
- #
15
3
 
16
4
  require 'nokogiri'
17
5
  require 'uri'
@@ -19,19 +7,37 @@ require 'restclient'
19
7
  require './lib/utils'
20
8
 
21
9
  module Upton
22
-
23
- # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
24
- # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
10
+ ##
11
+ # *Upton* is a framework for easy web-scraping with a useful debug mode
12
+ # that doesn't hammer your target's servers. It does the repetitive parts of
13
+ # writing scrapers, so you only have to write the unique parts for each site.
14
+ #
15
+ # Upton operates on the theory that, for most scraping projects, you need to
16
+ # scrape two types of pages:
17
+ #
18
+ # 1. Index pages, which list instance pages. For example, a job search
19
+ # site's search page or a newspaper's homepage.
20
+ # 2. Instance pages, which represent the goal of your scraping, e.g.
21
+ # job listings or news articles.
22
+ #
23
+ # Upton::Scraper can be used as-is for basic use-cases by:
24
+ # 1. specifying the pages to be scraped in `new` as an index page
25
+ # or as an Array of URLs.
26
+ # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
27
+ # block from Upton::Utils.
28
+ # For more complicated cases, subclass Upton::Scraper
29
+ # e.g. +MyScraper < Upton::Scraper+ and override various methods.
30
+ ##
25
31
  class Scraper
26
32
 
27
33
  attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
28
34
 
29
- # == Basic use-case methods.
30
-
35
+ ##
31
36
  # This is the main user-facing method for a basic scraper.
32
37
  # Call +scrape+ with a block; this block will be called on
33
38
  # the text of each instance page, (and optionally, its URL and its index
34
39
  # in the list of instance URLs returned by +get_index+).
40
+ ##
35
41
  def scrape &blk
36
42
  unless self.url_array
37
43
  self.url_array = self.get_index
@@ -39,6 +45,7 @@ module Upton
39
45
  self.scrape_from_list(self.url_array, blk)
40
46
  end
41
47
 
48
+ ##
42
49
  # +index_url_or_array+: A list of string URLs, OR
43
50
  # the URL of the page containing the list of instances.
44
51
  # +selector+: The XPath or CSS that specifies the anchor elements within
@@ -49,6 +56,7 @@ module Upton
49
56
  # do not need to set them.
50
57
  # If you don't specify a selector, the first argument will be treated as a
51
58
  # list of URLs.
59
+ ##
52
60
  def initialize(index_url_or_array, selector="", selector_method=:xpath)
53
61
 
54
62
  #if first arg is a valid URL, do already-written stuff;
@@ -92,9 +100,7 @@ module Upton
92
100
  end
93
101
  end
94
102
 
95
-
96
- # == Configuration Options
97
-
103
+ ##
98
104
  # If instance pages are paginated, <b>you must override</b>
99
105
  # this method to return the next URL, given the current URL and its index.
100
106
  #
@@ -104,10 +110,12 @@ module Upton
104
110
  #
105
111
  # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
106
112
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
113
+ ##
107
114
  def next_instance_page_url(url, index)
108
115
  ""
109
116
  end
110
117
 
118
+ ##
111
119
  # If index pages are paginated, <b>you must override</b>
112
120
  # this method to return the next URL, given the current URL and its index.
113
121
  #
@@ -117,10 +125,14 @@ module Upton
117
125
  #
118
126
  # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
119
127
  # ought to return "http://whatever.com/articles?page=2"
128
+ ##
120
129
  def next_index_page_url(url, index)
121
130
  ""
122
131
  end
123
132
 
133
+ ##
134
+ # Writes the scraped result to a CSV at the given filename.
135
+ ##
124
136
  def scrape_to_csv filename, &blk
125
137
  require 'csv'
126
138
  unless self.url_array
@@ -133,8 +145,11 @@ module Upton
133
145
 
134
146
  protected
135
147
 
136
-
137
- #Handles getting pages with RestClient or getting them from the local stash
148
+ ##
149
+ # Handles getting pages with RestClient or getting them from the local stash.
150
+ #
151
+ # Uses a kludge (because rest-client is outdated) to handle encoding.
152
+ ##
138
153
  def get_page(url, stash=false)
139
154
  return "" if url.empty?
140
155
 
@@ -179,21 +194,27 @@ module Upton
179
194
  resp
180
195
  end
181
196
 
197
+ ##
182
198
  # Return a list of URLs for the instances you want to scrape.
183
199
  # This can optionally be overridden if, for example, the list of instances
184
200
  # comes from an API.
201
+ ##
185
202
  def get_index
186
203
  parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
187
204
  end
188
205
 
189
- # Using the XPath or CSS selector and selector_method that uniquely locates
190
- # the links in the index, return those links as strings.
206
+ ##
207
+ # Using the XPath expression or CSS selector and selector_method that
208
+ # uniquely identifies the links in the index, return those links as strings.
209
+ ##
191
210
  def parse_index(text, selector, selector_method=:xpath)
192
211
  Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
193
212
  end
194
213
 
214
+ ##
195
215
  # Returns the concatenated output of each member of a paginated index,
196
216
  # e.g. a site listing links with 2+ pages.
217
+ ##
197
218
  def get_index_pages(url, index)
198
219
  resp = self.get_page(url, @index_debug)
199
220
  if !resp.empty?
@@ -206,12 +227,14 @@ module Upton
206
227
  resp
207
228
  end
208
229
 
230
+ ##
209
231
  # Returns the article at `url`.
210
232
  #
211
233
  # If the page is stashed, returns that, otherwise, fetches it from the web.
212
234
  #
213
235
  # If an instance is paginated, returns the concatenated output of each
214
236
  # page, e.g. if a news article has two pages.
237
+ ##
215
238
  def get_instance(url, index=0)
216
239
  resp = self.get_page(url, @debug)
217
240
  if !resp.empty?
data/lib/utils.rb CHANGED
@@ -1,8 +1,19 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module Upton
4
+
5
+ ##
6
+ # This class contains a collection of helpers for Upton
7
+ #
8
+ # Each method returns a Proc that (with an & ) can be used as the final
9
+ # argument to Upton's `scrape` and `scrape_to_csv`
10
+ ##
4
11
  module Utils
5
- #instance_html, instance_url, index
12
+
13
+ ##
14
+ # Scrapes an HTML <table> element into an Array of Arrays. The header, if
15
+ # present, is returned as the first row.
16
+ ##
6
17
  def self.table(table_selector, selector_method=:xpath)
7
18
  require 'csv'
8
19
  return Proc.new do |instance_html|
@@ -16,6 +27,9 @@ module Upton
16
27
  end
17
28
  end
18
29
 
30
+ ##
31
+ # Scrapes any set of HTML elements into an Array.
32
+ ##
19
33
  def self.list(list_selector, selector_method=:xpath)
20
34
  require 'csv'
21
35
  return Proc.new do |instance_html|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-17 00:00:00.000000000 Z
11
+ date: 2013-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack