upton 0.2.3 → 0.2.4

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (4)
  1. checksums.yaml +8 -8
  2. data/lib/upton.rb +47 -24
  3. data/lib/utils.rb +15 -1
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- NWMxNTc5MGRjODQzYjNmNWVkODVkNDg1NmM0ODJkMmI1YWU1YmZlYw==
+ ZWYwNGQyM2ZkYWVhYWViZmU1ZDczNTA4OGZjMzhhN2FkMWIwNzc5Nw==
  data.tar.gz: !binary |-
- MGI1YTQ1MjM5OGMwZTU2NGVjYWE4OWY5NzY5YjE3OGE4Y2E5ZjdhMQ==
+ MmVmMjdkZGFlYzBjZGJjNjNiOTg4MDJlNjhmZDA2YzI4MzdlY2E5Mw==
  !binary "U0hBNTEy":
  metadata.gz: !binary |-
- NTUwZDkyZDcxMDJiMjBhYzE3NTZjNjQ3NGRiNTdjZjFlOGY3OGI1MTZkZjk3
- ZmQyYzk0YTYzZWI4NzAzMWUyZmNkNmUyZmMxNWI2ZGU2Zjg4NGM2MmY1MmJk
- NWU0MmRjN2EyMjA3MjM1NWMyZTE0YTNjMDc0MDEyZGU3NGM4ZmY=
+ YjgwMjAyYTM0YTNhODA5ZmExNjBkYzcwNDgzYjdjM2M2ZGJiODM0NmUzMDE0
+ YmFlNjhmZjNhMDgwMTRkMWFmMWYzODgxMzI1MTZhZWNmYTY0MTEzN2QzYzE4
+ MWM3YjE5YzAxNmU5NjViZGQyNjZkYzBkZjgyZTEzMWUzNjc3N2U=
  data.tar.gz: !binary |-
- ODNjYzc3ODEzYzM5ZjA4OWI3NDA3YmRkODYwMWI0NTk4OTY1NzI3ZGM2OWMx
- ZmUwZDk3ZTA0MThmNDFkOTM3NjRlMTA0MTM5MTk5ODlmYzc3MzFkM2IyZmY0
- YWFmNmEzNzRjOGFiZDY2Njc5ZDEzMzQzMjgwZTZhYjIyYmYxMzQ=
+ ZDA2MGI0YzllM2UyNmFhYTljNjZiNDM3NmVhNWJhNjljOGQwNWJlZTViZDQw
+ ZmZkZjBiNjlhNDQ3MzFlNjRkZWE5MzZjMjViMjQ2N2QxYWRjYzg1MGJmMmFk
+ ZDE1NmJhNjliNmViZDE5ZDY1NjRiNDg3OTIwYTU5NTA0OTJhM2U=
data/lib/upton.rb CHANGED
@@ -1,17 +1,5 @@
  # encoding: UTF-8

- # *Upton* is a framework for easy web-scraping with a useful debug mode
- # that doesn't hammer your target's servers. It does the repetitive parts of
- # writing scrapers, so you only have to write the unique parts for each site.
- #
- # Upton operates on the theory that, for most scraping projects, you need to
- # scrape two types of pages:
- #
- # 1. Index pages, which list instance pages. For example, a job search
- # site's search page or a newspaper's homepage.
- # 2. Instance pages, which represent the goal of your scraping, e.g.
- # job listings or news articles.
- #

  require 'nokogiri'
  require 'uri'
@@ -19,19 +7,37 @@ require 'restclient'
  require './lib/utils'

  module Upton
-
- # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
- # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
+ ##
+ # *Upton* is a framework for easy web-scraping with a useful debug mode
+ # that doesn't hammer your target's servers. It does the repetitive parts of
+ # writing scrapers, so you only have to write the unique parts for each site.
+ #
+ # Upton operates on the theory that, for most scraping projects, you need to
+ # scrape two types of pages:
+ #
+ # 1. Index pages, which list instance pages. For example, a job search
+ # site's search page or a newspaper's homepage.
+ # 2. Instance pages, which represent the goal of your scraping, e.g.
+ # job listings or news articles.
+ #
+ # Upton::Scraper can be used as-is for basic use-cases by:
+ # 1. specifying the pages to be scraped in `new` as an index page
+ # or as an Array of URLs.
+ # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
+ # block from Upton::Utils.
+ # For more complicated cases, subclass Upton::Scraper,
+ # e.g. +MyScraper < Upton::Scraper+, and override various methods.
+ ##
  class Scraper

  attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array

- # == Basic use-case methods.
-
+ ##
  # This is the main user-facing method for a basic scraper.
  # Call +scrape+ with a block; this block will be called on
  # the text of each instance page, (and optionally, its URL and its index
  # in the list of instance URLs returned by +get_index+).
+ ##
  def scrape &blk
  unless self.url_array
  self.url_array = self.get_index
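A minimal sketch of the basic use-case described in the doc comment above, assuming the gem is required as 'upton'; the index URL and XPath selector are hypothetical placeholders. The block arguments follow the description of +scrape+: the instance page's HTML, and optionally its URL and its index.

require 'upton'

# Point the scraper at an index page plus an XPath expression that locates
# the links to the instance pages. Both values here are hypothetical.
scraper = Upton::Scraper.new("http://www.example.com/articles",
                             "//div[@class='headline']/a",
                             :xpath)

# The block receives each instance page's HTML, and optionally its URL and
# its position in the list returned by get_index.
scraper.scrape do |instance_html, url, index|
  puts "#{index}: #{url} (#{instance_html.length} bytes)"
end

# Alternatively, skip the index page and pass an Array of URLs directly:
# Upton::Scraper.new(["http://www.example.com/a", "http://www.example.com/b"])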
@@ -39,6 +45,7 @@ module Upton
  self.scrape_from_list(self.url_array, blk)
  end

+ ##
  # +index_url_or_array+: A list of string URLs, OR
  # the URL of the page containing the list of instances.
  # +selector+: The XPath or CSS that specifies the anchor elements within
@@ -49,6 +56,7 @@ module Upton
  # do not need to set them.
  # If you don't specify a selector, the first argument will be treated as a
  # list of URLs.
+ ##
  def initialize(index_url_or_array, selector="", selector_method=:xpath)

  #if first arg is a valid URL, do already-written stuff;
@@ -92,9 +100,7 @@ module Upton
  end
  end

-
- # == Configuration Options
-
+ ##
  # If instance pages are paginated, <b>you must override</b>
  # this method to return the next URL, given the current URL and its index.
  #
@@ -104,10 +110,12 @@ module Upton
  #
  # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
+ ##
  def next_instance_page_url(url, index)
  ""
  end

+ ##
  # If index pages are paginated, <b>you must override</b>
  # this method to return the next URL, given the current URL and its index.
  #
@@ -117,10 +125,14 @@ module Upton
  #
  # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
  # ought to return "http://whatever.com/articles?page=2"
+ ##
  def next_index_page_url(url, index)
  ""
  end

+ ##
+ # Writes the scraped result to a CSV at the given filename.
+ ##
  def scrape_to_csv filename, &blk
  require 'csv'
  unless self.url_array
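The two pagination hooks above are meant to be overridden in a subclass. A sketch of that pattern, assuming a hypothetical site that paginates with a ?page=N query string (the class name, URLs, and selectors are illustrative):

class MySiteScraper < Upton::Scraper
  # Follow ?page=N pagination on index pages; returning "" (the default
  # shown above) stops pagination.
  def next_index_page_url(url, pagination_index)
    return "" if pagination_index > 5   # arbitrary cap for this sketch
    "#{url.split('?').first}?page=#{pagination_index}"
  end

  # Same pattern for paginated instance pages.
  def next_instance_page_url(url, pagination_index)
    return "" if pagination_index > 3
    "#{url.split('?').first}?page=#{pagination_index}"
  end
end

scraper = MySiteScraper.new("http://www.example.com/articles?page=1",
                            "//a[@class='article-link']")
scraper.scrape { |html| puts Nokogiri::HTML(html).css("title").text }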
@@ -133,8 +145,11 @@ module Upton

  protected

-
- #Handles getting pages with RestClient or getting them from the local stash
+ ##
+ # Handles getting pages with RestClient or getting them from the local stash.
+ #
+ # Uses a kludge (because rest-client is outdated) to handle encoding.
+ ##
  def get_page(url, stash=false)
  return "" if url.empty?

@@ -179,21 +194,27 @@ module Upton
  resp
  end

+ ##
  # Return a list of URLs for the instances you want to scrape.
  # This can optionally be overridden if, for example, the list of instances
  # comes from an API.
+ ##
  def get_index
  parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
  end

- # Using the XPath or CSS selector and selector_method that uniquely locates
- # the links in the index, return those links as strings.
+ ##
+ # Using the XPath expression or CSS selector and selector_method that
+ # uniquely identifies the links in the index, return those links as strings.
+ ##
  def parse_index(text, selector, selector_method=:xpath)
  Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
  end

+ ##
  # Returns the concatenated output of each member of a paginated index,
  # e.g. a site listing links with 2+ pages.
+ ##
  def get_index_pages(url, index)
  resp = self.get_page(url, @index_debug)
  if !resp.empty?
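As the comment on get_index above notes, it can be overridden when the list of instance URLs comes from somewhere other than an HTML index page, e.g. an API. A minimal sketch, with a hypothetical JSON endpoint and response shape:

require 'json'
require 'restclient'

class ApiBackedScraper < Upton::Scraper
  # Return the Array of instance-page URLs directly instead of parsing an
  # index page. The endpoint and JSON structure here are hypothetical.
  def get_index
    articles = JSON.parse(RestClient.get("http://api.example.com/articles.json"))
    articles.map { |article| article["url"] }
  end
end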
@@ -206,12 +227,14 @@ module Upton
  resp
  end

+ ##
  # Returns the article at `url`.
  #
  # If the page is stashed, returns that, otherwise, fetches it from the web.
  #
  # If an instance is paginated, returns the concatenated output of each
  # page, e.g. if a news article has two pages.
+ ##
  def get_instance(url, index=0)
  resp = self.get_page(url, @debug)
  if !resp.empty?
data/lib/utils.rb CHANGED
@@ -1,8 +1,19 @@
  # encoding: UTF-8

  module Upton
+
+ ##
+ # This module contains a collection of helpers for Upton.
+ #
+ # Each method returns a Proc that (with an &) can be used as the final
+ # argument to Upton's `scrape` and `scrape_to_csv`.
+ ##
  module Utils
- #instance_html, instance_url, index
+
+ ##
+ # Scrapes an HTML <table> element into an Array of Arrays. The header, if
+ # present, is returned as the first row.
+ ##
  def self.table(table_selector, selector_method=:xpath)
  require 'csv'
  return Proc.new do |instance_html|
@@ -16,6 +27,9 @@ module Upton
  end
  end

+ ##
+ # Scrapes any set of HTML elements into an Array.
+ ##
  def self.list(list_selector, selector_method=:xpath)
  require 'csv'
  return Proc.new do |instance_html|
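Because each Utils method returns a Proc, it can be passed with & as the block to scrape or scrape_to_csv, as the module comment above describes. A sketch with hypothetical index URL, selectors, and filename:

scraper = Upton::Scraper.new("http://www.example.com/reports",
                             "//a[@class='report-link']")

# Write each instance page's <table id="data"> to rows of a CSV.
scraper.scrape_to_csv("report_tables.csv",
                      &Upton::Utils.table("//table[@id='data']", :xpath))

# Or gather a list of matching elements from each instance page.
scraper.scrape(&Upton::Utils.list("//li[@class='item']", :xpath))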
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: upton
  version: !ruby/object:Gem::Version
- version: 0.2.3
+ version: 0.2.4
  platform: ruby
  authors:
  - Jeremy B. Merrill
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-07-17 00:00:00.000000000 Z
+ date: 2013-07-19 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rack