upton 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +8 -8
  2. data/lib/upton.rb +47 -24
  3. data/lib/utils.rb +15 -1
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NWMxNTc5MGRjODQzYjNmNWVkODVkNDg1NmM0ODJkMmI1YWU1YmZlYw==
4
+ ZWYwNGQyM2ZkYWVhYWViZmU1ZDczNTA4OGZjMzhhN2FkMWIwNzc5Nw==
5
5
  data.tar.gz: !binary |-
6
- MGI1YTQ1MjM5OGMwZTU2NGVjYWE4OWY5NzY5YjE3OGE4Y2E5ZjdhMQ==
6
+ MmVmMjdkZGFlYzBjZGJjNjNiOTg4MDJlNjhmZDA2YzI4MzdlY2E5Mw==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- NTUwZDkyZDcxMDJiMjBhYzE3NTZjNjQ3NGRiNTdjZjFlOGY3OGI1MTZkZjk3
10
- ZmQyYzk0YTYzZWI4NzAzMWUyZmNkNmUyZmMxNWI2ZGU2Zjg4NGM2MmY1MmJk
11
- NWU0MmRjN2EyMjA3MjM1NWMyZTE0YTNjMDc0MDEyZGU3NGM4ZmY=
9
+ YjgwMjAyYTM0YTNhODA5ZmExNjBkYzcwNDgzYjdjM2M2ZGJiODM0NmUzMDE0
10
+ YmFlNjhmZjNhMDgwMTRkMWFmMWYzODgxMzI1MTZhZWNmYTY0MTEzN2QzYzE4
11
+ MWM3YjE5YzAxNmU5NjViZGQyNjZkYzBkZjgyZTEzMWUzNjc3N2U=
12
12
  data.tar.gz: !binary |-
13
- ODNjYzc3ODEzYzM5ZjA4OWI3NDA3YmRkODYwMWI0NTk4OTY1NzI3ZGM2OWMx
14
- ZmUwZDk3ZTA0MThmNDFkOTM3NjRlMTA0MTM5MTk5ODlmYzc3MzFkM2IyZmY0
15
- YWFmNmEzNzRjOGFiZDY2Njc5ZDEzMzQzMjgwZTZhYjIyYmYxMzQ=
13
+ ZDA2MGI0YzllM2UyNmFhYTljNjZiNDM3NmVhNWJhNjljOGQwNWJlZTViZDQw
14
+ ZmZkZjBiNjlhNDQ3MzFlNjRkZWE5MzZjMjViMjQ2N2QxYWRjYzg1MGJmMmFk
15
+ ZDE1NmJhNjliNmViZDE5ZDY1NjRiNDg3OTIwYTU5NTA0OTJhM2U=
data/lib/upton.rb CHANGED
@@ -1,17 +1,5 @@
1
1
  # encoding: UTF-8
2
2
 
3
- # *Upton* is a framework for easy web-scraping with a useful debug mode
4
- # that doesn't hammer your target's servers. It does the repetitive parts of
5
- # writing scrapers, so you only have to write the unique parts for each site.
6
- #
7
- # Upton operates on the theory that, for most scraping projects, you need to
8
- # scrape two types of pages:
9
- #
10
- # 1. Index pages, which list instance pages. For example, a job search
11
- # site's search page or a newspaper's homepage.
12
- # 2. Instance pages, which represent the goal of your scraping, e.g.
13
- # job listings or news articles.
14
- #
15
3
 
16
4
  require 'nokogiri'
17
5
  require 'uri'
@@ -19,19 +7,37 @@ require 'restclient'
19
7
  require './lib/utils'
20
8
 
21
9
  module Upton
22
-
23
- # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
24
- # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
10
+ ##
11
+ # *Upton* is a framework for easy web-scraping with a useful debug mode
12
+ # that doesn't hammer your target's servers. It does the repetitive parts of
13
+ # writing scrapers, so you only have to write the unique parts for each site.
14
+ #
15
+ # Upton operates on the theory that, for most scraping projects, you need to
16
+ # scrape two types of pages:
17
+ #
18
+ # 1. Index pages, which list instance pages. For example, a job search
19
+ # site's search page or a newspaper's homepage.
20
+ # 2. Instance pages, which represent the goal of your scraping, e.g.
21
+ # job listings or news articles.
22
+ #
23
+ # Upton::Scraper can be used as-is for basic use-cases by:
24
+ # 1. specifying the pages to be scraped in `new` as an index page
25
+ # or as an Array of URLs.
26
+ # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
27
+ # block from Upton::Utils.
28
+ # For more complicated cases, subclass Upton::Scraper
29
+ # e.g. +MyScraper < Upton::Scraper+ and override various methods.
30
+ ##
25
31
  class Scraper
26
32
 
27
33
  attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
28
34
 
29
- # == Basic use-case methods.
30
-
35
+ ##
31
36
  # This is the main user-facing method for a basic scraper.
32
37
  # Call +scrape+ with a block; this block will be called on
33
38
  # the text of each instance page, (and optionally, its URL and its index
34
39
  # in the list of instance URLs returned by +get_index+).
40
+ ##
35
41
  def scrape &blk
36
42
  unless self.url_array
37
43
  self.url_array = self.get_index
@@ -39,6 +45,7 @@ module Upton
39
45
  self.scrape_from_list(self.url_array, blk)
40
46
  end
41
47
 
48
+ ##
42
49
  # +index_url_or_array+: A list of string URLs, OR
43
50
  # the URL of the page containing the list of instances.
44
51
  # +selector+: The XPath or CSS that specifies the anchor elements within
@@ -49,6 +56,7 @@ module Upton
49
56
  # do not need to set them.
50
57
  # If you don't specify a selector, the first argument will be treated as a
51
58
  # list of URLs.
59
+ ##
52
60
  def initialize(index_url_or_array, selector="", selector_method=:xpath)
53
61
 
54
62
  #if first arg is a valid URL, do already-written stuff;
@@ -92,9 +100,7 @@ module Upton
92
100
  end
93
101
  end
94
102
 
95
-
96
- # == Configuration Options
97
-
103
+ ##
98
104
  # If instance pages are paginated, <b>you must override</b>
99
105
  # this method to return the next URL, given the current URL and its index.
100
106
  #
@@ -104,10 +110,12 @@ module Upton
104
110
  #
105
111
  # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
106
112
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
113
+ ##
107
114
  def next_instance_page_url(url, index)
108
115
  ""
109
116
  end
110
117
 
118
+ ##
111
119
  # If index pages are paginated, <b>you must override</b>
112
120
  # this method to return the next URL, given the current URL and its index.
113
121
  #
@@ -117,10 +125,14 @@ module Upton
117
125
  #
118
126
  # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
119
127
  # ought to return "http://whatever.com/articles?page=2"
128
+ ##
120
129
  def next_index_page_url(url, index)
121
130
  ""
122
131
  end
123
132
 
133
+ ##
134
+ # Writes the scraped result to a CSV at the given filename.
135
+ ##
124
136
  def scrape_to_csv filename, &blk
125
137
  require 'csv'
126
138
  unless self.url_array
@@ -133,8 +145,11 @@ module Upton
133
145
 
134
146
  protected
135
147
 
136
-
137
- #Handles getting pages with RestClient or getting them from the local stash
148
+ ##
149
+ # Handles getting pages with RestClient or getting them from the local stash.
150
+ #
151
+ # Uses a kludge (because rest-client is outdated) to handle encoding.
152
+ ##
138
153
  def get_page(url, stash=false)
139
154
  return "" if url.empty?
140
155
 
@@ -179,21 +194,27 @@ module Upton
179
194
  resp
180
195
  end
181
196
 
197
+ ##
182
198
  # Return a list of URLs for the instances you want to scrape.
183
199
  # This can optionally be overridden if, for example, the list of instances
184
200
  # comes from an API.
201
+ ##
185
202
  def get_index
186
203
  parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
187
204
  end
188
205
 
189
- # Using the XPath or CSS selector and selector_method that uniquely locates
190
- # the links in the index, return those links as strings.
206
+ ##
207
+ # Using the XPath expression or CSS selector and selector_method that
208
+ # uniquely identifies the links in the index, return those links as strings.
209
+ ##
191
210
  def parse_index(text, selector, selector_method=:xpath)
192
211
  Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
193
212
  end
194
213
 
214
+ ##
195
215
  # Returns the concatenated output of each member of a paginated index,
196
216
  # e.g. a site listing links with 2+ pages.
217
+ ##
197
218
  def get_index_pages(url, index)
198
219
  resp = self.get_page(url, @index_debug)
199
220
  if !resp.empty?
@@ -206,12 +227,14 @@ module Upton
206
227
  resp
207
228
  end
208
229
 
230
+ ##
209
231
  # Returns the article at `url`.
210
232
  #
211
233
  # If the page is stashed, returns that, otherwise, fetches it from the web.
212
234
  #
213
235
  # If an instance is paginated, returns the concatenated output of each
214
236
  # page, e.g. if a news article has two pages.
237
+ ##
215
238
  def get_instance(url, index=0)
216
239
  resp = self.get_page(url, @debug)
217
240
  if !resp.empty?
data/lib/utils.rb CHANGED
@@ -1,8 +1,19 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module Upton
4
+
5
+ ##
6
+ # This class contains a collection of helpers for Upton
7
+ #
8
+ # Each method returns a Proc that (with an & ) can be used as the final
9
+ # argument to Upton's `scrape` and `scrape_to_csv`
10
+ ##
4
11
  module Utils
5
- #instance_html, instance_url, index
12
+
13
+ ##
14
+ # Scrapes an HTML <table> element into an Array of Arrays. The header, if
15
+ # present, is returned as the first row.
16
+ ##
6
17
  def self.table(table_selector, selector_method=:xpath)
7
18
  require 'csv'
8
19
  return Proc.new do |instance_html|
@@ -16,6 +27,9 @@ module Upton
16
27
  end
17
28
  end
18
29
 
30
+ ##
31
+ # Scrapes any set of HTML elements into an Array.
32
+ ##
19
33
  def self.list(list_selector, selector_method=:xpath)
20
34
  require 'csv'
21
35
  return Proc.new do |instance_html|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-17 00:00:00.000000000 Z
11
+ date: 2013-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack