RubyGems - scrape - Versions diffs - 0.2.2 → 0.2.4 - Mend

scrape 0.2.2 → 0.2.4

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (20) hide show

data/.gitignore +2 -1
data/Gemfile.lock +1 -1
data/bin/scrape +1 -3
data/lib/scrape/application.rb +25 -21
data/lib/scrape/cli.rb +1 -1
data/lib/scrape/core_ext/array.rb +5 -0
data/lib/scrape/{string_ext.rb → core_ext/string.rb} +0 -0
data/lib/scrape/default_loader.rb +7 -14
data/lib/scrape/dsl.rb +11 -6
data/lib/scrape/robots_txt.rb +2 -0
data/lib/scrape/site.rb +11 -10
data/lib/scrape/version.rb +1 -1
data/lib/scrape.rb +5 -4
data/scrape.gemspec +1 -1
data/test/unit/application_test.rb +4 -6
data/test/unit/default_loader_test.rb +17 -13
data/test/unit/dsl_test.rb +43 -0
data/test/unit/robots_txt_test.rb +8 -0
data/test/unit/site_test.rb +9 -0
metadata +6 -3

data/.gitignore CHANGED Viewed

@@ -1,2 +1,3 @@
 .DS_Store
-pkg/
+pkg/
+.tags*

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    scrape (0.2.1)
+    scrape (0.2.4)
 GEM
   remote: http://rubygems.org/

data/bin/scrape CHANGED Viewed

@@ -1,7 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.expand_path('../../lib', __FILE__)
-require "scrape/cli"
+require File.expand_path("../../lib/scrape/cli", __FILE__)
 Scrape::CLI.new(File.basename($0), ARGV).run

data/lib/scrape/application.rb CHANGED Viewed

@@ -1,30 +1,33 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
+  attr_reader :scrapefile, :loader, :sites, :history, :options
-  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
     @scrapefile = File.expand_path scrapefile
-    @loader = loader
+    @options = options
+    @loader = loader.class == Class ? loader.new(self) : loader
     @sites = {}
     @queue = []
     @history = []
-    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
   def run
     load_scrapefile
     while url = @queue.shift
-      Scrape.logger.info "Loading: #{url}..."
       @history << url
-      if site = self[url]
-        if urls = site.parse(url)
-          enqueue *urls
-          Scrape.logger.info "Found #{urls.length} urls."
+      begin
+        if site = self[url]
+          if urls = site.parse(url)
+            enqueue *urls
+            Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
+          else
+            Scrape.logger.info "Parsed #{url}."
+          end
         else
-          Scrape.logger.info "Done."
+          Scrape.logger.info "No rules defined for #{url}"
         end
-      else
-        Scrape.logger.info "Not defined."
+      rescue OpenURI::HTTPError => e
+        Scrape.logger.info "Error loading #{url}: #{e.message}"
       end
     end
   end
@@ -44,21 +47,22 @@ class Scrape::Application
     end
   end
-  def ignore_robots_txt= bool
-    sites.each{|_, site| site.ignore_robots_txt = bool }
-    @ignore_robots_txt = bool
-  end
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
+  def add_site site, options = {}
+    case site
+    when String
+      site = Scrape::Site.new site, options
+      @sites.update site.to_s => site
+      site
+    end
+  end
   def load_scrapefile
     return if @scrapefile_loaded
-    raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
-    result = loader.load scrapefile
-    @sites.update result if result.is_a? Hash
-    self.ignore_robots_txt = ignore_robots_txt
+    loader.load(scrapefile)
     reset
     @scrapefile_loaded = true
   end

data/lib/scrape/cli.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require "optparse"
-require "scrape"
+require File.expand_path("../../scrape", __FILE__)
 class Scrape::CLI
   attr_reader :command, :app, :options

data/lib/scrape/core_ext/array.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class Array
+  def extract_options!
+    last.instance_of?(Hash) ? pop : {}
+  end unless instance_methods.include?(:extract_options!)
+end

data/lib/scrape/{string_ext.rb → core_ext/string.rb} RENAMED Viewed

File without changes

data/lib/scrape/default_loader.rb CHANGED Viewed

@@ -1,19 +1,12 @@
 class Scrape::DefaultLoader
-  def load path
-    path = File.expand_path path
-    sites = {}
-    sandbox = Sandbox.new sites
-    sandbox.instance_eval File.read(path), path
-    sites
+  def initialize app
+    @app = app
   end
-  class Sandbox
-    include Scrape::DSL
-    def initialize sites
-      @sites = sites
-    end
+  def load path
+    path = File.expand_path path
+    File.exists? path or raise Scrape::FileNotFound, path
+    dsl = Scrape::DSL.new @app
+    dsl.instance_eval File.read(path), path
   end
 end

data/lib/scrape/dsl.rb CHANGED Viewed

@@ -1,13 +1,18 @@
-module Scrape::DSL
+class Scrape::DSL
+  def initialize app
+    @application = app
+  end
   def site *urls
-    @_sites ||= {}
-    @sites ||= {}
-    @current_sites = urls.flatten.map{|url| @_sites[url] ||= Scrape::Site.new(url) }
+    return @sites if urls.empty?
+    urls = urls.flatten
+    options = urls.extract_options!
+    @sites = urls.map{|url| @application.sites[url] || @application.add_site(url, options) }
   end
   def match matcher, &proc
-    raise ArgumentError.new("site must be set") unless defined? @current_sites
-    matches = @current_sites.map{|site| @sites[site.url.to_s] = site; site.add_match matcher, &proc }
+    raise ArgumentError, "No sites have been defined" unless defined? @sites
+    matches = @sites.map{|site| site.add_match matcher, &proc }
     matches.size == 1 ? matches.first : matches
   end
 end

data/lib/scrape/robots_txt.rb CHANGED Viewed

@@ -49,6 +49,8 @@ class Scrape::RobotsTxt
   def self.load url, default = true
     url = Addressable::URI.join(url, "/robots.txt") if default
     parse Scrape.open(url)
+  rescue OpenURI::HTTPError
+    nil
   end
   public :load
 end

data/lib/scrape/site.rb CHANGED Viewed

@@ -2,15 +2,14 @@ require 'addressable/uri'
 require 'nokogiri'
 class Scrape::Site
-  attr_reader :url, :matches
-  attr_accessor :ignore_robots_txt
+  attr_reader :url, :matches, :options
   def initialize url, options = {}
     @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
+    @options = {:ignore_robots_txt => true}.merge options
     @matches = []
-    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
   def add_match matcher, &proc
@@ -19,9 +18,15 @@ class Scrape::Site
     match
   end
+  def open url
+    headers = Hash.new
+    headers['Set-Cookie'] = options[:cookie].to_s if options.has_key? :cookie
+    Scrape.open url, headers
+  end
   def parse url
     url = normalize url
-    doc = Nokogiri::HTML Scrape.open(url)
+    doc = Nokogiri::HTML open(url)
     @matches.each{|match| match.invoke doc, url if match =~ url }
@@ -38,11 +43,7 @@ class Scrape::Site
   end
   def robots_txt
-    @robots_txt ||= Scrape::RobotsTxt.load url
-  end
-  def ignore_robots_txt?
-    !!@ignore_robots_txt
+    @robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
   end
   def to_s
@@ -52,6 +53,6 @@ class Scrape::Site
 private
   def disallowed? url
-    !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+    !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
   end
 end

data/lib/scrape/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Scrape
-  VERSION = '0.2.2' unless defined? ::Scrape::VERSION
+  VERSION = '0.2.4' unless defined? ::Scrape::VERSION
 end

data/lib/scrape.rb CHANGED Viewed

@@ -1,13 +1,14 @@
 require "rubygems"
 require "logger"
 require "open-uri"
-require "bundler/setup"
-require "scrape/string_ext.rb"
+$: << File.dirname(__FILE__)
-module Scrape
-  require 'scrape/version'
+require "scrape/version"
+require "scrape/core_ext/array"
+require "scrape/core_ext/string"
+module Scrape
   autoload 'Application', 'scrape/application'
   autoload 'Site', 'scrape/site'
   autoload 'Match', 'scrape/match'

data/scrape.gemspec CHANGED Viewed

@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test}/*`.split("\n")
   s.executables   = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
-  s.require_paths = ["lib"]
+  s.require_paths = ["lib", "lib/scrape"]
   s.add_development_dependency "nokogiri", "~> 1.5.5"
   s.add_development_dependency "addressable", "~> 2.2.8"

data/test/unit/application_test.rb CHANGED Viewed

@@ -53,6 +53,7 @@ class ApplicationTest < Scrape::TestCase
   test "#run should load the specified file" do
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
+    test_loader.expect :class, Scrape::DefaultLoader
     test_loader.expect :load, nil, [filepath]
     Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
@@ -70,12 +71,9 @@ class ApplicationTest < Scrape::TestCase
     assert_equal ["http://example.com"], app.queue
   end
-  test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
-    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+  test "#add_site should add the specied string to the collection" do
     app = Scrape::Application.new(".")
-    app.sites.update site.to_s => site
-    assert_equal false, site.ignore_robots_txt
-    app.ignore_robots_txt = true
-    assert_equal true, site.ignore_robots_txt
+    app.add_site "http://example.com"
+    assert app.sites.member?("http://example.com")
   end
 end

data/test/unit/default_loader_test.rb CHANGED Viewed

@@ -3,23 +3,27 @@ require "test_helper"
 class DefaultLoaderTest < Scrape::TestCase
   SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
-  test "#load should return sites parsed from the specified file" do
-    loader = Scrape::DefaultLoader.new
-    sites = loader.load File.join(SUPPORT_FILES, "test1.scrape")
-    assert_equal ["http://example.com"], sites.keys
-    assert_instance_of Scrape::Site, sites.values[0]
+  test "#load should parse the specified file" do
+    app = Scrape::Application.new "."
+    loader = Scrape::DefaultLoader.new app
+    loader.load File.join(SUPPORT_FILES, "test1.scrape")
+    assert_equal ["http://example.com"], app.sites.keys
+    assert_instance_of Scrape::Site, app.sites.values[0]
   end
-  test "#load should return an empty hash when no matches have been defined" do
-    loader = Scrape::DefaultLoader.new
-    sites = loader.load File.join(SUPPORT_FILES, "test2.scrape")
-    assert_equal Hash.new, sites
-  end
-  test "#load should raise an error when no site is defined" do
-    loader = Scrape::DefaultLoader.new
+  test "#load should raise error when no site is defined" do
+    app = Scrape::Application.new "."
+    loader = Scrape::DefaultLoader.new app
     assert_raises ArgumentError do
       loader.load File.join(SUPPORT_FILES, "test3.scrape")
     end
   end
+  test "#load should raise error when file cannot be found" do
+    app = Scrape::Application.new "."
+    loader = Scrape::DefaultLoader.new app
+    assert_raises Scrape::FileNotFound do
+      loader.load "#{Time.now.to_i}.txt"
+    end
+  end
 end

data/test/unit/dsl_test.rb ADDED Viewed

@@ -0,0 +1,43 @@
+require "test_helper"
+class DSLTest < Scrape::TestCase
+  test "#site should add the url to the application" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.site "http://example.com"
+    assert app.sites.member?("http://example.com")
+  end
+  test "#site should return the currently defined sites" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    sites = dsl.site "http://example.com"
+    assert_equal "http://example.com", sites[0].to_s
+    assert_equal sites, dsl.site
+  end
+  test "#site should pass the options to the site" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.site "http://example.com", :test => true
+    assert_equal true, app.sites["http://example.com"].options[:test]
+  end
+  test "#match should create a match on the current sites" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.site "http://example.com"
+    site = app.sites["http://example.com"]
+    assert_empty site.matches
+    dsl.match("test"){|*args|}
+    refute_empty site.matches
+  end
+  test "#match should raise an error when no sites have been defined" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    assert_raises ArgumentError do
+      dsl.match("test"){|*args|}
+    end
+  end
+end

data/test/unit/robots_txt_test.rb CHANGED Viewed

@@ -68,4 +68,12 @@ class RobotsTxtTest < Scrape::TestCase
     assert_equal ["Test"], robots.user_agents
     assert_equal ["/foo", "/bar"], robots.disallows
   end
+  test ".load should return nil when specified url results in 404" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 404, :body => "")
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_nil robots
+  end
 end

data/test/unit/site_test.rb CHANGED Viewed

@@ -7,6 +7,15 @@ class SiteTest < Scrape::TestCase
     assert_instance_of Scrape::Match, match
   end
+  test "#open should include cookie header when cookie option is set" do
+    stub_request(:get, "http://www.example.com/").
+      with(:headers => {'Set-Cookie'=>'omnom'}).
+      to_return(:status => 200, :body => "")
+    site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
+    site.open "http://www.example.com"
+  end
   test "#parse should return absolute urls that match the site's url" do
     stub_request(:get, "http://www.example.com/test").
       with(:headers => {"User-Agent" => Scrape.user_agent}).

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.4
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-15 00:00:00.000000000 Z
+date: 2012-07-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -63,13 +63,14 @@ files:
 - lib/scrape.rb
 - lib/scrape/application.rb
 - lib/scrape/cli.rb
+- lib/scrape/core_ext/array.rb
+- lib/scrape/core_ext/string.rb
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
 - lib/scrape/robots_txt.rb
 - lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
-- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -79,6 +80,7 @@ files:
 - test/unit/application_test.rb
 - test/unit/cli_test.rb
 - test/unit/default_loader_test.rb
+- test/unit/dsl_test.rb
 - test/unit/match_test.rb
 - test/unit/robots_txt_rules_test.rb
 - test/unit/robots_txt_test.rb
@@ -90,6 +92,7 @@ post_install_message:
 rdoc_options: []
 require_paths:
 - lib
+- lib/scrape
 required_ruby_version: !ruby/object:Gem::Requirement
   none: false
   requirements: