scrape 0.1.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +2 -1
- data/lib/scrape.rb +5 -2
- data/lib/scrape/application.rb +11 -3
- data/lib/scrape/cli.rb +9 -7
- data/lib/scrape/robots_txt.rb +54 -0
- data/lib/scrape/robots_txt_rules.rb +24 -0
- data/lib/scrape/site.rb +24 -11
- data/lib/scrape/string_ext.rb +6 -0
- data/lib/scrape/version.rb +1 -1
- data/scrape.gemspec +1 -0
- data/test/unit/application_test.rb +10 -1
- data/test/unit/robots_txt_rules_test.rb +50 -0
- data/test/unit/robots_txt_test.rb +71 -0
- data/test/unit/site_test.rb +51 -7
- metadata +23 -2
data/Gemfile.lock
CHANGED
data/lib/scrape.rb
CHANGED
```diff
@@ -3,6 +3,8 @@ require "logger"
 require "open-uri"
 require "bundler/setup"
 
+require "scrape/string_ext.rb"
+
 module Scrape
   require 'scrape/version'
 
@@ -11,9 +13,10 @@ module Scrape
   autoload 'Match', 'scrape/match'
   autoload 'DefaultLoader', 'scrape/default_loader'
   autoload 'DSL', 'scrape/dsl'
-  autoload '…
+  autoload 'RobotsTxt', 'scrape/robots_txt'
+  autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
-  class …
+  class FileNotFound < Exception; end
 
   class << self
     attr_writer :user_agent
```
data/lib/scrape/application.rb
CHANGED
```diff
@@ -1,12 +1,13 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history
+  attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
 
-  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
     @scrapefile = File.expand_path scrapefile
     @loader = loader
     @sites = {}
     @queue = []
     @history = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
 
   def run
@@ -43,14 +44,21 @@ class Scrape::Application
     end
   end
 
+  def ignore_robots_txt= bool
+    sites.each{|_, site| site.ignore_robots_txt = bool }
+    @ignore_robots_txt = bool
+  end
+
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
 
   def load_scrapefile
     return if @scrapefile_loaded
-    …
+    raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
+    result = loader.load scrapefile
     @sites.update result if result.is_a? Hash
+    self.ignore_robots_txt = ignore_robots_txt
     reset
     @scrapefile_loaded = true
   end
```
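For orientation, a minimal usage sketch of the new `Scrape::Application` constructor, assuming the gem is installed and loaded via `require "scrape"` and that a file named `Scrapefile` exists in the working directory (both assumptions, not part of this diff):

```ruby
require "scrape"

# The second argument is the new options hash; the loader keeps its default
# (Scrape::DefaultLoader.new). :ignore_robots_txt is pushed down to every site
# once the scrapefile is loaded.
app = Scrape::Application.new "Scrapefile", :ignore_robots_txt => true

begin
  app.run
rescue Scrape::FileNotFound => e
  # Raised by #load_scrapefile when the scrapefile is missing on disk.
  warn "No scrapefile found: #{e.message}"
end
```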
data/lib/scrape/cli.rb
CHANGED
```diff
@@ -17,6 +17,9 @@ class Scrape::CLI
     opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
       options[:file] = File.expand_path file
     end
+    opts.on "-i", "--ignore-robots-txt", "Ignore robots.txt" do
+      options[:ignore_robots_txt] = true
+    end
     opts.on_tail "-h", "--help", "Show this message" do
       puts opts
       exit
@@ -28,12 +31,11 @@ class Scrape::CLI
     end
     opts.parse argv
 
-    …
-    …
-    …
-    …
-    …
-    …
-    end
+    Scrape::Application.new(options.delete(:file), options).run
+
+  rescue Scrape::FileNotFound
+    puts "#{command} aborted!"
+    puts "No Scrapefile found"
+    exit -1
   end
 end
```
data/lib/scrape/robots_txt.rb
ADDED
```diff
@@ -0,0 +1,54 @@
+require 'addressable/uri'
+
+class Scrape::RobotsTxt
+  def initialize rules
+    @rules = rules
+    @rules.default = Scrape::RobotsTxtRules.new
+  end
+
+  def user_agents
+    @rules.keys
+  end
+
+  def disallows
+    @rules.values.flatten
+  end
+
+  def [] user_agent
+    rules = @rules[user_agent].clone
+    rules += @rules['*'] unless user_agent == '*'
+    rules
+  end
+
+  def =~ str
+    self[Scrape.user_agent] =~ str
+  end
+
+  def each &block
+    @rules.each &block
+  end
+
+  def self.parse content
+    rules, user_agent = Hash.new, nil
+
+    content.split("\n").each do |line|
+      case line
+      when /^#/
+        next
+      when /User-agent:\s*(.+)/
+        user_agent = $1.strip
+        rules.update user_agent => Scrape::RobotsTxtRules.new
+      when /Disallow:\s*(.+)/
+        rules[user_agent] << $1.strip
+      end
+    end
+
+    new rules
+  end
+
+  def self.load url, default = true
+    url = Addressable::URI.join(url, "/robots.txt") if default
+    parse Scrape.open(url)
+  end
+  public :load
+end
```
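A rough sketch of the parser added above, mirroring its unit tests; the robots.txt content is invented, and the prefix matching behind `#=~` relies on the `String#starts_with` helper that `scrape/string_ext.rb` (not shown in this diff) appears to provide:

```ruby
require "scrape"

txt = <<-TXT
  User-agent: *
  Disallow: /private
  User-agent: Scrape
  Disallow: /tmp
TXT

robots = Scrape::RobotsTxt.parse txt
robots.user_agents          # => ["*", "Scrape"]
robots.disallows            # => ["/private", "/tmp"]
robots["Scrape"].to_a       # => ["/tmp", "/private"] -- wildcard rules are merged in
robots =~ "/private/index"  # checks the path against the rules for Scrape.user_agent
```

`RobotsTxt.load(url)` does the same after fetching the file, defaulting the path to `/robots.txt` on the given url's host unless `default` is false.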
data/lib/scrape/robots_txt_rules.rb
ADDED
```diff
@@ -0,0 +1,24 @@
+class Scrape::RobotsTxtRules
+  def initialize *rules
+    @rules = rules.flatten
+  end
+
+  def << rule
+    @rules.push *Array(rule).flatten
+    self
+  end
+
+  def + ary
+    dup << ary.to_ary
+  end
+
+  def =~ str
+    str = str.to_str
+    @rules.any?{|rule| str.starts_with rule }
+  end
+
+  def to_a
+    @rules.dup
+  end
+  alias_method :to_ary, :to_a
+end
```
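And a small sketch of the rules container on its own; the paths are arbitrary, and the `=~` prefix match again assumes the `String#starts_with` extension:

```ruby
require "scrape"

rules = Scrape::RobotsTxtRules.new "/private"
rules << "/tmp"                   # append a single rule, returns self
rules << ["/admin", "/cgi-bin"]   # arrays are flattened in

rules.to_a                        # => ["/private", "/tmp", "/admin", "/cgi-bin"]
rules =~ "/private/index.html"    # => true  (some rule prefixes the path)
rules =~ "/public"                # => false
```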
data/lib/scrape/site.rb
CHANGED
```diff
@@ -1,14 +1,16 @@
-require 'uri'
+require 'addressable/uri'
 require 'nokogiri'
 
 class Scrape::Site
   attr_reader :url, :matches
+  attr_accessor :ignore_robots_txt
 
-  def initialize url
-    @url = URI.parse url
+  def initialize url, options = {}
+    @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
     @matches = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
 
   def add_match matcher, &proc
@@ -23,22 +25,33 @@ class Scrape::Site
 
     @matches.each{|match| match.invoke doc if match =~ url }
 
-    doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
+    doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
   end
 
   def accept? url
-    url …
+    url = normalize url
+    url.starts_with(to_s) && !disallowed?(url)
+  end
+
+  def normalize url, base_url = self.url
+    Addressable::URI.join(base_url, url).to_s
+  end
+
+  def robots_txt
+    @robots_txt ||= Scrape::RobotsTxt.load url
   end
 
-  def …
-    …
-    when /^.+:\/\// then url.dup
-    when /^\// then @url.merge(url).to_s
-    else @url.merge("#{@url.path}/#{url}").to_s
-    end
+  def ignore_robots_txt?
+    !!@ignore_robots_txt
   end
 
   def to_s
     url.to_s
   end
+
+  private
+
+  def disallowed? url
+    !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+  end
 end
```
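A sketch of the reworked `Scrape::Site` behaviour, in line with the tests further down; the URLs are placeholders, and note that with `:ignore_robots_txt => false` the first `#accept?` call fetches the site's robots.txt over HTTP:

```ruby
require "scrape"

site = Scrape::Site.new "http://www.example.com/foo"

# Links are resolved against the site's url (or an explicit base) via Addressable.
site.normalize "/bar"                          # => "http://www.example.com/bar"
site.normalize "http://www.example.org/bar"    # => "http://www.example.org/bar"

# A directly-built Site ignores robots.txt by default (Application overrides this).
site.accept? "http://www.example.com/foo/bar"  # => true  (inside the site)
site.accept? "http://www.example.com/bar"      # => false (outside the site)

# Opting in: paths disallowed for Scrape.user_agent are rejected.
strict = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
strict.accept? "http://www.example.com/foo/bar"  # false if robots.txt disallows /foo
```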
data/lib/scrape/version.rb
CHANGED
data/scrape.gemspec
CHANGED
data/test/unit/application_test.rb
CHANGED
```diff
@@ -54,7 +54,7 @@ class ApplicationTest < Scrape::TestCase
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
     test_loader.expect :load, nil, [filepath]
-    Scrape::Application.new(filepath, test_loader).run
+    Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
   end
 
@@ -69,4 +69,13 @@ class ApplicationTest < Scrape::TestCase
     3.times{ app.enqueue "http://example.com" }
     assert_equal ["http://example.com"], app.queue
   end
+
+  test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
+    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+    app = Scrape::Application.new(".")
+    app.sites.update site.to_s => site
+    assert_equal false, site.ignore_robots_txt
+    app.ignore_robots_txt = true
+    assert_equal true, site.ignore_robots_txt
+  end
 end
```
data/test/unit/robots_txt_rules_test.rb
ADDED
```diff
@@ -0,0 +1,50 @@
+require "test_helper"
+
+class RobotsTxtRulesTest < Scrape::TestCase
+  test "#initialize should set the rules passed as multiple arguments" do
+    rules = Scrape::RobotsTxtRules.new "/foo", "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#initialize should set the rules passed an array argument" do
+    rules = Scrape::RobotsTxtRules.new ["/foo", "/bar"]
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the string" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the array" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << ["/bar", "/too"]
+    assert_equal ["/foo", "/bar", "/too"], rules.to_a
+  end
+
+  test "#+ should return a new instance concatenating it self and the given array" do
+    rules1 = Scrape::RobotsTxtRules.new "/foo"
+    rules2 = rules1 + ["/bar"]
+    refute_equal rules1, rules2
+    assert_kind_of Scrape::RobotsTxtRules, rules2
+    assert_equal ["/foo", "/bar"], rules2.to_a
+  end
+
+  test "#=~ should match anything that beings with /" do
+    rules = Scrape::RobotsTxtRules.new "/"
+    assert rules =~ "/"
+    assert rules =~ "/foo"
+  end
+
+  test "#=~ should match anything that begins with rules" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert rules =~ "/foo"
+    assert rules =~ "/foo/"
+    assert rules =~ "/foo/bar"
+    assert rules =~ "/foo.html"
+    refute rules =~ "/bar"
+  end
+end
```
data/test/unit/robots_txt_test.rb
ADDED
```diff
@@ -0,0 +1,71 @@
+require "test_helper"
+
+class RobotsTxtTest < Scrape::TestCase
+  test "#user_agents should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => []
+    assert_equal ["Test"], robots.user_agents
+  end
+
+  test "#disallows should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots.disallows
+  end
+
+  test "#[] should return all disallows for the specified user agent" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for the specified user agent including wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/foo", "/bar"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/bar"], robots["*"]
+  end
+
+  test ".parse should return new instance parsed from a string" do
+    robots = Scrape::RobotsTxt.parse <<-TXT
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".parse should return new empty instance" do
+    robots = Scrape::RobotsTxt.parse ""
+    assert_equal [], robots.user_agents
+    assert_equal [], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/robots.txt"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url with the path defaulted" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+end
```
data/test/unit/site_test.rb
CHANGED
```diff
@@ -24,8 +24,8 @@ class SiteTest < Scrape::TestCase
     assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
 
-  test "#parse should return relative urls to the …
-    stub_request(:get, "http://www.example.com/…
+  test "#parse should return relative urls to the specified url" do
+    stub_request(:get, "http://www.example.com/foo/bar").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
       to_return(:status => 200, :body => <<-HTML)
       <html>
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
     HTML
 
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal ["http://www.example.com/link1.html"], site.parse("/…
+    assert_equal ["http://www.example.com/foo/link1.html"], site.parse("/foo/bar")
   end
 
   test "#parse should return no urls" do
@@ -73,9 +73,36 @@ class SiteTest < Scrape::TestCase
     assert ok, "Match was not invoked"
   end
 
-  test "#accept? should return true when specified url inside the site's url" do
-    …
-    assert …
+  test "#accept? should return true when specified url is inside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is outside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    refute site.accept?("http://www.example.com/bar")
+  end
+
+  test "#accept? should return true when specified url is inside the site's url and allowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /bar
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is inside the site's url and disallowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    refute site.accept?("http://www.example.com/foo/bar"), "URL should not be accepted"
   end
 
   test "#normalize should return a url when string begins with a slash" do
@@ -84,7 +111,7 @@ class SiteTest < Scrape::TestCase
   end
 
   test "#normalize should return a url with the string appended" do
-    site = Scrape::Site.new "http://www.example.com/foo"
+    site = Scrape::Site.new "http://www.example.com/foo/boo"
     assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
   end
 
@@ -92,4 +119,21 @@ class SiteTest < Scrape::TestCase
     site = Scrape::Site.new "http://www.example.com/foo"
    assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
   end
+
+  test "#normalize should return a url when string is a forward slash" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/", site.normalize("/")
+  end
+
+  test "#robots_txt should return a RobotsTxt instance from the site's url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo"
+    robots = site.robots_txt
+    assert_kind_of Scrape::RobotsTxt, robots
+  end
 end
```
metadata
CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: '0.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-…
+date: 2012-07-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.5.5
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com
@@ -50,7 +66,10 @@ files:
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
+- lib/scrape/robots_txt.rb
+- lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
+- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -60,6 +79,8 @@ files:
 - test/unit/application_test.rb
 - test/unit/default_loader_test.rb
 - test/unit/match_test.rb
+- test/unit/robots_txt_rules_test.rb
+- test/unit/robots_txt_test.rb
 - test/unit/scrape_test.rb
 - test/unit/site_test.rb
 homepage: http://github.com/evilmarty/scrape
```