scrape 0.1.1 → 0.2

data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    scrape (0.1.1)
+    scrape (0.2)
 
 GEM
   remote: http://rubygems.org/
@@ -17,6 +17,7 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  addressable (~> 2.2.8)
   nokogiri (~> 1.5.5)
   scrape!
   webmock (~> 1.8.7)
data/lib/scrape.rb CHANGED
@@ -3,6 +3,8 @@ require "logger"
 require "open-uri"
 require "bundler/setup"
 
+require "scrape/string_ext.rb"
+
 module Scrape
   require 'scrape/version'
 
@@ -11,9 +13,10 @@ module Scrape
   autoload 'Match', 'scrape/match'
   autoload 'DefaultLoader', 'scrape/default_loader'
   autoload 'DSL', 'scrape/dsl'
-  autoload 'URI', 'scrape/uri'
+  autoload 'RobotsTxt', 'scrape/robots_txt'
+  autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
-  class ScrapeFileNotFound < Exception; end
+  class FileNotFound < Exception; end
 
   class << self
     attr_writer :user_agent
data/lib/scrape/application.rb CHANGED
@@ -1,12 +1,13 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history
+  attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
 
-  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
     @scrapefile = File.expand_path scrapefile
     @loader = loader
     @sites = {}
     @queue = []
     @history = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
 
   def run
@@ -43,14 +44,21 @@ class Scrape::Application
     end
   end
 
+  def ignore_robots_txt= bool
+    sites.each{|_, site| site.ignore_robots_txt = bool }
+    @ignore_robots_txt = bool
+  end
+
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
 
   def load_scrapefile
     return if @scrapefile_loaded
-    result = loader.load(scrapefile)
+    raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
+    result = loader.load scrapefile
     @sites.update result if result.is_a? Hash
+    self.ignore_robots_txt = ignore_robots_txt
    reset
    @scrapefile_loaded = true
  end
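For reference, a minimal usage sketch of the changed constructor and the new setter above; the "Scrapefile" paths and the rescue around run are illustrative only:

    require "scrape"

    # Options now sit between the scrapefile path and the loader argument.
    app = Scrape::Application.new "Scrapefile", :ignore_robots_txt => true

    # The new setter pushes the flag down to every registered site.
    app.ignore_robots_txt = false

    # load_scrapefile now raises instead of leaving the existence check to the CLI.
    begin
      Scrape::Application.new("missing/Scrapefile").run
    rescue Scrape::FileNotFound => e
      warn "No Scrapefile found at #{e.message}"
    end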
data/lib/scrape/cli.rb CHANGED
@@ -17,6 +17,9 @@ class Scrape::CLI
       opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
         options[:file] = File.expand_path file
       end
+      opts.on "-i", "--ignore-robots-txt", "Ignore robots.txt" do
+        options[:ignore_robots_txt] = true
+      end
       opts.on_tail "-h", "--help", "Show this message" do
         puts opts
         exit
@@ -28,12 +31,11 @@ class Scrape::CLI
     end
     opts.parse argv
 
-    if File.exists? options[:file]
-      Scrape::Application.new(options[:file]).run
-    else
-      puts "#{command} aborted!"
-      puts "No Scrapefile found"
-      exit -1
-    end
+    Scrape::Application.new(options.delete(:file), options).run
+
+  rescue Scrape::FileNotFound
+    puts "#{command} aborted!"
+    puts "No Scrapefile found"
+    exit -1
   end
 end
data/lib/scrape/robots_txt.rb ADDED
@@ -0,0 +1,54 @@
+require 'addressable/uri'
+
+class Scrape::RobotsTxt
+  def initialize rules
+    @rules = rules
+    @rules.default = Scrape::RobotsTxtRules.new
+  end
+
+  def user_agents
+    @rules.keys
+  end
+
+  def disallows
+    @rules.values.flatten
+  end
+
+  def [] user_agent
+    rules = @rules[user_agent].clone
+    rules += @rules['*'] unless user_agent == '*'
+    rules
+  end
+
+  def =~ str
+    self[Scrape.user_agent] =~ str
+  end
+
+  def each &block
+    @rules.each &block
+  end
+
+  def self.parse content
+    rules, user_agent = Hash.new, nil
+
+    content.split("\n").each do |line|
+      case line
+      when /^#/
+        next
+      when /User-agent:\s*(.+)/
+        user_agent = $1.strip
+        rules.update user_agent => Scrape::RobotsTxtRules.new
+      when /Disallow:\s*(.+)/
+        rules[user_agent] << $1.strip
+      end
+    end
+
+    new rules
+  end
+
+  def self.load url, default = true
+    url = Addressable::URI.join(url, "/robots.txt") if default
+    parse Scrape.open(url)
+  end
+  public :load
+end
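A rough usage sketch of the parser above, based on the class and its tests; the robots.txt body and the "Scraper" user agent are made up for illustration:

    require "scrape"

    content = <<-TXT
      User-agent: Scraper
      Disallow: /private
      User-agent: *
      Disallow: /tmp
    TXT

    robots = Scrape::RobotsTxt.parse content
    robots.user_agents        # => ["Scraper", "*"]
    robots.disallows          # => ["/private", "/tmp"]
    robots["Scraper"].to_a    # => ["/private", "/tmp"]  (wildcard rules are merged in)
    robots["*"].to_a          # => ["/tmp"]
    robots["Scraper"] =~ "/private/page"   # => true, rules match by path prefix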
data/lib/scrape/robots_txt_rules.rb ADDED
@@ -0,0 +1,24 @@
+class Scrape::RobotsTxtRules
+  def initialize *rules
+    @rules = rules.flatten
+  end
+
+  def << rule
+    @rules.push *Array(rule).flatten
+    self
+  end
+
+  def + ary
+    dup << ary.to_ary
+  end
+
+  def =~ str
+    str = str.to_str
+    @rules.any?{|rule| str.starts_with rule }
+  end
+
+  def to_a
+    @rules.dup
+  end
+  alias_method :to_ary, :to_a
+end
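RobotsTxtRules is a small prefix-matching collection; a quick sketch, mirroring its tests further down (the rule paths are illustrative):

    require "scrape"   # also loads scrape/string_ext.rb, which defines String#starts_with

    rules = Scrape::RobotsTxtRules.new "/foo"
    rules << "/bar"                  # append a single rule or an array of rules
    rules += ["/baz"]                # + returns a new RobotsTxtRules instance
    rules.to_a                       # => ["/foo", "/bar", "/baz"]
    rules =~ "/foo/page.html"        # => true, "/foo" is a prefix
    rules =~ "/other"                # => false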
data/lib/scrape/site.rb CHANGED
@@ -1,14 +1,16 @@
-require 'uri'
+require 'addressable/uri'
 require 'nokogiri'
 
 class Scrape::Site
   attr_reader :url, :matches
+  attr_accessor :ignore_robots_txt
 
-  def initialize url
-    @url = URI.parse url
+  def initialize url, options = {}
+    @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
     @matches = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
 
   def add_match matcher, &proc
@@ -23,22 +25,33 @@ class Scrape::Site
 
     @matches.each{|match| match.invoke doc if match =~ url }
 
-    doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
+    doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
   end
 
   def accept? url
-    url.to_s[0, to_s.length] == to_s
+    url = normalize url
+    url.starts_with(to_s) && !disallowed?(url)
+  end
+
+  def normalize url, base_url = self.url
+    Addressable::URI.join(base_url, url).to_s
+  end
+
+  def robots_txt
+    @robots_txt ||= Scrape::RobotsTxt.load url
   end
 
-  def normalize url
-    case url
-    when /^.+:\/\// then url.dup
-    when /^\// then @url.merge(url).to_s
-    else @url.merge("#{@url.path}/#{url}").to_s
-    end
+  def ignore_robots_txt?
+    !!@ignore_robots_txt
   end
 
   def to_s
     url.to_s
   end
+
+  private
+
+  def disallowed? url
+    !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+  end
 end
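A short sketch of the reworked normalize/accept? behaviour above, using made-up URLs; with the default :ignore_robots_txt => true no robots.txt request is made:

    require "scrape"

    site = Scrape::Site.new "http://www.example.com/foo"

    # Relative hrefs are now resolved with Addressable against an explicit base URL.
    site.normalize("bar", "http://www.example.com/foo/index.html")
                                         # => "http://www.example.com/foo/bar"
    site.normalize("/bar")               # => "http://www.example.com/bar"

    # accept? is a prefix check on the normalized URL; robots.txt is skipped by default.
    site.accept?("http://www.example.com/foo/page")   # => true
    site.accept?("http://www.example.com/other")      # => false

    # Opting in makes accept? also consult #robots_txt (an HTTP fetch of /robots.txt).
    strict = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false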
data/lib/scrape/string_ext.rb ADDED
@@ -0,0 +1,6 @@
+class String
+  def starts_with str
+    str = str.to_str
+    self[0, str.length] == str
+  end unless instance_methods.include?(:starts_with)
+end
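The String#starts_with helper is only added when the method is not already present; for example:

    require "scrape/string_ext.rb"

    "/foo/bar".starts_with "/foo"   # => true
    "/foo/bar".starts_with "/baz"   # => false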
data/lib/scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Scrape
-  VERSION = '0.1.1' unless defined? ::Scrape::VERSION
+  VERSION = '0.2' unless defined? ::Scrape::VERSION
 end
data/scrape.gemspec CHANGED
@@ -19,4 +19,5 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
 
   s.add_development_dependency "nokogiri", "~> 1.5.5"
+  s.add_development_dependency "addressable", "~> 2.2.8"
 end
data/test/unit/application_test.rb CHANGED
@@ -54,7 +54,7 @@ class ApplicationTest < Scrape::TestCase
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
     test_loader.expect :load, nil, [filepath]
-    Scrape::Application.new(filepath, test_loader).run
+    Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
   end
 
@@ -69,4 +69,13 @@ class ApplicationTest < Scrape::TestCase
     3.times{ app.enqueue "http://example.com" }
     assert_equal ["http://example.com"], app.queue
   end
+
+  test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
+    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+    app = Scrape::Application.new(".")
+    app.sites.update site.to_s => site
+    assert_equal false, site.ignore_robots_txt
+    app.ignore_robots_txt = true
+    assert_equal true, site.ignore_robots_txt
+  end
 end
data/test/unit/robots_txt_rules_test.rb ADDED
@@ -0,0 +1,50 @@
+require "test_helper"
+
+class RobotsTxtRulesTest < Scrape::TestCase
+  test "#initialize should set the rules passed as multiple arguments" do
+    rules = Scrape::RobotsTxtRules.new "/foo", "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#initialize should set the rules passed as an array argument" do
+    rules = Scrape::RobotsTxtRules.new ["/foo", "/bar"]
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the string" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the array" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << ["/bar", "/too"]
+    assert_equal ["/foo", "/bar", "/too"], rules.to_a
+  end
+
+  test "#+ should return a new instance concatenating itself and the given array" do
+    rules1 = Scrape::RobotsTxtRules.new "/foo"
+    rules2 = rules1 + ["/bar"]
+    refute_equal rules1, rules2
+    assert_kind_of Scrape::RobotsTxtRules, rules2
+    assert_equal ["/foo", "/bar"], rules2.to_a
+  end
+
+  test "#=~ should match anything that begins with /" do
+    rules = Scrape::RobotsTxtRules.new "/"
+    assert rules =~ "/"
+    assert rules =~ "/foo"
+  end
+
+  test "#=~ should match anything that begins with rules" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert rules =~ "/foo"
+    assert rules =~ "/foo/"
+    assert rules =~ "/foo/bar"
+    assert rules =~ "/foo.html"
+    refute rules =~ "/bar"
+  end
+end
data/test/unit/robots_txt_test.rb ADDED
@@ -0,0 +1,71 @@
+require "test_helper"
+
+class RobotsTxtTest < Scrape::TestCase
+  test "#user_agents should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => []
+    assert_equal ["Test"], robots.user_agents
+  end
+
+  test "#disallows should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots.disallows
+  end
+
+  test "#[] should return all disallows for the specified user agent" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for the specified user agent including wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/foo", "/bar"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/bar"], robots["*"]
+  end
+
+  test ".parse should return new instance parsed from a string" do
+    robots = Scrape::RobotsTxt.parse <<-TXT
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".parse should return new empty instance" do
+    robots = Scrape::RobotsTxt.parse ""
+    assert_equal [], robots.user_agents
+    assert_equal [], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/robots.txt"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url with the path defaulted" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+end
data/test/unit/site_test.rb CHANGED
@@ -24,8 +24,8 @@ class SiteTest < Scrape::TestCase
     assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
 
-  test "#parse should return relative urls to the site" do
-    stub_request(:get, "http://www.example.com/test").
+  test "#parse should return relative urls to the specified url" do
+    stub_request(:get, "http://www.example.com/foo/bar").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
       to_return(:status => 200, :body => <<-HTML)
       <html>
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
     HTML
 
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
+    assert_equal ["http://www.example.com/foo/link1.html"], site.parse("/foo/bar")
   end
 
   test "#parse should return no urls" do
@@ -73,9 +73,36 @@ class SiteTest < Scrape::TestCase
     assert ok, "Match was not invoked"
   end
 
-  test "#accept? should return true when specified url inside the site's url" do
-    uri = Scrape::Site.new "http://www.example.com/foo"
-    assert uri.accept?("http://www.example.com/foo/bar")
+  test "#accept? should return true when specified url is inside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is outside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    refute site.accept?("http://www.example.com/bar")
+  end
+
+  test "#accept? should return true when specified url is inside the site's url and allowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /bar
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is inside the site's url and disallowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    refute site.accept?("http://www.example.com/foo/bar"), "URL should not be accepted"
   end
 
   test "#normalize should return a url when string begins with a slash" do
@@ -84,7 +111,7 @@ class SiteTest < Scrape::TestCase
   end
 
   test "#normalize should return a url with the string appended" do
-    site = Scrape::Site.new "http://www.example.com/foo"
+    site = Scrape::Site.new "http://www.example.com/foo/boo"
     assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
   end
 
@@ -92,4 +119,21 @@ class SiteTest < Scrape::TestCase
     site = Scrape::Site.new "http://www.example.com/foo"
     assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
   end
+
+  test "#normalize should return a url when string is a forward slash" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/", site.normalize("/")
+  end
+
+  test "#robots_txt should return a RobotsTxt instance from the site's url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo"
+    robots = site.robots_txt
+    assert_kind_of Scrape::RobotsTxt, robots
+  end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: '0.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-11 00:00:00.000000000 Z
+date: 2012-07-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.5.5
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com
@@ -50,7 +66,10 @@ files:
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
+- lib/scrape/robots_txt.rb
+- lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
+- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -60,6 +79,8 @@ files:
 - test/unit/application_test.rb
 - test/unit/default_loader_test.rb
 - test/unit/match_test.rb
+- test/unit/robots_txt_rules_test.rb
+- test/unit/robots_txt_test.rb
 - test/unit/scrape_test.rb
 - test/unit/site_test.rb
 homepage: http://github.com/evilmarty/scrape