RubyGems - scrape - Versions diffs - 0.1 → 0.1.1 - Mend

scrape 0.1 → 0.1.1

Files changed (10) hide show

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    scrape (0.1)
+    scrape (0.1.1)
 GEM
   remote: http://rubygems.org/

data/lib/scrape/application.rb CHANGED Viewed

@@ -30,7 +30,7 @@ class Scrape::Application
   def reset
     @history = []
-    @queue = sites.values.map{|site| site.url.to_s }
+    @queue = sites.values.map{|site| site.to_s }
   end
   def queue
@@ -44,7 +44,7 @@ class Scrape::Application
   end
   def [] url
-    @sites.values.detect{|site| site.url < url }
+    @sites.values.detect{|site| site.accept? url }
   end
   def load_scrapefile

data/lib/scrape/site.rb CHANGED Viewed

@@ -1,10 +1,11 @@
+require 'uri'
 require 'nokogiri'
 class Scrape::Site
   attr_reader :url, :matches
   def initialize url
-    @url = Scrape::URI.new url
+    @url = URI.parse url
     @url.query = nil
     @url.fragment = nil
     @matches = []
@@ -17,15 +18,27 @@ class Scrape::Site
   end
   def parse url
-    url = self.url + url
-    doc = Nokogiri::HTML url.open
+    url = normalize url
+    doc = Nokogiri::HTML Scrape.open(url)
     @matches.each{|match| match.invoke doc if match =~ url }
-    urls = doc.css("a[href]").map do |node|
-      href = self.url + node['href']
-      self.url < href ? href : nil
+    doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
+  end
+  def accept? url
+    url.to_s[0, to_s.length] == to_s
+  end
+  def normalize url
+    case url
+    when /^.+:\/\// then url.dup
+    when /^\//      then @url.merge(url).to_s
+    else @url.merge("#{@url.path}/#{url}").to_s
     end
-    urls.compact
+  end
+  def to_s
+    url.to_s
   end
 end

data/lib/scrape/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Scrape
-  VERSION = '0.1' unless defined? ::Scrape::VERSION
+  VERSION = '0.1.1' unless defined? ::Scrape::VERSION
 end

data/lib/scrape.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require "rubygems"
 require "logger"
+require "open-uri"
 require "bundler/setup"
 module Scrape
@@ -32,5 +33,10 @@ module Scrape
     def load_scrapefile path
       Application.new path
     end
+    def open url, headers = {}, &block
+      headers = {"User-Agent" => user_agent}.merge(headers)
+      super(url, headers, &block).read
+    end
   end
 end

data/test/unit/scrape_test.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require "test_helper"
+class ScrapeTest < Scrape::TestCase
+  test "#user_agent should return default when not set" do
+    assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
+  end
+  test "#load_scrapefile should return a new application" do
+    app = Scrape.load_scrapefile '.'
+    assert_kind_of Scrape::Application, app
+  end
+  test "#open should send a request to the specified url and return the contents" do
+    stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
+    assert_equal "booyah", Scrape.open("http://example.com")
+  end
+  test "#open should set the user agent in the request header" do
+    stub_request(:get, "http://example.com/").
+      with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
+      to_return(:status => 200, :body => "")
+    Scrape.open("http://example.com")
+    assert true
+  end
+end

data/test/unit/site_test.rb CHANGED Viewed

@@ -21,7 +21,7 @@ class SiteTest < Scrape::TestCase
     HTML
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+    assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
   test "#parse should return relative urls to the site" do
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
     HTML
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+    assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
   test "#parse should return no urls" do
@@ -72,4 +72,24 @@ class SiteTest < Scrape::TestCase
     assert ok, "Match was not invoked"
   end
+  test "#accept? should return true when specified url inside the site's url" do
+    uri = Scrape::Site.new "http://www.example.com/foo"
+    assert uri.accept?("http://www.example.com/foo/bar")
+  end
+  test "#normalize should return a url when string begins with a slash" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/bar", site.normalize("/bar")
+  end
+  test "#normalize should return a url with the string appended" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
+  end
+  test "#normalize should return the string when it begins with a scheme" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: '0.1'
+  version: 0.1.1
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-10 00:00:00.000000000 Z
+date: 2012-07-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -51,7 +51,6 @@ files:
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
 - lib/scrape/site.rb
-- lib/scrape/uri.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -61,8 +60,8 @@ files:
 - test/unit/application_test.rb
 - test/unit/default_loader_test.rb
 - test/unit/match_test.rb
+- test/unit/scrape_test.rb
 - test/unit/site_test.rb
-- test/unit/uri_test.rb
 homepage: http://github.com/evilmarty/scrape
 licenses: []
 post_install_message:

data/lib/scrape/uri.rb DELETED Viewed

@@ -1,58 +0,0 @@
-require 'uri'
-require 'open-uri'
-class Scrape::URI
-  def initialize uri = nil
-    @uri = case uri
-    when URI then uri.clone
-    when NilClass then URI.new
-    else URI.parse uri.to_s
-    end
-  end
-  %w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
-    class_eval <<-EOT, __FILE__, __LINE__ + 1
-      def #{method_name}
-        @uri.#{method_name}
-      end
-    EOT
-  end
-  %w[fragment host hostname password path port query scheme user].each do |method_name|
-    class_eval <<-EOT, __FILE__, __LINE__ + 1
-      def #{method_name}= value
-        @uri.#{method_name} = value
-      end
-    EOT
-  end
-  def + url
-    return clone if self == url
-    relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
-    uri = self.class.new @uri.merge(url)
-    uri.path = "#{@uri.path}#{uri.path}" if relative
-    uri
-  end
-  def < url
-    url[0, length] == to_s
-  end
-  def [] *args
-    to_s[*args]
-  end
-  def == url
-    to_s == url.to_s
-  end
-  def length
-    to_s.length
-  end
-  alias_method :size, :length
-  def open headers = {}, &block
-    headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
-    super(to_s, headers, &block).read
-  end
-end

data/test/unit/uri_test.rb DELETED Viewed

@@ -1,52 +0,0 @@
-require "test_helper"
-class URITest < Scrape::TestCase
-  {fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
-    test "##{method_name} should return value" do
-      uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
-      assert_equal value, uri.send(method_name)
-    end
-  end
-  test "#open should return the contents at the url" do
-    stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
-    uri = Scrape::URI.new "http://www.example.com"
-    assert_equal "Howdie", uri.open
-  end
-  test "#+ should return a URI with the specified path" do
-    uri1 = Scrape::URI.new "http://www.example.com"
-    uri2 = uri1 + "/bar"
-    assert_equal "http://www.example.com/bar", uri2.to_s
-  end
-  test "#+ should return a URI overwriting with the specified path" do
-    uri1 = Scrape::URI.new "http://www.example.com/foo"
-    uri2 = uri1 + "/bar"
-    assert_equal "http://www.example.com/bar", uri2.to_s
-  end
-  test "#+ should return a URI with the specified path appended" do
-    uri1 = Scrape::URI.new "http://www.example.com/foo"
-    uri2 = uri1 + "bar"
-    assert_equal "http://www.example.com/foo/bar", uri2.to_s
-  end
-  test "#+ should return a URI from the absolute url" do
-    uri1 = Scrape::URI.new "http://www.example.com/foo"
-    uri2 = uri1 + "http://www.example.com/bar"
-    assert_equal "http://www.example.com/bar", uri2.to_s
-  end
-  test "#+ should return a URI appended from the absolute url" do
-    uri1 = Scrape::URI.new "http://www.example.com/foo"
-    uri2 = uri1 + "http://www.example.com/foo/bar"
-    assert_equal "http://www.example.com/foo/bar", uri2.to_s
-  end
-  test "#< should return true when specified url is greater" do
-    uri1 = Scrape::URI.new "http://www.example.com/foo"
-    assert uri1 < "http://www.example.com/foo/bar"
-  end
-end