scrape 0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrape (0.1)
4
+ scrape (0.1.1)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -30,7 +30,7 @@ class Scrape::Application
30
30
 
31
31
  def reset
32
32
  @history = []
33
- @queue = sites.values.map{|site| site.url.to_s }
33
+ @queue = sites.values.map{|site| site.to_s }
34
34
  end
35
35
 
36
36
  def queue
@@ -44,7 +44,7 @@ class Scrape::Application
44
44
  end
45
45
 
46
46
  def [] url
47
- @sites.values.detect{|site| site.url < url }
47
+ @sites.values.detect{|site| site.accept? url }
48
48
  end
49
49
 
50
50
  def load_scrapefile
data/lib/scrape/site.rb CHANGED
@@ -1,10 +1,11 @@
1
+ require 'uri'
1
2
  require 'nokogiri'
2
3
 
3
4
  class Scrape::Site
4
5
  attr_reader :url, :matches
5
6
 
6
7
  def initialize url
7
- @url = Scrape::URI.new url
8
+ @url = URI.parse url
8
9
  @url.query = nil
9
10
  @url.fragment = nil
10
11
  @matches = []
@@ -17,15 +18,27 @@ class Scrape::Site
17
18
  end
18
19
 
19
20
  def parse url
20
- url = self.url + url
21
- doc = Nokogiri::HTML url.open
21
+ url = normalize url
22
+ doc = Nokogiri::HTML Scrape.open(url)
22
23
 
23
24
  @matches.each{|match| match.invoke doc if match =~ url }
24
25
 
25
- urls = doc.css("a[href]").map do |node|
26
- href = self.url + node['href']
27
- self.url < href ? href : nil
26
+ doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
27
+ end
28
+
29
+ def accept? url
30
+ url.to_s[0, to_s.length] == to_s
31
+ end
32
+
33
+ def normalize url
34
+ case url
35
+ when /^.+:\/\// then url.dup
36
+ when /^\// then @url.merge(url).to_s
37
+ else @url.merge("#{@url.path}/#{url}").to_s
28
38
  end
29
- urls.compact
39
+ end
40
+
41
+ def to_s
42
+ url.to_s
30
43
  end
31
44
  end
@@ -1,3 +1,3 @@
1
1
  module Scrape
2
- VERSION = '0.1' unless defined? ::Scrape::VERSION
2
+ VERSION = '0.1.1' unless defined? ::Scrape::VERSION
3
3
  end
data/lib/scrape.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "rubygems"
2
2
  require "logger"
3
+ require "open-uri"
3
4
  require "bundler/setup"
4
5
 
5
6
  module Scrape
@@ -32,5 +33,10 @@ module Scrape
32
33
  def load_scrapefile path
33
34
  Application.new path
34
35
  end
36
+
37
+ def open url, headers = {}, &block
38
+ headers = {"User-Agent" => user_agent}.merge(headers)
39
+ super(url, headers, &block).read
40
+ end
35
41
  end
36
42
  end
@@ -0,0 +1,25 @@
1
+ require "test_helper"
2
+
3
+ class ScrapeTest < Scrape::TestCase
4
+ test "#user_agent should return default when not set" do
5
+ assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
6
+ end
7
+
8
+ test "#load_scrapefile should return a new application" do
9
+ app = Scrape.load_scrapefile '.'
10
+ assert_kind_of Scrape::Application, app
11
+ end
12
+
13
+ test "#open should send a request to the specified url and return the contents" do
14
+ stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
15
+ assert_equal "booyah", Scrape.open("http://example.com")
16
+ end
17
+
18
+ test "#open should set the user agent in the request header" do
19
+ stub_request(:get, "http://example.com/").
20
+ with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
21
+ to_return(:status => 200, :body => "")
22
+ Scrape.open("http://example.com")
23
+ assert true
24
+ end
25
+ end
@@ -21,7 +21,7 @@ class SiteTest < Scrape::TestCase
21
21
  HTML
22
22
 
23
23
  site = Scrape::Site.new "http://www.example.com"
24
- assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
24
+ assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
25
25
  end
26
26
 
27
27
  test "#parse should return relative urls to the site" do
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
36
36
  HTML
37
37
 
38
38
  site = Scrape::Site.new "http://www.example.com"
39
- assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
39
+ assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
40
40
  end
41
41
 
42
42
  test "#parse should return no urls" do
@@ -72,4 +72,24 @@ class SiteTest < Scrape::TestCase
72
72
 
73
73
  assert ok, "Match was not invoked"
74
74
  end
75
+
76
+ test "#accept? should return true when specified url inside the site's url" do
77
+ uri = Scrape::Site.new "http://www.example.com/foo"
78
+ assert uri.accept?("http://www.example.com/foo/bar")
79
+ end
80
+
81
+ test "#normalize should return a url when string begins with a slash" do
82
+ site = Scrape::Site.new "http://www.example.com/foo"
83
+ assert_equal "http://www.example.com/bar", site.normalize("/bar")
84
+ end
85
+
86
+ test "#normalize should return a url with the string appended" do
87
+ site = Scrape::Site.new "http://www.example.com/foo"
88
+ assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
89
+ end
90
+
91
+ test "#normalize should return the string when it begins with a scheme" do
92
+ site = Scrape::Site.new "http://www.example.com/foo"
93
+ assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
94
+ end
75
95
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrape
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000 Z
12
+ date: 2012-07-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -51,7 +51,6 @@ files:
51
51
  - lib/scrape/dsl.rb
52
52
  - lib/scrape/match.rb
53
53
  - lib/scrape/site.rb
54
- - lib/scrape/uri.rb
55
54
  - lib/scrape/version.rb
56
55
  - scrape.gemspec
57
56
  - test/support/test1.scrape
@@ -61,8 +60,8 @@ files:
61
60
  - test/unit/application_test.rb
62
61
  - test/unit/default_loader_test.rb
63
62
  - test/unit/match_test.rb
63
+ - test/unit/scrape_test.rb
64
64
  - test/unit/site_test.rb
65
- - test/unit/uri_test.rb
66
65
  homepage: http://github.com/evilmarty/scrape
67
66
  licenses: []
68
67
  post_install_message:
data/lib/scrape/uri.rb DELETED
@@ -1,58 +0,0 @@
1
- require 'uri'
2
- require 'open-uri'
3
-
4
- class Scrape::URI
5
- def initialize uri = nil
6
- @uri = case uri
7
- when URI then uri.clone
8
- when NilClass then URI.new
9
- else URI.parse uri.to_s
10
- end
11
- end
12
-
13
- %w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
14
- class_eval <<-EOT, __FILE__, __LINE__ + 1
15
- def #{method_name}
16
- @uri.#{method_name}
17
- end
18
- EOT
19
- end
20
-
21
- %w[fragment host hostname password path port query scheme user].each do |method_name|
22
- class_eval <<-EOT, __FILE__, __LINE__ + 1
23
- def #{method_name}= value
24
- @uri.#{method_name} = value
25
- end
26
- EOT
27
- end
28
-
29
- def + url
30
- return clone if self == url
31
- relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
32
- uri = self.class.new @uri.merge(url)
33
- uri.path = "#{@uri.path}#{uri.path}" if relative
34
- uri
35
- end
36
-
37
- def < url
38
- url[0, length] == to_s
39
- end
40
-
41
- def [] *args
42
- to_s[*args]
43
- end
44
-
45
- def == url
46
- to_s == url.to_s
47
- end
48
-
49
- def length
50
- to_s.length
51
- end
52
- alias_method :size, :length
53
-
54
- def open headers = {}, &block
55
- headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
56
- super(to_s, headers, &block).read
57
- end
58
- end
@@ -1,52 +0,0 @@
1
- require "test_helper"
2
-
3
- class URITest < Scrape::TestCase
4
- {fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
5
- test "##{method_name} should return value" do
6
- uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
7
- assert_equal value, uri.send(method_name)
8
- end
9
- end
10
-
11
- test "#open should return the contents at the url" do
12
- stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
13
-
14
- uri = Scrape::URI.new "http://www.example.com"
15
- assert_equal "Howdie", uri.open
16
- end
17
-
18
- test "#+ should return a URI with the specified path" do
19
- uri1 = Scrape::URI.new "http://www.example.com"
20
- uri2 = uri1 + "/bar"
21
- assert_equal "http://www.example.com/bar", uri2.to_s
22
- end
23
-
24
- test "#+ should return a URI overwriting with the specified path" do
25
- uri1 = Scrape::URI.new "http://www.example.com/foo"
26
- uri2 = uri1 + "/bar"
27
- assert_equal "http://www.example.com/bar", uri2.to_s
28
- end
29
-
30
- test "#+ should return a URI with the specified path appended" do
31
- uri1 = Scrape::URI.new "http://www.example.com/foo"
32
- uri2 = uri1 + "bar"
33
- assert_equal "http://www.example.com/foo/bar", uri2.to_s
34
- end
35
-
36
- test "#+ should return a URI from the absolute url" do
37
- uri1 = Scrape::URI.new "http://www.example.com/foo"
38
- uri2 = uri1 + "http://www.example.com/bar"
39
- assert_equal "http://www.example.com/bar", uri2.to_s
40
- end
41
-
42
- test "#+ should return a URI appended from the absolute url" do
43
- uri1 = Scrape::URI.new "http://www.example.com/foo"
44
- uri2 = uri1 + "http://www.example.com/foo/bar"
45
- assert_equal "http://www.example.com/foo/bar", uri2.to_s
46
- end
47
-
48
- test "#< should return true when specified url is greater" do
49
- uri1 = Scrape::URI.new "http://www.example.com/foo"
50
- assert uri1 < "http://www.example.com/foo/bar"
51
- end
52
- end