scrape 0.1 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrape (0.1)
4
+ scrape (0.1.1)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -30,7 +30,7 @@ class Scrape::Application
30
30
 
31
31
  def reset
32
32
  @history = []
33
- @queue = sites.values.map{|site| site.url.to_s }
33
+ @queue = sites.values.map{|site| site.to_s }
34
34
  end
35
35
 
36
36
  def queue
@@ -44,7 +44,7 @@ class Scrape::Application
44
44
  end
45
45
 
46
46
  def [] url
47
- @sites.values.detect{|site| site.url < url }
47
+ @sites.values.detect{|site| site.accept? url }
48
48
  end
49
49
 
50
50
  def load_scrapefile
data/lib/scrape/site.rb CHANGED
@@ -1,10 +1,11 @@
1
+ require 'uri'
1
2
  require 'nokogiri'
2
3
 
3
4
  class Scrape::Site
4
5
  attr_reader :url, :matches
5
6
 
6
7
  def initialize url
7
- @url = Scrape::URI.new url
8
+ @url = URI.parse url
8
9
  @url.query = nil
9
10
  @url.fragment = nil
10
11
  @matches = []
@@ -17,15 +18,27 @@ class Scrape::Site
17
18
  end
18
19
 
19
20
  def parse url
20
- url = self.url + url
21
- doc = Nokogiri::HTML url.open
21
+ url = normalize url
22
+ doc = Nokogiri::HTML Scrape.open(url)
22
23
 
23
24
  @matches.each{|match| match.invoke doc if match =~ url }
24
25
 
25
- urls = doc.css("a[href]").map do |node|
26
- href = self.url + node['href']
27
- self.url < href ? href : nil
26
+ doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
27
+ end
28
+
29
+ def accept? url
30
+ url.to_s[0, to_s.length] == to_s
31
+ end
32
+
33
+ def normalize url
34
+ case url
35
+ when /^.+:\/\// then url.dup
36
+ when /^\// then @url.merge(url).to_s
37
+ else @url.merge("#{@url.path}/#{url}").to_s
28
38
  end
29
- urls.compact
39
+ end
40
+
41
+ def to_s
42
+ url.to_s
30
43
  end
31
44
  end
data/lib/scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Scrape
2
- VERSION = '0.1' unless defined? ::Scrape::VERSION
2
+ VERSION = '0.1.1' unless defined? ::Scrape::VERSION
3
3
  end
data/lib/scrape.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "rubygems"
2
2
  require "logger"
3
+ require "open-uri"
3
4
  require "bundler/setup"
4
5
 
5
6
  module Scrape
@@ -32,5 +33,10 @@ module Scrape
32
33
  def load_scrapefile path
33
34
  Application.new path
34
35
  end
36
+
37
+ def open url, headers = {}, &block
38
+ headers = {"User-Agent" => user_agent}.merge(headers)
39
+ super(url, headers, &block).read
40
+ end
35
41
  end
36
42
  end
data/test/unit/scrape_test.rb ADDED
@@ -0,0 +1,25 @@
1
+ require "test_helper"
2
+
3
+ class ScrapeTest < Scrape::TestCase
4
+ test "#user_agent should return default when not set" do
5
+ assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
6
+ end
7
+
8
+ test "#load_scrapefile should return a new application" do
9
+ app = Scrape.load_scrapefile '.'
10
+ assert_kind_of Scrape::Application, app
11
+ end
12
+
13
+ test "#open should send a request to the specified url and return the contents" do
14
+ stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
15
+ assert_equal "booyah", Scrape.open("http://example.com")
16
+ end
17
+
18
+ test "#open should set the user agent in the request header" do
19
+ stub_request(:get, "http://example.com/").
20
+ with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
21
+ to_return(:status => 200, :body => "")
22
+ Scrape.open("http://example.com")
23
+ assert true
24
+ end
25
+ end
data/test/unit/site_test.rb CHANGED
@@ -21,7 +21,7 @@ class SiteTest < Scrape::TestCase
21
21
  HTML
22
22
 
23
23
  site = Scrape::Site.new "http://www.example.com"
24
- assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
24
+ assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
25
25
  end
26
26
 
27
27
  test "#parse should return relative urls to the site" do
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
36
36
  HTML
37
37
 
38
38
  site = Scrape::Site.new "http://www.example.com"
39
- assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
39
+ assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
40
40
  end
41
41
 
42
42
  test "#parse should return no urls" do
@@ -72,4 +72,24 @@ class SiteTest < Scrape::TestCase
72
72
 
73
73
  assert ok, "Match was not invoked"
74
74
  end
75
+
76
+ test "#accept? should return true when specified url inside the site's url" do
77
+ uri = Scrape::Site.new "http://www.example.com/foo"
78
+ assert uri.accept?("http://www.example.com/foo/bar")
79
+ end
80
+
81
+ test "#normalize should return a url when string begins with a slash" do
82
+ site = Scrape::Site.new "http://www.example.com/foo"
83
+ assert_equal "http://www.example.com/bar", site.normalize("/bar")
84
+ end
85
+
86
+ test "#normalize should return a url with the string appended" do
87
+ site = Scrape::Site.new "http://www.example.com/foo"
88
+ assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
89
+ end
90
+
91
+ test "#normalize should return the string when it begins with a scheme" do
92
+ site = Scrape::Site.new "http://www.example.com/foo"
93
+ assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
94
+ end
75
95
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrape
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000 Z
12
+ date: 2012-07-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -51,7 +51,6 @@ files:
51
51
  - lib/scrape/dsl.rb
52
52
  - lib/scrape/match.rb
53
53
  - lib/scrape/site.rb
54
- - lib/scrape/uri.rb
55
54
  - lib/scrape/version.rb
56
55
  - scrape.gemspec
57
56
  - test/support/test1.scrape
@@ -61,8 +60,8 @@ files:
61
60
  - test/unit/application_test.rb
62
61
  - test/unit/default_loader_test.rb
63
62
  - test/unit/match_test.rb
63
+ - test/unit/scrape_test.rb
64
64
  - test/unit/site_test.rb
65
- - test/unit/uri_test.rb
66
65
  homepage: http://github.com/evilmarty/scrape
67
66
  licenses: []
68
67
  post_install_message:
data/lib/scrape/uri.rb DELETED
@@ -1,58 +0,0 @@
1
- require 'uri'
2
- require 'open-uri'
3
-
4
- class Scrape::URI
5
- def initialize uri = nil
6
- @uri = case uri
7
- when URI then uri.clone
8
- when NilClass then URI.new
9
- else URI.parse uri.to_s
10
- end
11
- end
12
-
13
- %w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
14
- class_eval <<-EOT, __FILE__, __LINE__ + 1
15
- def #{method_name}
16
- @uri.#{method_name}
17
- end
18
- EOT
19
- end
20
-
21
- %w[fragment host hostname password path port query scheme user].each do |method_name|
22
- class_eval <<-EOT, __FILE__, __LINE__ + 1
23
- def #{method_name}= value
24
- @uri.#{method_name} = value
25
- end
26
- EOT
27
- end
28
-
29
- def + url
30
- return clone if self == url
31
- relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
32
- uri = self.class.new @uri.merge(url)
33
- uri.path = "#{@uri.path}#{uri.path}" if relative
34
- uri
35
- end
36
-
37
- def < url
38
- url[0, length] == to_s
39
- end
40
-
41
- def [] *args
42
- to_s[*args]
43
- end
44
-
45
- def == url
46
- to_s == url.to_s
47
- end
48
-
49
- def length
50
- to_s.length
51
- end
52
- alias_method :size, :length
53
-
54
- def open headers = {}, &block
55
- headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
56
- super(to_s, headers, &block).read
57
- end
58
- end
data/test/unit/uri_test.rb DELETED
@@ -1,52 +0,0 @@
1
- require "test_helper"
2
-
3
- class URITest < Scrape::TestCase
4
- {fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
5
- test "##{method_name} should return value" do
6
- uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
7
- assert_equal value, uri.send(method_name)
8
- end
9
- end
10
-
11
- test "#open should return the contents at the url" do
12
- stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
13
-
14
- uri = Scrape::URI.new "http://www.example.com"
15
- assert_equal "Howdie", uri.open
16
- end
17
-
18
- test "#+ should return a URI with the specified path" do
19
- uri1 = Scrape::URI.new "http://www.example.com"
20
- uri2 = uri1 + "/bar"
21
- assert_equal "http://www.example.com/bar", uri2.to_s
22
- end
23
-
24
- test "#+ should return a URI overwriting with the specified path" do
25
- uri1 = Scrape::URI.new "http://www.example.com/foo"
26
- uri2 = uri1 + "/bar"
27
- assert_equal "http://www.example.com/bar", uri2.to_s
28
- end
29
-
30
- test "#+ should return a URI with the specified path appended" do
31
- uri1 = Scrape::URI.new "http://www.example.com/foo"
32
- uri2 = uri1 + "bar"
33
- assert_equal "http://www.example.com/foo/bar", uri2.to_s
34
- end
35
-
36
- test "#+ should return a URI from the absolute url" do
37
- uri1 = Scrape::URI.new "http://www.example.com/foo"
38
- uri2 = uri1 + "http://www.example.com/bar"
39
- assert_equal "http://www.example.com/bar", uri2.to_s
40
- end
41
-
42
- test "#+ should return a URI appended from the absolute url" do
43
- uri1 = Scrape::URI.new "http://www.example.com/foo"
44
- uri2 = uri1 + "http://www.example.com/foo/bar"
45
- assert_equal "http://www.example.com/foo/bar", uri2.to_s
46
- end
47
-
48
- test "#< should return true when specified url is greater" do
49
- uri1 = Scrape::URI.new "http://www.example.com/foo"
50
- assert uri1 < "http://www.example.com/foo/bar"
51
- end
52
- end