scrape 0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +1 -1
- data/lib/scrape/application.rb +2 -2
- data/lib/scrape/site.rb +20 -7
- data/lib/scrape/version.rb +1 -1
- data/lib/scrape.rb +6 -0
- data/test/unit/scrape_test.rb +25 -0
- data/test/unit/site_test.rb +22 -2
- metadata +3 -4
- data/lib/scrape/uri.rb +0 -58
- data/test/unit/uri_test.rb +0 -52
data/Gemfile.lock
CHANGED
data/lib/scrape/application.rb
CHANGED
@@ -30,7 +30,7 @@ class Scrape::Application
|
|
30
30
|
|
31
31
|
def reset
|
32
32
|
@history = []
|
33
|
-
@queue = sites.values.map{|site| site.
|
33
|
+
@queue = sites.values.map{|site| site.to_s }
|
34
34
|
end
|
35
35
|
|
36
36
|
def queue
|
@@ -44,7 +44,7 @@ class Scrape::Application
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def [] url
|
47
|
-
@sites.values.detect{|site| site.
|
47
|
+
@sites.values.detect{|site| site.accept? url }
|
48
48
|
end
|
49
49
|
|
50
50
|
def load_scrapefile
|
data/lib/scrape/site.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
+
require 'uri'
|
1
2
|
require 'nokogiri'
|
2
3
|
|
3
4
|
class Scrape::Site
|
4
5
|
attr_reader :url, :matches
|
5
6
|
|
6
7
|
def initialize url
|
7
|
-
@url =
|
8
|
+
@url = URI.parse url
|
8
9
|
@url.query = nil
|
9
10
|
@url.fragment = nil
|
10
11
|
@matches = []
|
@@ -17,15 +18,27 @@ class Scrape::Site
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def parse url
|
20
|
-
url =
|
21
|
-
doc = Nokogiri::HTML
|
21
|
+
url = normalize url
|
22
|
+
doc = Nokogiri::HTML Scrape.open(url)
|
22
23
|
|
23
24
|
@matches.each{|match| match.invoke doc if match =~ url }
|
24
25
|
|
25
|
-
|
26
|
-
|
27
|
-
|
26
|
+
doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
|
27
|
+
end
|
28
|
+
|
29
|
+
def accept? url
|
30
|
+
url.to_s[0, to_s.length] == to_s
|
31
|
+
end
|
32
|
+
|
33
|
+
def normalize url
|
34
|
+
case url
|
35
|
+
when /^.+:\/\// then url.dup
|
36
|
+
when /^\// then @url.merge(url).to_s
|
37
|
+
else @url.merge("#{@url.path}/#{url}").to_s
|
28
38
|
end
|
29
|
-
|
39
|
+
end
|
40
|
+
|
41
|
+
def to_s
|
42
|
+
url.to_s
|
30
43
|
end
|
31
44
|
end
|
data/lib/scrape/version.rb
CHANGED
data/lib/scrape.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "rubygems"
|
2
2
|
require "logger"
|
3
|
+
require "open-uri"
|
3
4
|
require "bundler/setup"
|
4
5
|
|
5
6
|
module Scrape
|
@@ -32,5 +33,10 @@ module Scrape
|
|
32
33
|
def load_scrapefile path
|
33
34
|
Application.new path
|
34
35
|
end
|
36
|
+
|
37
|
+
def open url, headers = {}, &block
|
38
|
+
headers = {"User-Agent" => user_agent}.merge(headers)
|
39
|
+
super(url, headers, &block).read
|
40
|
+
end
|
35
41
|
end
|
36
42
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require "test_helper"
|
2
|
+
|
3
|
+
class ScrapeTest < Scrape::TestCase
|
4
|
+
test "#user_agent should return default when not set" do
|
5
|
+
assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
|
6
|
+
end
|
7
|
+
|
8
|
+
test "#load_scrapefile should return a new application" do
|
9
|
+
app = Scrape.load_scrapefile '.'
|
10
|
+
assert_kind_of Scrape::Application, app
|
11
|
+
end
|
12
|
+
|
13
|
+
test "#open should send a request to the specified url and return the contents" do
|
14
|
+
stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
|
15
|
+
assert_equal "booyah", Scrape.open("http://example.com")
|
16
|
+
end
|
17
|
+
|
18
|
+
test "#open should set the user agent in the request header" do
|
19
|
+
stub_request(:get, "http://example.com/").
|
20
|
+
with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
|
21
|
+
to_return(:status => 200, :body => "")
|
22
|
+
Scrape.open("http://example.com")
|
23
|
+
assert true
|
24
|
+
end
|
25
|
+
end
|
data/test/unit/site_test.rb
CHANGED
@@ -21,7 +21,7 @@ class SiteTest < Scrape::TestCase
|
|
21
21
|
HTML
|
22
22
|
|
23
23
|
site = Scrape::Site.new "http://www.example.com"
|
24
|
-
assert_equal [
|
24
|
+
assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
|
25
25
|
end
|
26
26
|
|
27
27
|
test "#parse should return relative urls to the site" do
|
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
|
|
36
36
|
HTML
|
37
37
|
|
38
38
|
site = Scrape::Site.new "http://www.example.com"
|
39
|
-
assert_equal [
|
39
|
+
assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
|
40
40
|
end
|
41
41
|
|
42
42
|
test "#parse should return no urls" do
|
@@ -72,4 +72,24 @@ class SiteTest < Scrape::TestCase
|
|
72
72
|
|
73
73
|
assert ok, "Match was not invoked"
|
74
74
|
end
|
75
|
+
|
76
|
+
test "#accept? should return true when specified url inside the site's url" do
|
77
|
+
uri = Scrape::Site.new "http://www.example.com/foo"
|
78
|
+
assert uri.accept?("http://www.example.com/foo/bar")
|
79
|
+
end
|
80
|
+
|
81
|
+
test "#normalize should return a url when string begins with a slash" do
|
82
|
+
site = Scrape::Site.new "http://www.example.com/foo"
|
83
|
+
assert_equal "http://www.example.com/bar", site.normalize("/bar")
|
84
|
+
end
|
85
|
+
|
86
|
+
test "#normalize should return a url with the string appended" do
|
87
|
+
site = Scrape::Site.new "http://www.example.com/foo"
|
88
|
+
assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
|
89
|
+
end
|
90
|
+
|
91
|
+
test "#normalize should return the string when it begins with a scheme" do
|
92
|
+
site = Scrape::Site.new "http://www.example.com/foo"
|
93
|
+
assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
|
94
|
+
end
|
75
95
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -51,7 +51,6 @@ files:
|
|
51
51
|
- lib/scrape/dsl.rb
|
52
52
|
- lib/scrape/match.rb
|
53
53
|
- lib/scrape/site.rb
|
54
|
-
- lib/scrape/uri.rb
|
55
54
|
- lib/scrape/version.rb
|
56
55
|
- scrape.gemspec
|
57
56
|
- test/support/test1.scrape
|
@@ -61,8 +60,8 @@ files:
|
|
61
60
|
- test/unit/application_test.rb
|
62
61
|
- test/unit/default_loader_test.rb
|
63
62
|
- test/unit/match_test.rb
|
63
|
+
- test/unit/scrape_test.rb
|
64
64
|
- test/unit/site_test.rb
|
65
|
-
- test/unit/uri_test.rb
|
66
65
|
homepage: http://github.com/evilmarty/scrape
|
67
66
|
licenses: []
|
68
67
|
post_install_message:
|
data/lib/scrape/uri.rb
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
require 'uri'
|
2
|
-
require 'open-uri'
|
3
|
-
|
4
|
-
class Scrape::URI
|
5
|
-
def initialize uri = nil
|
6
|
-
@uri = case uri
|
7
|
-
when URI then uri.clone
|
8
|
-
when NilClass then URI.new
|
9
|
-
else URI.parse uri.to_s
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
%w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
|
14
|
-
class_eval <<-EOT, __FILE__, __LINE__ + 1
|
15
|
-
def #{method_name}
|
16
|
-
@uri.#{method_name}
|
17
|
-
end
|
18
|
-
EOT
|
19
|
-
end
|
20
|
-
|
21
|
-
%w[fragment host hostname password path port query scheme user].each do |method_name|
|
22
|
-
class_eval <<-EOT, __FILE__, __LINE__ + 1
|
23
|
-
def #{method_name}= value
|
24
|
-
@uri.#{method_name} = value
|
25
|
-
end
|
26
|
-
EOT
|
27
|
-
end
|
28
|
-
|
29
|
-
def + url
|
30
|
-
return clone if self == url
|
31
|
-
relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
|
32
|
-
uri = self.class.new @uri.merge(url)
|
33
|
-
uri.path = "#{@uri.path}#{uri.path}" if relative
|
34
|
-
uri
|
35
|
-
end
|
36
|
-
|
37
|
-
def < url
|
38
|
-
url[0, length] == to_s
|
39
|
-
end
|
40
|
-
|
41
|
-
def [] *args
|
42
|
-
to_s[*args]
|
43
|
-
end
|
44
|
-
|
45
|
-
def == url
|
46
|
-
to_s == url.to_s
|
47
|
-
end
|
48
|
-
|
49
|
-
def length
|
50
|
-
to_s.length
|
51
|
-
end
|
52
|
-
alias_method :size, :length
|
53
|
-
|
54
|
-
def open headers = {}, &block
|
55
|
-
headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
|
56
|
-
super(to_s, headers, &block).read
|
57
|
-
end
|
58
|
-
end
|
data/test/unit/uri_test.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
require "test_helper"
|
2
|
-
|
3
|
-
class URITest < Scrape::TestCase
|
4
|
-
{fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
|
5
|
-
test "##{method_name} should return value" do
|
6
|
-
uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
|
7
|
-
assert_equal value, uri.send(method_name)
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
11
|
-
test "#open should return the contents at the url" do
|
12
|
-
stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
|
13
|
-
|
14
|
-
uri = Scrape::URI.new "http://www.example.com"
|
15
|
-
assert_equal "Howdie", uri.open
|
16
|
-
end
|
17
|
-
|
18
|
-
test "#+ should return a URI with the specified path" do
|
19
|
-
uri1 = Scrape::URI.new "http://www.example.com"
|
20
|
-
uri2 = uri1 + "/bar"
|
21
|
-
assert_equal "http://www.example.com/bar", uri2.to_s
|
22
|
-
end
|
23
|
-
|
24
|
-
test "#+ should return a URI overwriting with the specified path" do
|
25
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
26
|
-
uri2 = uri1 + "/bar"
|
27
|
-
assert_equal "http://www.example.com/bar", uri2.to_s
|
28
|
-
end
|
29
|
-
|
30
|
-
test "#+ should return a URI with the specified path appended" do
|
31
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
32
|
-
uri2 = uri1 + "bar"
|
33
|
-
assert_equal "http://www.example.com/foo/bar", uri2.to_s
|
34
|
-
end
|
35
|
-
|
36
|
-
test "#+ should return a URI from the absolute url" do
|
37
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
38
|
-
uri2 = uri1 + "http://www.example.com/bar"
|
39
|
-
assert_equal "http://www.example.com/bar", uri2.to_s
|
40
|
-
end
|
41
|
-
|
42
|
-
test "#+ should return a URI appended from the absolute url" do
|
43
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
44
|
-
uri2 = uri1 + "http://www.example.com/foo/bar"
|
45
|
-
assert_equal "http://www.example.com/foo/bar", uri2.to_s
|
46
|
-
end
|
47
|
-
|
48
|
-
test "#< should return true when specified url is greater" do
|
49
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
50
|
-
assert uri1 < "http://www.example.com/foo/bar"
|
51
|
-
end
|
52
|
-
end
|