scrape 0.1 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +1 -1
- data/lib/scrape/application.rb +2 -2
- data/lib/scrape/site.rb +20 -7
- data/lib/scrape/version.rb +1 -1
- data/lib/scrape.rb +6 -0
- data/test/unit/scrape_test.rb +25 -0
- data/test/unit/site_test.rb +22 -2
- metadata +3 -4
- data/lib/scrape/uri.rb +0 -58
- data/test/unit/uri_test.rb +0 -52
data/Gemfile.lock
CHANGED
data/lib/scrape/application.rb
CHANGED
@@ -30,7 +30,7 @@ class Scrape::Application
|
|
30
30
|
|
31
31
|
def reset
|
32
32
|
@history = []
|
33
|
-
@queue = sites.values.map{|site| site.
|
33
|
+
@queue = sites.values.map{|site| site.to_s }
|
34
34
|
end
|
35
35
|
|
36
36
|
def queue
|
@@ -44,7 +44,7 @@ class Scrape::Application
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def [] url
|
47
|
-
@sites.values.detect{|site| site.
|
47
|
+
@sites.values.detect{|site| site.accept? url }
|
48
48
|
end
|
49
49
|
|
50
50
|
def load_scrapefile
|
data/lib/scrape/site.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
+
require 'uri'
|
1
2
|
require 'nokogiri'
|
2
3
|
|
3
4
|
class Scrape::Site
|
4
5
|
attr_reader :url, :matches
|
5
6
|
|
6
7
|
def initialize url
|
7
|
-
@url =
|
8
|
+
@url = URI.parse url
|
8
9
|
@url.query = nil
|
9
10
|
@url.fragment = nil
|
10
11
|
@matches = []
|
@@ -17,15 +18,27 @@ class Scrape::Site
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def parse url
|
20
|
-
url =
|
21
|
-
doc = Nokogiri::HTML
|
21
|
+
url = normalize url
|
22
|
+
doc = Nokogiri::HTML Scrape.open(url)
|
22
23
|
|
23
24
|
@matches.each{|match| match.invoke doc if match =~ url }
|
24
25
|
|
25
|
-
|
26
|
-
|
27
|
-
|
26
|
+
doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
|
27
|
+
end
|
28
|
+
|
29
|
+
def accept? url
|
30
|
+
url.to_s[0, to_s.length] == to_s
|
31
|
+
end
|
32
|
+
|
33
|
+
def normalize url
|
34
|
+
case url
|
35
|
+
when /^.+:\/\// then url.dup
|
36
|
+
when /^\// then @url.merge(url).to_s
|
37
|
+
else @url.merge("#{@url.path}/#{url}").to_s
|
28
38
|
end
|
29
|
-
|
39
|
+
end
|
40
|
+
|
41
|
+
def to_s
|
42
|
+
url.to_s
|
30
43
|
end
|
31
44
|
end
|
data/lib/scrape/version.rb
CHANGED
data/lib/scrape.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "rubygems"
|
2
2
|
require "logger"
|
3
|
+
require "open-uri"
|
3
4
|
require "bundler/setup"
|
4
5
|
|
5
6
|
module Scrape
|
@@ -32,5 +33,10 @@ module Scrape
|
|
32
33
|
def load_scrapefile path
|
33
34
|
Application.new path
|
34
35
|
end
|
36
|
+
|
37
|
+
def open url, headers = {}, &block
|
38
|
+
headers = {"User-Agent" => user_agent}.merge(headers)
|
39
|
+
super(url, headers, &block).read
|
40
|
+
end
|
35
41
|
end
|
36
42
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require "test_helper"
|
2
|
+
|
3
|
+
class ScrapeTest < Scrape::TestCase
|
4
|
+
test "#user_agent should return default when not set" do
|
5
|
+
assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
|
6
|
+
end
|
7
|
+
|
8
|
+
test "#load_scrapefile should return a new application" do
|
9
|
+
app = Scrape.load_scrapefile '.'
|
10
|
+
assert_kind_of Scrape::Application, app
|
11
|
+
end
|
12
|
+
|
13
|
+
test "#open should send a request to the specified url and return the contents" do
|
14
|
+
stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
|
15
|
+
assert_equal "booyah", Scrape.open("http://example.com")
|
16
|
+
end
|
17
|
+
|
18
|
+
test "#open should set the user agent in the request header" do
|
19
|
+
stub_request(:get, "http://example.com/").
|
20
|
+
with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
|
21
|
+
to_return(:status => 200, :body => "")
|
22
|
+
Scrape.open("http://example.com")
|
23
|
+
assert true
|
24
|
+
end
|
25
|
+
end
|
data/test/unit/site_test.rb
CHANGED
@@ -21,7 +21,7 @@ class SiteTest < Scrape::TestCase
|
|
21
21
|
HTML
|
22
22
|
|
23
23
|
site = Scrape::Site.new "http://www.example.com"
|
24
|
-
assert_equal [
|
24
|
+
assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
|
25
25
|
end
|
26
26
|
|
27
27
|
test "#parse should return relative urls to the site" do
|
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
|
|
36
36
|
HTML
|
37
37
|
|
38
38
|
site = Scrape::Site.new "http://www.example.com"
|
39
|
-
assert_equal [
|
39
|
+
assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
|
40
40
|
end
|
41
41
|
|
42
42
|
test "#parse should return no urls" do
|
@@ -72,4 +72,24 @@ class SiteTest < Scrape::TestCase
|
|
72
72
|
|
73
73
|
assert ok, "Match was not invoked"
|
74
74
|
end
|
75
|
+
|
76
|
+
test "#accept? should return true when specified url inside the site's url" do
|
77
|
+
uri = Scrape::Site.new "http://www.example.com/foo"
|
78
|
+
assert uri.accept?("http://www.example.com/foo/bar")
|
79
|
+
end
|
80
|
+
|
81
|
+
test "#normalize should return a url when string begins with a slash" do
|
82
|
+
site = Scrape::Site.new "http://www.example.com/foo"
|
83
|
+
assert_equal "http://www.example.com/bar", site.normalize("/bar")
|
84
|
+
end
|
85
|
+
|
86
|
+
test "#normalize should return a url with the string appended" do
|
87
|
+
site = Scrape::Site.new "http://www.example.com/foo"
|
88
|
+
assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
|
89
|
+
end
|
90
|
+
|
91
|
+
test "#normalize should return the string when it begins with a scheme" do
|
92
|
+
site = Scrape::Site.new "http://www.example.com/foo"
|
93
|
+
assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
|
94
|
+
end
|
75
95
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -51,7 +51,6 @@ files:
|
|
51
51
|
- lib/scrape/dsl.rb
|
52
52
|
- lib/scrape/match.rb
|
53
53
|
- lib/scrape/site.rb
|
54
|
-
- lib/scrape/uri.rb
|
55
54
|
- lib/scrape/version.rb
|
56
55
|
- scrape.gemspec
|
57
56
|
- test/support/test1.scrape
|
@@ -61,8 +60,8 @@ files:
|
|
61
60
|
- test/unit/application_test.rb
|
62
61
|
- test/unit/default_loader_test.rb
|
63
62
|
- test/unit/match_test.rb
|
63
|
+
- test/unit/scrape_test.rb
|
64
64
|
- test/unit/site_test.rb
|
65
|
-
- test/unit/uri_test.rb
|
66
65
|
homepage: http://github.com/evilmarty/scrape
|
67
66
|
licenses: []
|
68
67
|
post_install_message:
|
data/lib/scrape/uri.rb
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
require 'uri'
|
2
|
-
require 'open-uri'
|
3
|
-
|
4
|
-
class Scrape::URI
|
5
|
-
def initialize uri = nil
|
6
|
-
@uri = case uri
|
7
|
-
when URI then uri.clone
|
8
|
-
when NilClass then URI.new
|
9
|
-
else URI.parse uri.to_s
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
%w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
|
14
|
-
class_eval <<-EOT, __FILE__, __LINE__ + 1
|
15
|
-
def #{method_name}
|
16
|
-
@uri.#{method_name}
|
17
|
-
end
|
18
|
-
EOT
|
19
|
-
end
|
20
|
-
|
21
|
-
%w[fragment host hostname password path port query scheme user].each do |method_name|
|
22
|
-
class_eval <<-EOT, __FILE__, __LINE__ + 1
|
23
|
-
def #{method_name}= value
|
24
|
-
@uri.#{method_name} = value
|
25
|
-
end
|
26
|
-
EOT
|
27
|
-
end
|
28
|
-
|
29
|
-
def + url
|
30
|
-
return clone if self == url
|
31
|
-
relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
|
32
|
-
uri = self.class.new @uri.merge(url)
|
33
|
-
uri.path = "#{@uri.path}#{uri.path}" if relative
|
34
|
-
uri
|
35
|
-
end
|
36
|
-
|
37
|
-
def < url
|
38
|
-
url[0, length] == to_s
|
39
|
-
end
|
40
|
-
|
41
|
-
def [] *args
|
42
|
-
to_s[*args]
|
43
|
-
end
|
44
|
-
|
45
|
-
def == url
|
46
|
-
to_s == url.to_s
|
47
|
-
end
|
48
|
-
|
49
|
-
def length
|
50
|
-
to_s.length
|
51
|
-
end
|
52
|
-
alias_method :size, :length
|
53
|
-
|
54
|
-
def open headers = {}, &block
|
55
|
-
headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
|
56
|
-
super(to_s, headers, &block).read
|
57
|
-
end
|
58
|
-
end
|
data/test/unit/uri_test.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
require "test_helper"
|
2
|
-
|
3
|
-
class URITest < Scrape::TestCase
|
4
|
-
{fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
|
5
|
-
test "##{method_name} should return value" do
|
6
|
-
uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
|
7
|
-
assert_equal value, uri.send(method_name)
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
11
|
-
test "#open should return the contents at the url" do
|
12
|
-
stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
|
13
|
-
|
14
|
-
uri = Scrape::URI.new "http://www.example.com"
|
15
|
-
assert_equal "Howdie", uri.open
|
16
|
-
end
|
17
|
-
|
18
|
-
test "#+ should return a URI with the specified path" do
|
19
|
-
uri1 = Scrape::URI.new "http://www.example.com"
|
20
|
-
uri2 = uri1 + "/bar"
|
21
|
-
assert_equal "http://www.example.com/bar", uri2.to_s
|
22
|
-
end
|
23
|
-
|
24
|
-
test "#+ should return a URI overwriting with the specified path" do
|
25
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
26
|
-
uri2 = uri1 + "/bar"
|
27
|
-
assert_equal "http://www.example.com/bar", uri2.to_s
|
28
|
-
end
|
29
|
-
|
30
|
-
test "#+ should return a URI with the specified path appended" do
|
31
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
32
|
-
uri2 = uri1 + "bar"
|
33
|
-
assert_equal "http://www.example.com/foo/bar", uri2.to_s
|
34
|
-
end
|
35
|
-
|
36
|
-
test "#+ should return a URI from the absolute url" do
|
37
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
38
|
-
uri2 = uri1 + "http://www.example.com/bar"
|
39
|
-
assert_equal "http://www.example.com/bar", uri2.to_s
|
40
|
-
end
|
41
|
-
|
42
|
-
test "#+ should return a URI appended from the absolute url" do
|
43
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
44
|
-
uri2 = uri1 + "http://www.example.com/foo/bar"
|
45
|
-
assert_equal "http://www.example.com/foo/bar", uri2.to_s
|
46
|
-
end
|
47
|
-
|
48
|
-
test "#< should return true when specified url is greater" do
|
49
|
-
uri1 = Scrape::URI.new "http://www.example.com/foo"
|
50
|
-
assert uri1 < "http://www.example.com/foo/bar"
|
51
|
-
end
|
52
|
-
end
|