skyscraper 0.0.5 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -1
- data/lib/skyscraper.rb +2 -0
- data/lib/skyscraper/document.rb +21 -1
- data/lib/skyscraper/node.rb +15 -0
- data/lib/skyscraper/path/base.rb +8 -0
- data/lib/skyscraper/results.rb +5 -2
- data/lib/skyscraper/version.rb +1 -1
- data/spec/skyscraper/skyscraper/base_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +8 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +29 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +16 -0
- data/spec/skyscraper/skyscraper/resource_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +18 -1
- data/spec/skyscraper/skyscraper_spec.rb +6 -0
- data/spec/test_files/skyscraper-node-form.html +13 -0
- metadata +12 -10
data/.rspec
CHANGED
@@ -1 +1 @@
|
|
1
|
-
--color
|
1
|
+
--color --fail-fast --format nested
|
data/lib/skyscraper.rb
CHANGED
@@ -2,6 +2,7 @@ require "open-uri"
|
|
2
2
|
require "uri"
|
3
3
|
require "nokogiri"
|
4
4
|
require "active_support/core_ext"
|
5
|
+
require "net/http"
|
5
6
|
|
6
7
|
include ActiveSupport
|
7
8
|
|
@@ -38,6 +39,7 @@ module Skyscraper
|
|
38
39
|
Node.new document.css("html")
|
39
40
|
end
|
40
41
|
|
42
|
+
|
41
43
|
def fetch
|
42
44
|
self.class.send(:base).fetch
|
43
45
|
end
|
data/lib/skyscraper/document.rb
CHANGED
@@ -3,9 +3,29 @@ module Skyscraper
|
|
3
3
|
attr_accessor :path
|
4
4
|
|
5
5
|
def self.load path, encoding = 'utf-8'
|
6
|
-
document = Skyscraper::Document.parse
|
6
|
+
document = Skyscraper::Document.parse open_from_path(path), nil, encoding
|
7
7
|
document.path = Skyscraper::Path.factory(path)
|
8
8
|
document
|
9
9
|
end
|
10
|
+
|
11
|
+
def self.load_post path, params = {}, encoding = 'utf-8'
|
12
|
+
file = Net::HTTP.post_form(URI.parse(path), params).body
|
13
|
+
|
14
|
+
document = Skyscraper::Document.parse file
|
15
|
+
document.path = path
|
16
|
+
document
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.open_from_path path
|
20
|
+
begin
|
21
|
+
file = open(path)
|
22
|
+
rescue RuntimeError
|
23
|
+
https_path = path.gsub /http:/, "https:"
|
24
|
+
file = open(https_path) unless https_path == path
|
25
|
+
end
|
26
|
+
|
27
|
+
file
|
28
|
+
end
|
29
|
+
|
10
30
|
end
|
11
31
|
end
|
data/lib/skyscraper/node.rb
CHANGED
@@ -97,5 +97,20 @@ module Skyscraper
|
|
97
97
|
def tag
|
98
98
|
@element.name
|
99
99
|
end
|
100
|
+
|
101
|
+
def submit params = {}
|
102
|
+
raise Skyscraper::LocalFormException if @element.document.path.local?
|
103
|
+
raise Skyscraper::NotActionException if self.action.blank?
|
104
|
+
|
105
|
+
path = @element.document.path.full_path_for(self.action)
|
106
|
+
document = Skyscraper::Document.load_post path, params
|
107
|
+
|
108
|
+
Node.new(document.css("html"))
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
class LocalFormException < Exception
|
113
|
+
end
|
114
|
+
class NotActionException < Exception
|
100
115
|
end
|
101
116
|
end
|
data/lib/skyscraper/path/base.rb
CHANGED
data/lib/skyscraper/results.rb
CHANGED
@@ -40,8 +40,9 @@ module Skyscraper
|
|
40
40
|
result[field.name] = field.find_in_document document
|
41
41
|
end
|
42
42
|
|
43
|
-
call_callbacks @after_each, result, document
|
43
|
+
callback_result = call_callbacks @after_each, result, document
|
44
44
|
results << result
|
45
|
+
break if callback_result === false
|
45
46
|
sleep @delay[:sleep] if (i+1) % @delay[:after] == 0
|
46
47
|
|
47
48
|
rescue SocketError, Errno::ENOENT
|
@@ -82,9 +83,11 @@ module Skyscraper
|
|
82
83
|
end
|
83
84
|
|
84
85
|
def call_callbacks callbacks, *args
|
86
|
+
result = true
|
85
87
|
callbacks.each do |callback|
|
86
|
-
callback.call(*args)
|
88
|
+
result = callback.call(*args)
|
87
89
|
end
|
90
|
+
result
|
88
91
|
end
|
89
92
|
end
|
90
93
|
|
data/lib/skyscraper/version.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#encoding: utf-8
|
2
|
+
require "spec_helper"
|
2
3
|
|
3
4
|
describe Skyscraper::Document do
|
4
5
|
it "should support utf-8 encoding by default in remote pages" do
|
@@ -11,4 +12,11 @@ describe Skyscraper::Document do
|
|
11
12
|
document = Skyscraper::Document::load(path_to("skyscraper-document.html"))
|
12
13
|
document.path.should be_an Skyscraper::Path::Base
|
13
14
|
end
|
15
|
+
|
16
|
+
describe "when is opening" do
|
17
|
+
it "should handle https redirects" do
|
18
|
+
Skyscraper::Document.open_from_path("http://github.com").should be_an_instance_of Tempfile
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
14
22
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
1
3
|
describe Skyscraper::Node do
|
2
4
|
describe "when is initialized" do
|
3
5
|
before(:each) do
|
@@ -83,5 +85,32 @@ describe Skyscraper::Node do
|
|
83
85
|
@node.tag.should == "ul"
|
84
86
|
end
|
85
87
|
end
|
88
|
+
|
89
|
+
describe "Submit post data" do
|
90
|
+
it "should submit form post data" do
|
91
|
+
node = Skyscraper::fetch("http://www.balticplaza.eu/kontakt").first("#new_inquiry")
|
92
|
+
submited_page = node.submit(:"inquiry[name]" => "Example name")
|
93
|
+
submited_page.first("#inquiry_name").value.should == "Example name"
|
94
|
+
end
|
95
|
+
|
96
|
+
it "should throws an LocalFormException" do
|
97
|
+
lambda do
|
98
|
+
node = Skyscraper::fetch(path_to("skyscraper-node-form.html"))
|
99
|
+
node.first("form").submit
|
100
|
+
end.should raise_error Skyscraper::LocalFormException
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should throws NotActionException" do
|
104
|
+
Skyscraper::Path::Local.any_instance.stub(:local? => false)
|
105
|
+
|
106
|
+
lambda do
|
107
|
+
Skyscraper::fetch(path_to("skyscraper-node-traversing.html")).first(".menu").submit
|
108
|
+
end.should raise_error Skyscraper::NotActionException
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should handle GET form method" do
|
112
|
+
pending "todo"
|
113
|
+
end
|
114
|
+
end
|
86
115
|
end
|
87
116
|
|
@@ -6,6 +6,14 @@ describe Skyscraper::Path do
|
|
6
6
|
@path = Skyscraper::Path.factory("http://google.com/index.php?q=e")
|
7
7
|
end
|
8
8
|
|
9
|
+
it "local? method should returns false" do
|
10
|
+
@path.local?.should == false
|
11
|
+
end
|
12
|
+
|
13
|
+
it "remote? method should returns true" do
|
14
|
+
@path.remote?.should == true
|
15
|
+
end
|
16
|
+
|
9
17
|
it "should returns domain" do
|
10
18
|
@path.domain.should == "http://google.com"
|
11
19
|
end
|
@@ -65,6 +73,14 @@ describe Skyscraper::Path do
|
|
65
73
|
@path.folder.should == "/var/www/files/"
|
66
74
|
end
|
67
75
|
|
76
|
+
it "local? method should returns true" do
|
77
|
+
@path.local?.should == true
|
78
|
+
end
|
79
|
+
|
80
|
+
it "remote? method should returns false" do
|
81
|
+
@path.remote?.should == false
|
82
|
+
end
|
83
|
+
|
68
84
|
it "should returns file name" do
|
69
85
|
@path.file_name.should == "file.ext"
|
70
86
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
1
3
|
describe Skyscraper::Results do
|
2
4
|
def fetch options = {}
|
3
5
|
options.reverse_merge! fields: {}, options: {}
|
@@ -72,7 +74,7 @@ describe Skyscraper::Results do
|
|
72
74
|
page.should be_an_instance_of(Skyscraper::Node)
|
73
75
|
@call_count += 1
|
74
76
|
end
|
75
|
-
|
77
|
+
|
76
78
|
results = fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { after_each: [callback] }
|
77
79
|
|
78
80
|
@call_count.should == 10
|
@@ -117,6 +119,21 @@ describe Skyscraper::Results do
|
|
117
119
|
results[0][:h1].should == "Hello world"
|
118
120
|
end
|
119
121
|
|
122
|
+
it "should stops when after each callback returns false" do
|
123
|
+
counter = 0
|
124
|
+
|
125
|
+
callback = proc do
|
126
|
+
if counter == 1
|
127
|
+
counter = 0
|
128
|
+
false
|
129
|
+
else
|
130
|
+
counter += 1
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
|
135
|
+
results.length.should == 2
|
136
|
+
end
|
120
137
|
end
|
121
138
|
|
122
139
|
describe "errors" do
|
@@ -1,5 +1,7 @@
|
|
1
1
|
#encoding: utf-8
|
2
2
|
|
3
|
+
require "spec_helper"
|
4
|
+
|
3
5
|
class TestScraper
|
4
6
|
include Skyscraper
|
5
7
|
|
@@ -27,6 +29,10 @@ describe Skyscraper do
|
|
27
29
|
Skyscraper::fetch(path_to("skyscraper-encoding.html")).first(".utf-8").text.should == "ąśćżół"
|
28
30
|
end
|
29
31
|
|
32
|
+
it "should handle http -> https redirects" do
|
33
|
+
Skyscraper::fetch("http://github.com").first("title").text.should =~ /GitHub/
|
34
|
+
end
|
35
|
+
|
30
36
|
it "should works when included" do
|
31
37
|
TestScraper.new.fetch[0][:h1].should == "Hello world"
|
32
38
|
TestScraper.new.fetch[1][:h1].should == "Hello world"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: skyscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &78066490 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *78066490
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake
|
27
|
-
requirement: &
|
27
|
+
requirement: &78128330 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *78128330
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &78126140 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *78126140
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: actionpack
|
49
|
-
requirement: &
|
49
|
+
requirement: &78123100 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *78123100
|
58
58
|
description: Easy to use DSL that helps scraping data from websites. Thanks to it,
|
59
59
|
writing web crawlers would be very fast and intuitive. Traversing through html nodes
|
60
60
|
and fetching all of the HTML attributes, would be possible. Just like in jQuery
|
@@ -109,6 +109,7 @@ files:
|
|
109
109
|
- spec/test_files/skyscraper-field.html
|
110
110
|
- spec/test_files/skyscraper-node-a.html
|
111
111
|
- spec/test_files/skyscraper-node-b.html
|
112
|
+
- spec/test_files/skyscraper-node-form.html
|
112
113
|
- spec/test_files/skyscraper-node-traversing.html
|
113
114
|
- spec/test_files/skyscraper-node.html
|
114
115
|
- spec/test_files/skyscraper-pages.html
|
@@ -162,6 +163,7 @@ test_files:
|
|
162
163
|
- spec/test_files/skyscraper-field.html
|
163
164
|
- spec/test_files/skyscraper-node-a.html
|
164
165
|
- spec/test_files/skyscraper-node-b.html
|
166
|
+
- spec/test_files/skyscraper-node-form.html
|
165
167
|
- spec/test_files/skyscraper-node-traversing.html
|
166
168
|
- spec/test_files/skyscraper-node.html
|
167
169
|
- spec/test_files/skyscraper-pages.html
|