skyscraper 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -1
- data/lib/skyscraper.rb +2 -0
- data/lib/skyscraper/document.rb +21 -1
- data/lib/skyscraper/node.rb +15 -0
- data/lib/skyscraper/path/base.rb +8 -0
- data/lib/skyscraper/results.rb +5 -2
- data/lib/skyscraper/version.rb +1 -1
- data/spec/skyscraper/skyscraper/base_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +8 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +29 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +16 -0
- data/spec/skyscraper/skyscraper/resource_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +18 -1
- data/spec/skyscraper/skyscraper_spec.rb +6 -0
- data/spec/test_files/skyscraper-node-form.html +13 -0
- metadata +12 -10
data/.rspec
CHANGED
@@ -1 +1 @@
|
|
1
|
-
--color
|
1
|
+
--color --fail-fast --format nested
|
data/lib/skyscraper.rb
CHANGED
@@ -2,6 +2,7 @@ require "open-uri"
|
|
2
2
|
require "uri"
|
3
3
|
require "nokogiri"
|
4
4
|
require "active_support/core_ext"
|
5
|
+
require "net/http"
|
5
6
|
|
6
7
|
include ActiveSupport
|
7
8
|
|
@@ -38,6 +39,7 @@ module Skyscraper
|
|
38
39
|
Node.new document.css("html")
|
39
40
|
end
|
40
41
|
|
42
|
+
|
41
43
|
def fetch
|
42
44
|
self.class.send(:base).fetch
|
43
45
|
end
|
data/lib/skyscraper/document.rb
CHANGED
@@ -3,9 +3,29 @@ module Skyscraper
|
|
3
3
|
attr_accessor :path
|
4
4
|
|
5
5
|
def self.load path, encoding = 'utf-8'
|
6
|
-
document = Skyscraper::Document.parse
|
6
|
+
document = Skyscraper::Document.parse open_from_path(path), nil, encoding
|
7
7
|
document.path = Skyscraper::Path.factory(path)
|
8
8
|
document
|
9
9
|
end
|
10
|
+
|
11
|
+
def self.load_post path, params = {}, encoding = 'utf-8'
|
12
|
+
file = Net::HTTP.post_form(URI.parse(path), params).body
|
13
|
+
|
14
|
+
document = Skyscraper::Document.parse file
|
15
|
+
document.path = path
|
16
|
+
document
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.open_from_path path
|
20
|
+
begin
|
21
|
+
file = open(path)
|
22
|
+
rescue RuntimeError
|
23
|
+
https_path = path.gsub /http:/, "https:"
|
24
|
+
file = open(https_path) unless https_path == path
|
25
|
+
end
|
26
|
+
|
27
|
+
file
|
28
|
+
end
|
29
|
+
|
10
30
|
end
|
11
31
|
end
|
data/lib/skyscraper/node.rb
CHANGED
@@ -97,5 +97,20 @@ module Skyscraper
|
|
97
97
|
def tag
|
98
98
|
@element.name
|
99
99
|
end
|
100
|
+
|
101
|
+
def submit params = {}
|
102
|
+
raise Skyscraper::LocalFormException if @element.document.path.local?
|
103
|
+
raise Skyscraper::NotActionException if self.action.blank?
|
104
|
+
|
105
|
+
path = @element.document.path.full_path_for(self.action)
|
106
|
+
document = Skyscraper::Document.load_post path, params
|
107
|
+
|
108
|
+
Node.new(document.css("html"))
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
class LocalFormException < Exception
|
113
|
+
end
|
114
|
+
class NotActionException < Exception
|
100
115
|
end
|
101
116
|
end
|
data/lib/skyscraper/path/base.rb
CHANGED
data/lib/skyscraper/results.rb
CHANGED
@@ -40,8 +40,9 @@ module Skyscraper
|
|
40
40
|
result[field.name] = field.find_in_document document
|
41
41
|
end
|
42
42
|
|
43
|
-
call_callbacks @after_each, result, document
|
43
|
+
callback_result = call_callbacks @after_each, result, document
|
44
44
|
results << result
|
45
|
+
break if callback_result === false
|
45
46
|
sleep @delay[:sleep] if (i+1) % @delay[:after] == 0
|
46
47
|
|
47
48
|
rescue SocketError, Errno::ENOENT
|
@@ -82,9 +83,11 @@ module Skyscraper
|
|
82
83
|
end
|
83
84
|
|
84
85
|
def call_callbacks callbacks, *args
|
86
|
+
result = true
|
85
87
|
callbacks.each do |callback|
|
86
|
-
callback.call(*args)
|
88
|
+
result = callback.call(*args)
|
87
89
|
end
|
90
|
+
result
|
88
91
|
end
|
89
92
|
end
|
90
93
|
|
data/lib/skyscraper/version.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#encoding: utf-8
|
2
|
+
require "spec_helper"
|
2
3
|
|
3
4
|
describe Skyscraper::Document do
|
4
5
|
it "should support utf-8 encoding by default in remote pages" do
|
@@ -11,4 +12,11 @@ describe Skyscraper::Document do
|
|
11
12
|
document = Skyscraper::Document::load(path_to("skyscraper-document.html"))
|
12
13
|
document.path.should be_an Skyscraper::Path::Base
|
13
14
|
end
|
15
|
+
|
16
|
+
describe "when is opening" do
|
17
|
+
it "should handle https redirects" do
|
18
|
+
Skyscraper::Document.open_from_path("http://github.com").should be_an_instance_of Tempfile
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
14
22
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
1
3
|
describe Skyscraper::Node do
|
2
4
|
describe "when is initialized" do
|
3
5
|
before(:each) do
|
@@ -83,5 +85,32 @@ describe Skyscraper::Node do
|
|
83
85
|
@node.tag.should == "ul"
|
84
86
|
end
|
85
87
|
end
|
88
|
+
|
89
|
+
describe "Submit post data" do
|
90
|
+
it "should submit form post data" do
|
91
|
+
node = Skyscraper::fetch("http://www.balticplaza.eu/kontakt").first("#new_inquiry")
|
92
|
+
submited_page = node.submit(:"inquiry[name]" => "Example name")
|
93
|
+
submited_page.first("#inquiry_name").value.should == "Example name"
|
94
|
+
end
|
95
|
+
|
96
|
+
it "should throws an LocalFormException" do
|
97
|
+
lambda do
|
98
|
+
node = Skyscraper::fetch(path_to("skyscraper-node-form.html"))
|
99
|
+
node.first("form").submit
|
100
|
+
end.should raise_error Skyscraper::LocalFormException
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should throws NotActionException" do
|
104
|
+
Skyscraper::Path::Local.any_instance.stub(:local? => false)
|
105
|
+
|
106
|
+
lambda do
|
107
|
+
Skyscraper::fetch(path_to("skyscraper-node-traversing.html")).first(".menu").submit
|
108
|
+
end.should raise_error Skyscraper::NotActionException
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should handle GET form method" do
|
112
|
+
pending "todo"
|
113
|
+
end
|
114
|
+
end
|
86
115
|
end
|
87
116
|
|
@@ -6,6 +6,14 @@ describe Skyscraper::Path do
|
|
6
6
|
@path = Skyscraper::Path.factory("http://google.com/index.php?q=e")
|
7
7
|
end
|
8
8
|
|
9
|
+
it "local? method should returns false" do
|
10
|
+
@path.local?.should == false
|
11
|
+
end
|
12
|
+
|
13
|
+
it "remote? method should returns true" do
|
14
|
+
@path.remote?.should == true
|
15
|
+
end
|
16
|
+
|
9
17
|
it "should returns domain" do
|
10
18
|
@path.domain.should == "http://google.com"
|
11
19
|
end
|
@@ -65,6 +73,14 @@ describe Skyscraper::Path do
|
|
65
73
|
@path.folder.should == "/var/www/files/"
|
66
74
|
end
|
67
75
|
|
76
|
+
it "local? method should returns true" do
|
77
|
+
@path.local?.should == true
|
78
|
+
end
|
79
|
+
|
80
|
+
it "remote? method should returns false" do
|
81
|
+
@path.remote?.should == false
|
82
|
+
end
|
83
|
+
|
68
84
|
it "should returns file name" do
|
69
85
|
@path.file_name.should == "file.ext"
|
70
86
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
1
3
|
describe Skyscraper::Results do
|
2
4
|
def fetch options = {}
|
3
5
|
options.reverse_merge! fields: {}, options: {}
|
@@ -72,7 +74,7 @@ describe Skyscraper::Results do
|
|
72
74
|
page.should be_an_instance_of(Skyscraper::Node)
|
73
75
|
@call_count += 1
|
74
76
|
end
|
75
|
-
|
77
|
+
|
76
78
|
results = fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { after_each: [callback] }
|
77
79
|
|
78
80
|
@call_count.should == 10
|
@@ -117,6 +119,21 @@ describe Skyscraper::Results do
|
|
117
119
|
results[0][:h1].should == "Hello world"
|
118
120
|
end
|
119
121
|
|
122
|
+
it "should stops when after each callback returns false" do
|
123
|
+
counter = 0
|
124
|
+
|
125
|
+
callback = proc do
|
126
|
+
if counter == 1
|
127
|
+
counter = 0
|
128
|
+
false
|
129
|
+
else
|
130
|
+
counter += 1
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
|
135
|
+
results.length.should == 2
|
136
|
+
end
|
120
137
|
end
|
121
138
|
|
122
139
|
describe "errors" do
|
@@ -1,5 +1,7 @@
|
|
1
1
|
#encoding: utf-8
|
2
2
|
|
3
|
+
require "spec_helper"
|
4
|
+
|
3
5
|
class TestScraper
|
4
6
|
include Skyscraper
|
5
7
|
|
@@ -27,6 +29,10 @@ describe Skyscraper do
|
|
27
29
|
Skyscraper::fetch(path_to("skyscraper-encoding.html")).first(".utf-8").text.should == "ąśćżół"
|
28
30
|
end
|
29
31
|
|
32
|
+
it "should handle http -> https redirects" do
|
33
|
+
Skyscraper::fetch("http://github.com").first("title").text.should =~ /GitHub/
|
34
|
+
end
|
35
|
+
|
30
36
|
it "should works when included" do
|
31
37
|
TestScraper.new.fetch[0][:h1].should == "Hello world"
|
32
38
|
TestScraper.new.fetch[1][:h1].should == "Hello world"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: skyscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &78066490 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *78066490
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake
|
27
|
-
requirement: &
|
27
|
+
requirement: &78128330 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *78128330
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &78126140 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *78126140
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: actionpack
|
49
|
-
requirement: &
|
49
|
+
requirement: &78123100 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *78123100
|
58
58
|
description: Easy to use DSL that helps scraping data from websites. Thanks to it,
|
59
59
|
writing web crawlers would be very fast and intuitive. Traversing through html nodes
|
60
60
|
and fetching all of the HTML attributes, would be possible. Just like in jQuery
|
@@ -109,6 +109,7 @@ files:
|
|
109
109
|
- spec/test_files/skyscraper-field.html
|
110
110
|
- spec/test_files/skyscraper-node-a.html
|
111
111
|
- spec/test_files/skyscraper-node-b.html
|
112
|
+
- spec/test_files/skyscraper-node-form.html
|
112
113
|
- spec/test_files/skyscraper-node-traversing.html
|
113
114
|
- spec/test_files/skyscraper-node.html
|
114
115
|
- spec/test_files/skyscraper-pages.html
|
@@ -162,6 +163,7 @@ test_files:
|
|
162
163
|
- spec/test_files/skyscraper-field.html
|
163
164
|
- spec/test_files/skyscraper-node-a.html
|
164
165
|
- spec/test_files/skyscraper-node-b.html
|
166
|
+
- spec/test_files/skyscraper-node-form.html
|
165
167
|
- spec/test_files/skyscraper-node-traversing.html
|
166
168
|
- spec/test_files/skyscraper-node.html
|
167
169
|
- spec/test_files/skyscraper-pages.html
|