skyscraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +180 -0
- data/Rakefile +5 -0
- data/lib/skyscraper.rb +56 -0
- data/lib/skyscraper/base.rb +44 -0
- data/lib/skyscraper/config.rb +15 -0
- data/lib/skyscraper/document.rb +11 -0
- data/lib/skyscraper/field.rb +24 -0
- data/lib/skyscraper/node.rb +8 -0
- data/lib/skyscraper/node/base.rb +103 -0
- data/lib/skyscraper/node/resource.rb +57 -0
- data/lib/skyscraper/pages.rb +27 -0
- data/lib/skyscraper/path.rb +29 -0
- data/lib/skyscraper/path/base.rb +15 -0
- data/lib/skyscraper/path/local.rb +29 -0
- data/lib/skyscraper/path/remote.rb +32 -0
- data/lib/skyscraper/results.rb +93 -0
- data/lib/version.rb +3 -0
- data/skyscraper.gemspec +22 -0
- data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
- data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
- data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
- data/spec/skyscraper/skyscraper_spec.rb +39 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/support/skyscraper_helpers.rb +9 -0
- data/spec/test_files/encoding.html~ +12 -0
- data/spec/test_files/skyscraper-base.html +30 -0
- data/spec/test_files/skyscraper-document.html +30 -0
- data/spec/test_files/skyscraper-encoding.html +12 -0
- data/spec/test_files/skyscraper-fetch-2.html +11 -0
- data/spec/test_files/skyscraper-fetch.html +31 -0
- data/spec/test_files/skyscraper-field.html +30 -0
- data/spec/test_files/skyscraper-node-base-a.html +11 -0
- data/spec/test_files/skyscraper-node-base-b.html +10 -0
- data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
- data/spec/test_files/skyscraper-node-base.html +30 -0
- data/spec/test_files/skyscraper-node-resource-b.html +10 -0
- data/spec/test_files/skyscraper-node-resource-image.png +0 -0
- data/spec/test_files/skyscraper-node-resource.html +12 -0
- data/spec/test_files/skyscraper-pages.html +30 -0
- data/spec/test_files/skyscraper.html +30 -0
- metadata +169 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
# Specs for Skyscraper::Field. Each example builds a field from a name and a
# CSS selector (optionally with an attribute or a callback), resolves it
# against the fetched fixture page and checks the extracted value.
describe Skyscraper::Field do
  # The fixture document is fetched once and shared by all examples.
  before(:all) do
    @page = Skyscraper::fetch(path_to("skyscraper-field.html"))
  end

  it "should find field value using css selector" do
    name_field = Skyscraper::Field.new(name: :name, selector: ".item strong.name")
    name_field.find_in_document(@page)
    name_field.value.should == "Name value"
  end

  it "should apply callback" do
    # The callback receives the matched item; doubling the href makes the
    # callback's effect visible in the resulting value.
    doubled_href = proc { |link| link.href * 2 }
    link_field = Skyscraper::Field.new(name: :name, selector: "a", callback: doubled_href)
    link_field.find_in_document(@page)
    link_field.value.should == "a.htmla.html"
  end

  it "should read attributes from elements" do
    href_field = Skyscraper::Field.new(name: :name, selector: "a", attribute: :href)
    href_field.find_in_document(@page)
    href_field.value.should == "a.html"
  end

  it "should returns text code of inner element by default" do
    # No attribute given: the value is expected to contain the element text.
    text_field = Skyscraper::Field.new(name: :name, selector: ".item")
    text_field.find_in_document(@page)
    text_field.value.should include("Name value")
  end

  it "should returns html code of inner element" do
    # attribute: "html" asks for the markup rather than the stripped text.
    html_field = Skyscraper::Field.new(name: :name, selector: ".item", attribute: "html")
    html_field.find_in_document(@page)
    html_field.value.should include("<strong class=\"name\">Name value</strong>")
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# Specs for Skyscraper::Node::Base: reading a node's html/class/text,
# following links to other documents, downloading a linked page and
# traversing the tree (find, children, parents, siblings, tag).
describe Skyscraper::Node::Base do
  describe "when is initialized" do
    before(:each) do
      @node = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("div.item")
    end

    it "should returns html code" do
      @node.html.should include "<strong class=\"name\">Name value</strong>"
    end

    it "should returns class name" do
      # On a node, `class` is expected to answer with the element's CSS
      # class, not the Ruby class of the object.
      @node.class.should == "item"
    end

    it "should be auto converted to string with stripped tags" do
      @node.text.should == "Name value"
    end
  end

  it "should follow links" do
    Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("h1").text.should == "Hello from A"
  end

  it "should deep follow links" do
    Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").follow.first("h1").text.should == "Hello from B"
  end

  it "should download page" do
    remove_test_directory
    # NOTE(review): this changes the global Skyscraper config and does not
    # restore it afterwards - confirm other specs do not depend on the
    # previous download_path.
    Skyscraper.config.download_path = "/tmp/skyscraper_test/nodes/:file_name"
    file = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").download
    # File.exist? replaces File.exists?, which is deprecated and was removed
    # in Ruby 3.2.
    File.exist?(file).should == true
  end

  describe "traversing" do
    before(:each) do
      @node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first(".menu")
    end

    it "should find descendands items" do
      result = @node.find("li")
      result.length.should == 5
      result.map(&:text).should include "Item 4 1"
    end

    it "should returns children of element with selector" do
      node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first("#parent-3")
      node.children(".a").length.should == 4
      node.children(".b").length.should == 2
    end

    it "should returns children of element without selector" do
      result = @node.children
      result.length.should == 4
      # "Item 4 1" is matched by find("li") above but must not show up among
      # the direct children.
      result.map(&:to_s).should_not include "Item 4 1"
    end

    it "should returns first element" do
      @node.first("li").class.should == "item-1"
    end

    it "should returns parent of item" do
      @node.parent.class.should == "parent-2"
    end

    it "should tells if element have parent" do
      @node.have_parent?.should == true
      @node.parents("html").first.have_parent?.should == false
    end

    it "should returns parents of item" do
      @node.parents.length.should == 4
    end

    it "should returns parents of item matched by selector" do
      @node.parents("div").length.should == 2
    end

    it "should returns siblings of item" do
      @node.first(".item-3").siblings.length.should == 3
    end

    it "should returns node tag" do
      @node.tag.should == "ul"
    end
  end
end
|
87
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# Specs for Skyscraper::Node::Resource: downloading the resource a node
# points at, including directory creation, the :sequence and :file_name
# path placeholders and custom file names.
describe Skyscraper::Node::Resource do
  # Downloads +node+ with the given +options+ and expects the download call
  # to return exactly +path+.
  def should_download_resource_to(node, path, options = {})
    resource = Skyscraper::Node::Resource.new(node)
    resource.download(options).should == path
  end

  before(:all) do
    Skyscraper.config.download_path = "/tmp/skyscraper_test/:sequence/:file_name"
    @node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("a")
  end

  # Every example starts from a clean download directory.
  before(:each) do
    remove_test_directory
  end

  it "should create path if not exists when downloaded" do
    File.directory?("/tmp/skyscraper_test/1").should == false
    # Building the resource alone must not touch the file system.
    Skyscraper::Node::Resource.new(@node)
    File.directory?("/tmp/skyscraper_test/1").should == false
    Skyscraper::Node::Resource.new(@node).download
    File.directory?("/tmp/skyscraper_test/1").should == true
  end

  it "should not fail if path already exists" do
    Skyscraper::Node::Resource.new(@node).download path: "/tmp/skyscraper_test/some_directory/:file_name"
    File.directory?("/tmp/skyscraper_test/some_directory").should == true
    # NOTE(review): the second download uses the default download_path, not
    # some_directory - confirm whether it was meant to reuse the same path.
    Skyscraper::Node::Resource.new(@node).download
    File.directory?("/tmp/skyscraper_test/some_directory").should == true
  end

  it "should have file name" do
    resource = Skyscraper::Node::Resource.new(@node)
    resource.download.should == "/tmp/skyscraper_test/1/skyscraper-node-resource-b.html"
  end

  it "should create path with :sequence variable" do
    download_to = "/tmp/skyscraper_test/sequences/:sequence/:file_name"
    # Each download into the same template is expected to land in the next
    # numbered directory.
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/1/skyscraper-node-resource-b.html", path: download_to
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/2/skyscraper-node-resource-b.html", path: download_to
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/3/skyscraper-node-resource-b.html", path: download_to
  end

  it "should create custom file name if provided" do
    download_to = "/tmp/skyscraper_test/custom_name/:file_name"
    should_download_resource_to @node, "/tmp/skyscraper_test/custom_name/test.html", path: download_to, file_name: "test.html"
  end

  it "should download resource" do
    Skyscraper::Node::Resource.new(@node).download
    # File.exist? replaces File.exists?, which is deprecated and was removed
    # in Ruby 3.2.
    File.exist?("/tmp/skyscraper_test/1/skyscraper-node-resource-b.html").should == true
  end

  it "should download image" do
    image_node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("img")
    Skyscraper::Node::Resource.new(image_node).download
    File.exist?("/tmp/skyscraper_test/1/skyscraper-node-resource-image.png").should == true
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Specs for Skyscraper::Pages: a page list can be built from a string, a
# (possibly nested) array or a block, and is then walked with next/reset.
describe Skyscraper::Pages do
  it "should set convert string to items array" do
    pages = Skyscraper::Pages.new("http://google.com")
    pages.items.should == ["http://google.com"]
  end

  it "should set items array from array" do
    pages = Skyscraper::Pages.new(["http://google.com"])
    pages.items.should == ["http://google.com"]
  end

  it "should flat pages from nested arrays" do
    pages = Skyscraper::Pages.new(["http://google.com", ["http://yahoo.com"]])
    pages.items.should == ["http://google.com", "http://yahoo.com"]
  end

  it "should set array from block" do
    # The block's return value becomes the list of items.
    pages = Skyscraper::Pages.new do
      2.times.map { |i| "http://google.com/#{i}.html"}
    end
    pages.items.should == ["http://google.com/0.html", "http://google.com/1.html"]
  end

  it "should pass scraper instance to block" do
    # The block argument must respond to fetch so that page addresses can be
    # read out of another document.
    pages = Skyscraper::Pages.new do |scraper|
      scraper.fetch(path_to("skyscraper-pages.html")).first("a").href
    end
    pages.items.should == ["a.html"]
  end

  it "should works when block is passed without arguments" do
    pages = Skyscraper::Pages.new do
      "a.html"
    end
    pages.items.should == ["a.html"]
  end

  it "should return next item" do
    letters = Skyscraper::Pages.new(["a", "b", "c"])
    letters.next.should == "a"
    letters.next.should == "b"
    letters.next.should == "c"
  end

  it "should reset pages" do
    letters = Skyscraper::Pages.new(["a", "b", "c"])
    letters.next.should == "a"
    letters.next.should == "b"
    # After reset, iteration starts again from the first item.
    letters.reset
    letters.next.should == "a"
  end
end
|
@@ -0,0 +1,110 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Specs for Skyscraper::Path: Path.factory is exercised with remote (URL)
# and local (file system) addresses, and the class-level remote?/absolute?
# predicates are checked directly.
describe Skyscraper::Path do
  describe "when path is REMOTE" do
    before(:each) do
      @path = Skyscraper::Path.factory("http://google.com/index.php?q=e")
    end

    # Remote address with a sub-directory, used by the full_path_for and
    # file_name examples. Built lazily, once per example that references it.
    let(:nested_remote) { Skyscraper::Path.factory("http://google.com/a/index.php") }

    it "should returns domain" do
      @path.domain.should == "http://google.com"
    end

    it "should returns domain with no scheme" do
      schemeless = Skyscraper::Path.factory("google.com/index.php?q=e")
      schemeless.full_path.should == "google.com/index.php?q=e"
    end

    it "should returns path" do
      @path.path.should == "/index.php"
    end

    it "should returns query" do
      @path.query.should == "?q=e"
    end

    it "should returns base" do
      @path.base.should == "http://google.com/"
    end

    it "should returns full path" do
      @path.full_path.should == "http://google.com/index.php?q=e"
    end

    it "should be converted to string" do
      @path.to_s.should == @path.full_path
    end

    it "should returns full path for full different" do
      # A complete URL on another host is returned unchanged.
      nested_remote.full_path_for("http://yahoo.com/b.html").should == "http://yahoo.com/b.html"
    end

    it "should returns full path for relative" do
      # A relative address is resolved against the current directory.
      nested_remote.full_path_for("b.html").should == "http://google.com/a/b.html"
    end

    it "should returns full path for absolute" do
      # A leading slash resolves against the domain root.
      nested_remote.full_path_for("/b.html").should == "http://google.com/b.html"
    end

    it "should returns file name" do
      nested_remote.file_name.should == "index.php"
    end
  end

  describe "when path is LOCAL" do
    before(:each) do
      @path = Skyscraper::Path.factory("/var/www/files/file.ext")
    end

    # Local file used as the starting point for the full_path_for examples.
    let(:public_index) { Skyscraper::Path.factory("/var/www/public/index.html") }

    it "should returns folder" do
      @path.folder.should == "/var/www/files/"
    end

    it "should returns file name" do
      @path.file_name.should == "file.ext"
    end

    it "should returns full path" do
      @path.full_path.should == "/var/www/files/file.ext"
    end

    it "should returns base" do
      @path.base.should == "/var/www/files/"
    end

    it "should returns full path for relative" do
      # "../" is kept verbatim in the result rather than being collapsed.
      public_index.full_path_for("../b.html").should == "/var/www/public/../b.html"
      public_index.full_path_for("b.html").should == "/var/www/public/b.html"
    end

    it "should returns full path for absolute full" do
      public_index.full_path_for("/var/www/test.html").should == "/var/www/test.html"
    end
  end

  it "should detect if string is remote " do
    Skyscraper::Path.remote?("http://google.com").should == true
    Skyscraper::Path.remote?("google.com").should == true
  end

  it "should detect if string is not remote " do
    Skyscraper::Path.remote?("/var/www/projects").should == false
    Skyscraper::Path.remote?("/var/www/projects/file.ext").should == false
  end

  it "should check if is absolute address" do
    Skyscraper::Path.absolute?("/some/relative/path").should == true
  end

  it "should returns nil for wrong path" do
    # A path ending in a slash has no file component.
    directory_only = Skyscraper::Path.factory("/var/www/files/")
    directory_only.file_name.should == nil
  end
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
# Specs for Skyscraper::Results: fetching records for the pages and fields
# configured on a Skyscraper::Base, with delay and limit options, continuing
# after a limit, after_each / after_all callbacks and error handling.
describe Skyscraper::Results do
  # Builds a Skyscraper::Base from the given settings and returns the fetched
  # results.
  #
  # options - Hash with:
  #           :path    - page address(es) handed to Base#pages
  #           :fields  - Hash of field name => selector (default: {})
  #           :options - Hash passed on to Results.new (default: {})
  #
  # Defaults are applied with a plain Hash#merge, so the caller's hash is not
  # mutated and no ActiveSupport extension is needed.
  def fetch(options = {})
    settings = { fields: {}, options: {} }.merge(options)
    base = Skyscraper::Base.new
    base.pages settings[:path]

    settings[:fields].each_pair do |key, value|
      base.field key, value
    end

    Skyscraper::Results.new(base, settings[:options]).fetch
  end

  it "should fetch file content" do
    results = fetch path: path_to("skyscraper-fetch.html"), fields: { h1: "h1" }
    results[0][:h1].should == "Hello world"
  end

  it "should fetch with delay" do
    time = Time.now
    fetch path: [path_to("skyscraper-fetch.html")] * 2, fields: { title: "title" }, options: { delay: 1 }
    time_diff = Time.now - time
    time_diff.should > 1
  end

  it "should fetch with delay after" do
    # 10 pages with a 1s sleep after every 7: exactly one sleep is expected.
    time = Time.now
    fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { delay: { sleep: 1, after: 7 } }
    time_diff = Time.now - time
    time_diff.should > 1
    time_diff.should < 3

    # The threshold (11) is never reached with 10 pages, so no sleep at all.
    time = Time.now
    fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { delay: { sleep: 1, after: 11 } }
    time_diff = Time.now - time
    time_diff.should < 1
  end

  it "should fetch with results limit" do
    results = fetch path: [path_to("skyscraper-fetch.html")] * 11, options: { limit: 10 }
    results.length.should == 10
  end

  it "should apply config defaults" do
    base = Skyscraper::Base.new
    base.config.limit = 2
    base.pages [path_to("skyscraper-fetch.html")] * 10

    results = Skyscraper::Results.new(base).fetch
    results.length.should == 2
  end

  it "should continue after limit reached" do
    base = Skyscraper::Base.new
    base.config.limit = 1
    base.pages [path_to("skyscraper-fetch.html"), path_to("skyscraper-fetch-2.html")]
    base.field :h1, "h1"

    results = Skyscraper::Results.new(base)
    # fetch and continue each return only their own batch, while records
    # accumulates everything fetched so far.
    results.fetch.length.should == 1
    results.records.length.should == 1
    results.continue.length.should == 1
    results.records.length.should == 2
    results.records[1][:h1].should == "Hello from A"
  end

  describe "callbacks" do
    it "should calls after each page callback" do
      @call_count = 0
      callback = proc do |result, page|
        result.should be_an_instance_of(Hash)
        page.should be_an_instance_of(Skyscraper::Node::Base)
        @call_count += 1
      end

      fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { after_each: [callback] }

      @call_count.should == 10
    end

    it "should calls after all callback" do
      @call_count = 0
      callback = proc do |results|
        results.should be_an_instance_of(Array)
        @call_count += 1
      end

      fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { after_all: [callback] }

      @call_count.should == 1
    end

    it "should change result value for each" do
      callback = proc do |result, page|
        result[:h1] += " with callback"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
      results[0][:h1].should == "Hello world with callback"
    end

    it "should change results values for all" do
      callback = proc do |results|
        results << "test"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_all: [callback] }
      results.last.should == "test"
    end

    it "should doesn't requires callback arguments" do
      # A callback that takes no arguments must be callable and must leave
      # the record untouched.
      callback = proc do
        "with callback"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
      results[0][:h1].should == "Hello world"
    end

  end

  describe "errors" do
    # NOTE(review): these examples change the global Skyscraper config
    # (noise_errors, skip_on_error) and never restore it - confirm that
    # later specs do not depend on the previous values.
    before(:all) do
      Skyscraper.config.noise_errors = false
    end

    it "should catch invalid url exception" do
      Skyscraper.config.skip_on_error = false
      lambda do
        fetch path: "http://google.wrong"
      end.should raise_error Skyscraper::NoResourceException
    end

    it "should catch file not exists exception" do
      Skyscraper.config.skip_on_error = false
      lambda do
        fetch path: "/tmp/skyscraper/unknow_file"
      end.should raise_error Skyscraper::NoResourceException
    end

    it "should skip on error" do
      Skyscraper.config.skip_on_error = true
      # With skip_on_error enabled the unreachable page must not raise at all.
      lambda do
        fetch path: "http://google.wrong"
      end.should_not raise_error
    end
  end
end
|