skyscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
@@ -0,0 +1,36 @@
1
# Specs for Skyscraper::Field: a field is built from a name, a CSS selector and
# optional :attribute / :callback settings, run against a fetched page, and
# then exposes the extracted value.
describe Skyscraper::Field do
  # Runs +field+ against the page loaded in before(:all) and returns its value.
  def extracted_value(field)
    field.find_in_document(@page)
    field.value
  end

  before(:all) do
    # path_to is defined outside this file; presumably it resolves a fixture
    # under spec/test_files.
    @page = Skyscraper.fetch(path_to("skyscraper-field.html"))
  end

  it "should find field value using css selector" do
    name_field = Skyscraper::Field.new(name: :name, selector: ".item strong.name")
    extracted_value(name_field).should == "Name value"
  end

  it "should apply callback" do
    # The callback receives the matched node; String#* repeats the href twice.
    doubled_href = proc { |node| node.href * 2 }
    link_field = Skyscraper::Field.new(name: :name, selector: "a", callback: doubled_href)
    extracted_value(link_field).should == "a.htmla.html"
  end

  it "should read attributes from elements" do
    href_field = Skyscraper::Field.new(name: :name, selector: "a", attribute: :href)
    extracted_value(href_field).should == "a.html"
  end

  it "should returns text code of inner element by default" do
    # No :attribute given, so the value is expected to contain the element's text.
    text_field = Skyscraper::Field.new(name: :name, selector: ".item")
    extracted_value(text_field).should include "Name value"
  end

  it "should returns html code of inner element" do
    html_field = Skyscraper::Field.new(name: :name, selector: ".item", attribute: "html")
    extracted_value(html_field).should include "<strong class=\"name\">Name value</strong>"
  end
end
@@ -0,0 +1,87 @@
1
# Specs for Skyscraper::Node::Base: reading a node's markup and attributes,
# following links, downloading the linked resource, and DOM traversal.
# Fixture pages are resolved through path_to and the download directory is
# cleared with remove_test_directory; both helpers are defined outside this file.
describe Skyscraper::Node::Base do
  describe "when is initialized" do
    before(:each) do
      @node = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("div.item")
    end

    it "should returns html code" do
      @node.html.should include "<strong class=\"name\">Name value</strong>"
    end

    it "should returns class name" do
      # On a node, #class is expected to return the element's class attribute,
      # not the Ruby class of the object.
      @node.class.should == "item"
    end

    it "should be auto converted to string with stripped tags" do
      @node.text.should == "Name value"
    end
  end

  it "should follow links" do
    Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("h1").text.should == "Hello from A"
  end

  it "should deep follow links" do
    # Two hops: the first link leads to page A, whose first link leads to page B.
    Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").follow.first("h1").text.should == "Hello from B"
  end

  it "should download page" do
    remove_test_directory
    Skyscraper.config.download_path = "/tmp/skyscraper_test/nodes/:file_name"
    file = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").download
    # File.exist? instead of File.exists?: the plural alias was deprecated and
    # removed in Ruby 3.2, where calling it raises NoMethodError.
    File.exist?(file).should == true
  end

  describe "traversing" do
    before(:each) do
      @node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first(".menu")
    end

    it "should find descendands items" do
      result = @node.find("li")
      result.length.should == 5
      result.map(&:text).should include "Item 4 1"
    end

    it "should returns children of element with selector" do
      node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first("#parent-3")
      node.children(".a").length.should == 4
      node.children(".b").length.should == 2
    end

    it "should returns children of element without selector" do
      result = @node.children
      result.length.should == 4
      # "Item 4 1" is expected among the descendants (see the find example)
      # but not among the direct children.
      result.map(&:to_s).should_not include "Item 4 1"
    end

    it "should returns first element" do
      @node.first("li").class.should == "item-1"
    end

    it "should returns parent of item" do
      @node.parent.class.should == "parent-2"
    end

    it "should tells if element have parent" do
      @node.have_parent?.should == true
      @node.parents("html").first.have_parent?.should == false
    end

    it "should returns parents of item" do
      @node.parents.length.should == 4
    end

    it "should returns parents of item matched by selector" do
      @node.parents("div").length.should == 2
    end

    it "should returns siblings of item" do
      @node.first(".item-3").siblings.length.should == 3
    end

    it "should returns node tag" do
      @node.tag.should == "ul"
    end
  end
end
87
+
@@ -0,0 +1,58 @@
1
# Specs for Skyscraper::Node::Resource: downloading the resource a node points
# at, including directory creation and the :sequence / :file_name placeholders
# in the download path.
# path_to and remove_test_directory are helpers defined outside this file.
describe Skyscraper::Node::Resource do
  # Downloads +node+ through a fresh Resource and expects the returned location
  # to equal +path+. +options+ is passed straight to Resource#download.
  def should_download_resource_to(node, path, options = {})
    resource = Skyscraper::Node::Resource.new(node)
    resource.download(options).should == path
  end

  before(:all) do
    Skyscraper.config.download_path = "/tmp/skyscraper_test/:sequence/:file_name"
    @node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("a")
  end

  before(:each) do
    # Each example starts from a clean download directory.
    remove_test_directory
  end

  it "should create path if not exists when downloaded" do
    File.directory?("/tmp/skyscraper_test/1").should == false
    # Merely building the resource must not touch the filesystem.
    Skyscraper::Node::Resource.new(@node)
    File.directory?("/tmp/skyscraper_test/1").should == false
    Skyscraper::Node::Resource.new(@node).download
    File.directory?("/tmp/skyscraper_test/1").should == true
  end

  it "should not fail if path already exists" do
    Skyscraper::Node::Resource.new(@node).download path: "/tmp/skyscraper_test/some_directory/:file_name"
    File.directory?("/tmp/skyscraper_test/some_directory").should == true
    # NOTE(review): this second download uses the default path, so it does not
    # write into the already existing some_directory. Confirm whether the same
    # :path was meant to be passed again.
    Skyscraper::Node::Resource.new(@node).download
    File.directory?("/tmp/skyscraper_test/some_directory").should == true
  end

  it "should have file name" do
    resource = Skyscraper::Node::Resource.new(@node)
    resource.download.should == "/tmp/skyscraper_test/1/skyscraper-node-resource-b.html"
  end

  it "should create path with :sequence variable" do
    download_to = "/tmp/skyscraper_test/sequences/:sequence/:file_name"
    # Repeated downloads into the same template are expected to land in 1, 2, 3.
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/1/skyscraper-node-resource-b.html", path: download_to
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/2/skyscraper-node-resource-b.html", path: download_to
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/3/skyscraper-node-resource-b.html", path: download_to
  end

  it "should create custom file name if provided" do
    download_to = "/tmp/skyscraper_test/custom_name/:file_name"
    should_download_resource_to @node, "/tmp/skyscraper_test/custom_name/test.html", path: download_to, file_name: "test.html"
  end

  it "should download resource" do
    Skyscraper::Node::Resource.new(@node).download
    # File.exist? instead of File.exists?: the plural alias was removed in Ruby 3.2.
    File.exist?("/tmp/skyscraper_test/1/skyscraper-node-resource-b.html").should == true
  end

  it "should download image" do
    image_node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("img")
    Skyscraper::Node::Resource.new(image_node).download
    File.exist?("/tmp/skyscraper_test/1/skyscraper-node-resource-image.png").should == true
  end
end
@@ -0,0 +1,2 @@
1
# Placeholder example group: no examples are defined for Skyscraper::Node here.
describe(Skyscraper::Node) {}
@@ -0,0 +1,46 @@
1
# Specs for Skyscraper::Pages: the page list can be built from a string, an
# array (possibly nested) or a block, and is walked with #next / #reset.
describe Skyscraper::Pages do
  it "should set convert string to items array" do
    pages = Skyscraper::Pages.new("http://google.com")
    pages.items.should == ["http://google.com"]
  end

  it "should set items array from array" do
    pages = Skyscraper::Pages.new(["http://google.com"])
    pages.items.should == ["http://google.com"]
  end

  it "should flat pages from nested arrays" do
    nested = ["http://google.com", ["http://yahoo.com"]]
    Skyscraper::Pages.new(nested).items.should == ["http://google.com", "http://yahoo.com"]
  end

  it "should set array from block" do
    pages = Skyscraper::Pages.new { (0...2).map { |index| "http://google.com/#{index}.html" } }
    pages.items.should == ["http://google.com/0.html", "http://google.com/1.html"]
  end

  it "should pass scraper instance to block" do
    # The block argument must respond to fetch, as used here.
    pages = Skyscraper::Pages.new { |scraper| scraper.fetch(path_to("skyscraper-pages.html")).first("a").href }
    pages.items.should == ["a.html"]
  end

  it "should works when block is passed without arguments" do
    pages = Skyscraper::Pages.new { "a.html" }
    pages.items.should == ["a.html"]
  end

  it "should return next item" do
    pages = Skyscraper::Pages.new(%w[a b c])
    %w[a b c].each { |expected| pages.next.should == expected }
  end

  it "should reset pages" do
    pages = Skyscraper::Pages.new(%w[a b c])
    %w[a b].each { |expected| pages.next.should == expected }
    pages.reset
    # After a reset, iteration starts again from the first item.
    pages.next.should == "a"
  end
end
@@ -0,0 +1,110 @@
1
+ require "spec_helper"
2
+
3
# Specs for Skyscraper::Path: Path.factory builds a path object from a remote
# URL or a local filesystem location; the examples pin how each kind is split
# into parts and how other locations are resolved against it.
describe Skyscraper::Path do
  # Shorthand for building a path object from +location+.
  def path_for(location)
    Skyscraper::Path.factory(location)
  end

  describe "when path is REMOTE" do
    before(:each) do
      @path = path_for("http://google.com/index.php?q=e")
    end

    it "should returns domain" do
      @path.domain.should == "http://google.com"
    end

    it "should returns domain with no scheme" do
      # Despite the example name, the assertion is on full_path.
      @path = path_for("google.com/index.php?q=e")
      @path.full_path.should == "google.com/index.php?q=e"
    end

    it "should returns path" do
      @path.path.should == "/index.php"
    end

    it "should returns query" do
      @path.query.should == "?q=e"
    end

    it "should returns base" do
      @path.base.should == "http://google.com/"
    end

    it "should returns full path" do
      @path.full_path.should == "http://google.com/index.php?q=e"
    end

    it "should be converted to string" do
      @path.to_s.should == @path.full_path
    end

    it "should returns full path for full different" do
      origin = path_for("http://google.com/a/index.php")
      origin.full_path_for("http://yahoo.com/b.html").should == "http://yahoo.com/b.html"
    end

    it "should returns full path for relative" do
      origin = path_for("http://google.com/a/index.php")
      origin.full_path_for("b.html").should == "http://google.com/a/b.html"
    end

    it "should returns full path for absolute" do
      origin = path_for("http://google.com/a/index.php")
      origin.full_path_for("/b.html").should == "http://google.com/b.html"
    end

    it "should returns file name" do
      path_for("http://google.com/a/index.php").file_name.should == "index.php"
    end
  end

  describe "when path is LOCAL" do
    before(:each) do
      @path = path_for("/var/www/files/file.ext")
    end

    it "should returns folder" do
      @path.folder.should == "/var/www/files/"
    end

    it "should returns file name" do
      @path.file_name.should == "file.ext"
    end

    it "should returns full path" do
      @path.full_path.should == "/var/www/files/file.ext"
    end

    it "should returns base" do
      @path.base.should == "/var/www/files/"
    end

    it "should returns full path for relative" do
      origin = path_for("/var/www/public/index.html")
      # "../" segments are kept verbatim rather than collapsed.
      origin.full_path_for("../b.html").should == "/var/www/public/../b.html"
      origin.full_path_for("b.html").should == "/var/www/public/b.html"
    end

    it "should returns full path for absolute full" do
      origin = path_for("/var/www/public/index.html")
      origin.full_path_for("/var/www/test.html").should == "/var/www/test.html"
    end
  end

  it "should detect if string is remote " do
    ["http://google.com", "google.com"].each do |location|
      Skyscraper::Path.remote?(location).should == true
    end
  end

  it "should detect if string is not remote " do
    ["/var/www/projects", "/var/www/projects/file.ext"].each do |location|
      Skyscraper::Path.remote?(location).should == false
    end
  end

  it "should check if is absolute address" do
    Skyscraper::Path.absolute?("/some/relative/path").should == true
  end

  it "should returns nil for wrong path" do
    # A location ending in "/" has no file component.
    directory_only = path_for("/var/www/files/")
    directory_only.file_name.should == nil
  end
end
@@ -0,0 +1,151 @@
1
# Specs for Skyscraper::Results: fetching records for the pages and fields
# declared on a Skyscraper::Base, with delay, limit, callback and error-handling
# options. path_to is a helper defined outside this file.
describe Skyscraper::Results do
  # Builds a Skyscraper::Base from +options+ and returns the fetched results.
  #
  # options[:path]    - page location or array of locations, passed to Base#pages
  # options[:fields]  - hash of field name => selector, each passed to Base#field
  # options[:options] - hash passed to Skyscraper::Results.new as its second argument
  #
  # reverse_merge! is not core Ruby; presumably ActiveSupport is loaded by the
  # library or the spec helper.
  def fetch options = {}
    options.reverse_merge! fields: {}, options: {}
    base = Skyscraper::Base.new
    base.pages options[:path]

    options[:fields].each_pair do |key, value|
      base.field key, value
    end

    Skyscraper::Results.new(base, options[:options]).fetch
  end

  it "should fetch file content" do
    results = fetch path: path_to("skyscraper-fetch.html"), fields: { h1: "h1" }
    results[0][:h1].should == "Hello world"
  end

  it "should fetch with delay" do
    time = Time.now
    fetch(path: [path_to("skyscraper-fetch.html")] * 2, fields: { title: "title" }, options: { delay: 1 })
    time_diff = Time.now - time
    time_diff.should > 1
  end

  it "should fetch with delay after" do
    # Presumably { sleep: 1, after: 7 } pauses one second after every 7 pages:
    # 10 pages then take between 1 and 3 seconds.
    time = Time.now
    fetch(path: [path_to("skyscraper-fetch.html")] * 10, options: { delay: { sleep: 1, after: 7 } })
    time_diff = Time.now - time
    time_diff.should > 1
    time_diff.should < 3

    # With after: 11 the threshold is never reached for 10 pages, so no pause.
    time = Time.now
    fetch(path: [path_to("skyscraper-fetch.html")] * 10, options: { delay: { sleep: 1, after: 11 } })
    time_diff = Time.now - time
    time_diff.should < 1
  end

  it "should fetch with results limit" do
    results = fetch path: [path_to("skyscraper-fetch.html")] * 11, options: { limit: 10 }
    results.length.should == 10
  end

  it "should apply config defaults" do
    base = Skyscraper::Base.new
    base.config.limit = 2
    base.pages [path_to("skyscraper-fetch.html")] * 10

    results = Skyscraper::Results.new(base).fetch
    results.length.should == 2
  end

  it "should continue after limit reached" do
    base = Skyscraper::Base.new
    base.config.limit = 1
    base.pages [path_to("skyscraper-fetch.html"), path_to("skyscraper-fetch-2.html")]
    base.field :h1, "h1"

    results = Skyscraper::Results.new(base)
    results.fetch.length.should == 1
    results.records.length.should == 1
    # continue returns only the newly fetched batch; records accumulates.
    results.continue.length.should == 1
    results.records.length.should == 2
    results.records[1][:h1].should == "Hello from A"
  end

  describe "callbacks" do
    it "should calls after each page callback" do
      @call_count = 0
      callback = proc do |result, page|
        result.should be_an_instance_of(Hash)
        page.should be_an_instance_of(Skyscraper::Node::Base)
        @call_count += 1
      end

      fetch(path: [path_to("skyscraper-fetch.html")] * 10, options: { after_each: [callback] })

      @call_count.should == 10
    end

    it "should calls after all callback" do
      @call_count = 0
      callback = proc do |results|
        results.should be_an_instance_of(Array)
        @call_count += 1
      end

      fetch(path: [path_to("skyscraper-fetch.html")] * 10, options: { after_all: [callback] })

      @call_count.should == 1
    end

    it "should change result value for each" do
      callback = proc do |result, page|
        result[:h1] += " with callback"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
      results[0][:h1].should == "Hello world with callback"
    end

    it "should change results values for all" do
      callback = proc do |results|
        results << "test"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_all: [callback] }
      results.last.should == "test"
    end

    it "should doesn't requires callback arguments" do
      # The callback's return value must not replace the record.
      callback = proc do
        "with callback"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
      results[0][:h1].should == "Hello world"
    end
  end

  describe "errors" do
    before(:all) do
      # These settings live on the global Skyscraper.config; remember the
      # incoming values so they can be put back once this group is done.
      @original_noise_errors = Skyscraper.config.noise_errors
      @original_skip_on_error = Skyscraper.config.skip_on_error
      Skyscraper.config.noise_errors = false
    end

    after(:all) do
      # Without this, skip_on_error = true from the last example would leak
      # into every spec that runs afterwards.
      Skyscraper.config.noise_errors = @original_noise_errors
      Skyscraper.config.skip_on_error = @original_skip_on_error
    end

    it "should catch invalid url exception" do
      Skyscraper.config.skip_on_error = false
      lambda do
        fetch path: "http://google.wrong"
      end.should raise_error Skyscraper::NoResourceException
    end

    it "should catch file not exists exception" do
      Skyscraper.config.skip_on_error = false
      lambda do
        fetch path: "/tmp/skyscraper/unknow_file"
      end.should raise_error Skyscraper::NoResourceException
    end

    it "should skip on error" do
      Skyscraper.config.skip_on_error = true
      begin
        fetch path: "http://google.wrong"
      rescue Skyscraper::NoResourceException
        caught = true
      end
      # The local stays nil unless the rescue branch ran.
      caught.should be_nil
    end
  end
end