skyscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
@@ -0,0 +1,36 @@
# Specs for Skyscraper::Field: a field is built from a name, a CSS selector
# and optional :attribute / :callback settings, is run against a fetched
# document, and then exposes the extracted value.
describe Skyscraper::Field do
  # The fixture page is fetched once and shared by every example.
  before(:all) do
    @page = Skyscraper.fetch(path_to("skyscraper-field.html"))
  end

  it "should find field value using css selector" do
    name_field = Skyscraper::Field.new(name: :name, selector: ".item strong.name")
    name_field.find_in_document(@page)
    name_field.value.should == "Name value"
  end

  it "should apply callback" do
    # The callback receives the matched node; doubling its href makes the
    # callback's effect visible in the resulting value.
    doubled_href = proc { |item| item.href * 2 }
    link_field = Skyscraper::Field.new(name: :name, selector: "a", callback: doubled_href)
    link_field.find_in_document(@page)
    link_field.value.should == "a.htmla.html"
  end

  it "should read attributes from elements" do
    href_field = Skyscraper::Field.new(name: :name, selector: "a", attribute: :href)
    href_field.find_in_document(@page)
    href_field.value.should == "a.html"
  end

  it "should returns text code of inner element by default" do
    # No :attribute given, so the value is expected to contain the inner text.
    text_field = Skyscraper::Field.new(name: :name, selector: ".item")
    text_field.find_in_document(@page)
    text_field.value.should include("Name value")
  end

  it "should returns html code of inner element" do
    html_field = Skyscraper::Field.new(name: :name, selector: ".item", attribute: "html")
    html_field.find_in_document(@page)
    html_field.value.should include("<strong class=\"name\">Name value</strong>")
  end
end
@@ -0,0 +1,87 @@
# Specs for Skyscraper::Node::Base: reading a node's content, following and
# downloading links, and traversing the document tree around a node.
describe Skyscraper::Node::Base do
  describe "when is initialized" do
    before(:each) do
      @node = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("div.item")
    end

    it "should returns html code" do
      @node.html.should include "<strong class=\"name\">Name value</strong>"
    end

    it "should returns class name" do
      # For the node selected with "div.item", `class` is expected to equal
      # the string "item" rather than a Ruby Class object.
      @node.class.should == "item"
    end

    it "should be auto converted to string with stripped tags" do
      @node.text.should == "Name value"
    end
  end

  it "should follow links" do
    Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("h1").text.should == "Hello from A"
  end

  it "should deep follow links" do
    Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").follow.first("h1").text.should == "Hello from B"
  end

  it "should download page" do
    remove_test_directory
    Skyscraper.config.download_path = "/tmp/skyscraper_test/nodes/:file_name"
    file = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").download
    # File.exist? replaces File.exists?, which was deprecated and then
    # removed in Ruby 3.2; File.exist? works on every Ruby version.
    File.exist?(file).should == true
  end

  describe "traversing" do
    # Every traversal example starts from the ".menu" node of the fixture.
    before(:each) do
      @node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first(".menu")
    end

    it "should find descendands items" do
      result = @node.find("li")
      result.length.should == 5
      result.map(&:text).should include "Item 4 1"
    end

    it "should returns children of element with selector" do
      node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first("#parent-3")
      node.children(".a").length.should == 4
      node.children(".b").length.should == 2
    end

    it "should returns children of element without selector" do
      result = @node.children
      result.length.should == 4
      # "Item 4 1" is matched by find("li") above but must not appear here.
      result.map(&:to_s).should_not include "Item 4 1"
    end

    it "should returns first element" do
      @node.first("li").class.should == "item-1"
    end

    it "should returns parent of item" do
      @node.parent.class.should == "parent-2"
    end

    it "should tells if element have parent" do
      @node.have_parent?.should == true
      @node.parents("html").first.have_parent?.should == false
    end

    it "should returns parents of item" do
      @node.parents.length.should == 4
    end

    it "should returns parents of item matched by selector" do
      @node.parents("div").length.should == 2
    end

    it "should returns siblings of item" do
      @node.first(".item-3").siblings.length.should == 3
    end

    it "should returns node tag" do
      @node.tag.should == "ul"
    end
  end
end
@@ -0,0 +1,58 @@
# Specs for Skyscraper::Node::Resource: downloading the resource a node
# points to, including creation of the target directory and expansion of the
# :sequence and :file_name placeholders in the download path.
describe Skyscraper::Node::Resource do
  # Downloads +node+ with the given +options+ and asserts that the path
  # returned by #download equals +path+.
  def should_download_resource_to(node, path, options = {})
    resource = Skyscraper::Node::Resource.new(node)
    resource.download(options).should == path
  end

  before(:all) do
    Skyscraper.config.download_path = "/tmp/skyscraper_test/:sequence/:file_name"
    @node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("a")
  end

  # Each example starts from a clean download directory.
  before(:each) do
    remove_test_directory
  end

  it "should create path if not exists when downloaded" do
    File.directory?("/tmp/skyscraper_test/1").should == false
    # Merely instantiating a resource must not create the directory.
    Skyscraper::Node::Resource.new(@node)
    File.directory?("/tmp/skyscraper_test/1").should == false
    Skyscraper::Node::Resource.new(@node).download
    File.directory?("/tmp/skyscraper_test/1").should == true
  end

  it "should not fail if path already exists" do
    Skyscraper::Node::Resource.new(@node).download path: "/tmp/skyscraper_test/some_directory/:file_name"
    File.directory?("/tmp/skyscraper_test/some_directory").should == true
    # NOTE(review): this second download passes no :path, so it uses the
    # configured default rather than some_directory again - confirm whether
    # the intent was to download into the already existing directory.
    Skyscraper::Node::Resource.new(@node).download
    File.directory?("/tmp/skyscraper_test/some_directory").should == true
  end

  it "should have file name" do
    resource = Skyscraper::Node::Resource.new(@node)
    resource.download.should == "/tmp/skyscraper_test/1/skyscraper-node-resource-b.html"
  end

  it "should create path with :sequence variable" do
    download_to = "/tmp/skyscraper_test/sequences/:sequence/:file_name"
    # Three consecutive downloads are expected to land in directories 1, 2, 3.
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/1/skyscraper-node-resource-b.html", path: download_to
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/2/skyscraper-node-resource-b.html", path: download_to
    should_download_resource_to @node, "/tmp/skyscraper_test/sequences/3/skyscraper-node-resource-b.html", path: download_to
  end

  it "should create custom file name if provided" do
    download_to = "/tmp/skyscraper_test/custom_name/:file_name"
    should_download_resource_to @node, "/tmp/skyscraper_test/custom_name/test.html", path: download_to, file_name: "test.html"
  end

  it "should download resource" do
    Skyscraper::Node::Resource.new(@node).download
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    File.exist?("/tmp/skyscraper_test/1/skyscraper-node-resource-b.html").should == true
  end

  it "should download image" do
    image_node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("img")
    Skyscraper::Node::Resource.new(image_node).download
    File.exist?("/tmp/skyscraper_test/1/skyscraper-node-resource-image.png").should == true
  end
end
@@ -0,0 +1,2 @@
# Example group for the Skyscraper::Node namespace. It declares no examples.
describe(Skyscraper::Node) do
end
@@ -0,0 +1,46 @@
# Specs for Skyscraper::Pages: a page source may be a string, an array
# (possibly nested) or a block; #items exposes the resulting flat list and
# #next / #reset walk through it.
describe Skyscraper::Pages do
  it "should set convert string to items array" do
    pages = Skyscraper::Pages.new("http://google.com")
    pages.items.should == ["http://google.com"]
  end

  it "should set items array from array" do
    pages = Skyscraper::Pages.new(["http://google.com"])
    pages.items.should == ["http://google.com"]
  end

  it "should flat pages from nested arrays" do
    pages = Skyscraper::Pages.new(["http://google.com", ["http://yahoo.com"]])
    pages.items.should == ["http://google.com", "http://yahoo.com"]
  end

  it "should set array from block" do
    pages = Skyscraper::Pages.new do
      2.times.map { |index| "http://google.com/#{index}.html" }
    end
    pages.items.should == ["http://google.com/0.html", "http://google.com/1.html"]
  end

  it "should pass scraper instance to block" do
    # The block argument is used as a scraper to look up the link to follow.
    pages = Skyscraper::Pages.new do |scraper|
      scraper.fetch(path_to("skyscraper-pages.html")).first("a").href
    end
    pages.items.should == ["a.html"]
  end

  it "should works when block is passed without arguments" do
    pages = Skyscraper::Pages.new do
      "a.html"
    end
    pages.items.should == ["a.html"]
  end

  it "should return next item" do
    pages = Skyscraper::Pages.new(["a", "b", "c"])
    # Successive calls to #next must yield the items in their original order.
    ["a", "b", "c"].each do |expected|
      pages.next.should == expected
    end
  end

  it "should reset pages" do
    pages = Skyscraper::Pages.new(["a", "b", "c"])
    ["a", "b"].each do |expected|
      pages.next.should == expected
    end
    pages.reset
    # After a reset, iteration starts over from the first item.
    pages.next.should == "a"
  end
end
@@ -0,0 +1,110 @@
require "spec_helper"

# Specs for Skyscraper::Path: Path.factory builds a path object for a remote
# URL or a local file-system path, and the examples check the components
# each kind exposes plus the remote?/absolute? predicates.
describe Skyscraper::Path do
  describe "when path is REMOTE" do
    before(:each) do
      @path = Skyscraper::Path.factory("http://google.com/index.php?q=e")
    end

    it "should returns domain" do
      @path.domain.should == "http://google.com"
    end

    it "should returns domain with no scheme" do
      # NOTE(review): despite its name this example asserts on #full_path,
      # not #domain - confirm which one was meant.
      @path = Skyscraper::Path.factory("google.com/index.php?q=e")
      @path.full_path.should == "google.com/index.php?q=e"
    end

    it "should returns path" do
      @path.path.should == "/index.php"
    end

    it "should returns query" do
      @path.query.should == "?q=e"
    end

    it "should returns base" do
      @path.base.should == "http://google.com/"
    end

    it "should returns full path" do
      @path.full_path.should == "http://google.com/index.php?q=e"
    end

    it "should be converted to string" do
      @path.to_s.should == @path.full_path
    end

    it "should returns full path for full different" do
      path = Skyscraper::Path.factory("http://google.com/a/index.php")
      path.full_path_for("http://yahoo.com/b.html").should == "http://yahoo.com/b.html"
    end

    it "should returns full path for relative" do
      path = Skyscraper::Path.factory("http://google.com/a/index.php")
      path.full_path_for("b.html").should == "http://google.com/a/b.html"
    end

    it "should returns full path for absolute" do
      path = Skyscraper::Path.factory("http://google.com/a/index.php")
      path.full_path_for("/b.html").should == "http://google.com/b.html"
    end

    it "should returns file name" do
      path = Skyscraper::Path.factory("http://google.com/a/index.php")
      path.file_name.should == "index.php"
    end
  end

  describe "when path is LOCAL" do
    before(:each) do
      @path = Skyscraper::Path.factory("/var/www/files/file.ext")
    end

    it "should returns folder" do
      @path.folder.should == "/var/www/files/"
    end

    it "should returns file name" do
      @path.file_name.should == "file.ext"
    end

    it "should returns full path" do
      @path.full_path.should == "/var/www/files/file.ext"
    end

    it "should returns base" do
      @path.base.should == "/var/www/files/"
    end

    it "should returns full path for relative" do
      path = Skyscraper::Path.factory("/var/www/public/index.html")
      # "../" segments are expected to be kept verbatim, not collapsed.
      path.full_path_for("../b.html").should == "/var/www/public/../b.html"
      path.full_path_for("b.html").should == "/var/www/public/b.html"
    end

    it "should returns full path for absolute full" do
      path = Skyscraper::Path.factory("/var/www/public/index.html")
      path.full_path_for("/var/www/test.html").should == "/var/www/test.html"
    end
  end

  it "should detect if string is remote " do
    Skyscraper::Path.remote?("http://google.com").should == true
    # A bare host name without a scheme is expected to count as remote too.
    Skyscraper::Path.remote?("google.com").should == true
  end

  it "should detect if string is not remote " do
    Skyscraper::Path.remote?("/var/www/projects").should == false
    Skyscraper::Path.remote?("/var/www/projects/file.ext").should == false
  end

  it "should check if is absolute address" do
    # The fixture text says "relative", but the leading slash is what the
    # example relies on: the path is asserted to be absolute.
    Skyscraper::Path.absolute?("/some/relative/path").should == true
  end

  it "should returns nil for wrong path" do
    path = Skyscraper::Path.factory("/var/www/files/")
    # be_nil states the nil expectation directly instead of comparing with ==.
    path.file_name.should be_nil
  end
end
@@ -0,0 +1,151 @@
# Specs for Skyscraper::Results: fetching records for the pages and fields
# configured on a Skyscraper::Base, with delay, limit, callback and
# error-handling options.
describe Skyscraper::Results do
  # Builds a Skyscraper::Base from +options+ and returns the fetched results.
  #
  # Recognised keys:
  #   :path    - value passed to Base#pages
  #   :fields  - hash whose key/value pairs are passed to Base#field
  #   :options - hash passed on to Skyscraper::Results.new
  #
  # Defaults are merged into a copy, so the caller's hash is left untouched.
  def fetch(options = {})
    options = { fields: {}, options: {} }.merge(options)
    base = Skyscraper::Base.new
    base.pages options[:path]

    options[:fields].each_pair do |key, value|
      base.field key, value
    end

    Skyscraper::Results.new(base, options[:options]).fetch
  end

  it "should fetch file content" do
    results = fetch path: path_to("skyscraper-fetch.html"), fields: { h1: "h1" }
    results[0][:h1].should == "Hello world"
  end

  it "should fetch with delay" do
    started_at = Time.now
    fetch path: [path_to("skyscraper-fetch.html")] * 2, fields: { title: "title" }, options: { delay: 1 }
    elapsed = Time.now - started_at
    elapsed.should > 1
  end

  it "should fetch with delay after" do
    # Ten pages with a one-second sleep configured "after: 7" are expected
    # to take more than one but less than three seconds.
    started_at = Time.now
    fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { delay: { sleep: 1, after: 7 } }
    elapsed = Time.now - started_at
    elapsed.should > 1
    elapsed.should < 3

    # With "after: 11" and only ten pages, no sleep is expected at all.
    started_at = Time.now
    fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { delay: { sleep: 1, after: 11 } }
    elapsed = Time.now - started_at
    elapsed.should < 1
  end

  it "should fetch with results limit" do
    results = fetch path: [path_to("skyscraper-fetch.html")] * 11, options: { limit: 10 }
    results.length.should == 10
  end

  it "should apply config defaults" do
    base = Skyscraper::Base.new
    base.config.limit = 2
    base.pages [path_to("skyscraper-fetch.html")] * 10

    results = Skyscraper::Results.new(base).fetch
    results.length.should == 2
  end

  it "should continue after limit reached" do
    base = Skyscraper::Base.new
    base.config.limit = 1
    base.pages [path_to("skyscraper-fetch.html"), path_to("skyscraper-fetch-2.html")]
    base.field :h1, "h1"

    results = Skyscraper::Results.new(base)
    results.fetch.length.should == 1
    results.records.length.should == 1
    # #continue returns only the newly fetched batch, while #records
    # accumulates everything fetched so far.
    results.continue.length.should == 1
    results.records.length.should == 2
    results.records[1][:h1].should == "Hello from A"
  end

  describe "callbacks" do
    it "should calls after each page callback" do
      @call_count = 0
      callback = proc do |result, page|
        result.should be_an_instance_of(Hash)
        page.should be_an_instance_of(Skyscraper::Node::Base)
        @call_count += 1
      end

      fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { after_each: [callback] }

      @call_count.should == 10
    end

    it "should calls after all callback" do
      @call_count = 0
      callback = proc do |results|
        results.should be_an_instance_of(Array)
        @call_count += 1
      end

      fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { after_all: [callback] }

      @call_count.should == 1
    end

    it "should change result value for each" do
      callback = proc do |result, page|
        result[:h1] += " with callback"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
      results[0][:h1].should == "Hello world with callback"
    end

    it "should change results values for all" do
      callback = proc do |results|
        results << "test"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_all: [callback] }
      results.last.should == "test"
    end

    it "should doesn't requires callback arguments" do
      # A callback that takes no arguments must be accepted, and its return
      # value must not alter the record.
      callback = proc do
        "with callback"
      end

      results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
      results[0][:h1].should == "Hello world"
    end
  end

  describe "errors" do
    before(:all) do
      Skyscraper.config.noise_errors = false
    end

    it "should catch invalid url exception" do
      Skyscraper.config.skip_on_error = false
      lambda do
        fetch path: "http://google.wrong"
      end.should raise_error Skyscraper::NoResourceException
    end

    it "should catch file not exists exception" do
      Skyscraper.config.skip_on_error = false
      lambda do
        fetch path: "/tmp/skyscraper/unknow_file"
      end.should raise_error Skyscraper::NoResourceException
    end

    it "should skip on error" do
      Skyscraper.config.skip_on_error = true
      # Stays nil unless the exception escapes from fetch.
      catched = nil
      begin
        fetch path: "http://google.wrong"
      rescue Skyscraper::NoResourceException
        catched = true
      end
      catched.should be_nil
    end
  end
end