skyscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
@@ -0,0 +1,39 @@
1
+ #encoding: utf-8
2
+
3
+ class TestScraper
4
+ include Skyscraper
5
+
6
+ pages [path_to("skyscraper.html")] * 2
7
+ field :h1, "h1"
8
+ end
9
+
10
+ describe Skyscraper do
11
+ it "requires necessery libraries" do
12
+ require("open-uri").should == false
13
+ require("uri").should == false
14
+ require("nokogiri").should == false
15
+ require("active_support").should == false
16
+ end
17
+
18
+ it "should fetch remote page" do
19
+ Skyscraper::fetch("http://google.com").should be_an Skyscraper::Node::Base
20
+ end
21
+
22
+ it "static method fetch should works" do
23
+ Skyscraper::fetch(path_to("skyscraper.html")).first("h1").text.should == "Hello world"
24
+ end
25
+
26
+ it "should support utf-8 encoding by default" do
27
+ Skyscraper::fetch(path_to("skyscraper-encoding.html")).first(".utf-8").text.should == "ąśćżół"
28
+ end
29
+
30
+ it "should works when included" do
31
+ TestScraper.new.fetch[0][:h1].should == "Hello world"
32
+ TestScraper.new.fetch[1][:h1].should == "Hello world"
33
+ end
34
+
35
+ it "should allow to set variable for config in chaining" do
36
+ Skyscraper.config.foo = "bar"
37
+ Skyscraper.config.foo.should == "bar"
38
+ end
39
+ end
@@ -0,0 +1,3 @@
1
+ require "skyscraper"
2
+ require "support/skyscraper_helpers"
3
+ include SkyscraperHelpers
@@ -0,0 +1,9 @@
1
+ module SkyscraperHelpers
2
+ def path_to name
3
+ "#{File.expand_path('../../test_files', __FILE__)}/#{name}"
4
+ end
5
+
6
+ def remove_test_directory
7
+ `rm -rf /tmp/skyscraper_test`
8
+ end
9
+ end
@@ -0,0 +1,12 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Encoding</title>
5
+ </head>
6
+
7
+ <body>
8
+ <div class="utf-8">ąśćżół</div>
9
+ <div class="euc-jp">
10
+ </body>
11
+ </html>
12
+
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,12 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Encoding</title>
5
+ </head>
6
+
7
+ <body>
8
+ <div class="utf-8">ąśćżół</div>
9
+ <div class="euc-jp">ナ</div>
10
+ </body>
11
+ </html>
12
+
@@ -0,0 +1,11 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from A</h1>
9
+ <a href="b.html">B</a>
10
+ </body>
11
+ </html>
@@ -0,0 +1,31 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
31
+
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,11 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from A</h1>
9
+ <a href="skyscraper-node-base-b.html">b</a>
10
+ </body>
11
+ </html>
@@ -0,0 +1,10 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from B</h1>
9
+ </body>
10
+ </html>
@@ -0,0 +1,34 @@
1
+ <!doctype html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Demo</title>
6
+ </head>
7
+ <body>
8
+ <a href="http://jquery.com/">jQuery</a>
9
+ <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script>
10
+ <div class="parent-1">
11
+ <div class="parent-2">
12
+ <ul class="menu">
13
+ <li class="item-1">Item 1</li>
14
+ <li class="item-2">Item 2</li>
15
+ <li class="item-3">Item 3</li>
16
+ <li class="item-4">
17
+ <ul class="menu-4">
18
+ <li class="item-4-1">Item 4 1</li>
19
+ </ul>
20
+ </li>
21
+ </ul>
22
+ <div id="parent-3">
23
+ <p class="a">a</p>
24
+ <p class="a">a</p>
25
+ <p class="a">a</p>
26
+ <p class="a">a</p>
27
+ <p class="b">b</p>
28
+ <p class="b">b</p>
29
+ </div>
30
+ </div>
31
+ </div>
32
+ </body>
33
+ </html>
34
+
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="skyscraper-node-base-a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,10 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from B</h1>
9
+ </body>
10
+ </html>
@@ -0,0 +1,12 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from A</h1>
9
+ <a href="skyscraper-node-resource-b.html">B</a>
10
+ <img src="skyscraper-node-resource-image.png">
11
+ </body>
12
+ </html>
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
metadata ADDED
@@ -0,0 +1,169 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skyscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Adam Dratwinski
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &71707250 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *71707250
25
+ - !ruby/object:Gem::Dependency
26
+ name: rake
27
+ requirement: &71707040 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *71707040
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ requirement: &71706750 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *71706750
47
+ - !ruby/object:Gem::Dependency
48
+ name: actionpack
49
+ requirement: &71706490 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *71706490
58
+ description: Library that helps scraping data from websites in easy way
59
+ email:
60
+ - arboooz@gmail.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - .gitignore
66
+ - .rspec
67
+ - Gemfile
68
+ - LICENSE
69
+ - README.md
70
+ - Rakefile
71
+ - lib/skyscraper.rb
72
+ - lib/skyscraper/base.rb
73
+ - lib/skyscraper/config.rb
74
+ - lib/skyscraper/document.rb
75
+ - lib/skyscraper/field.rb
76
+ - lib/skyscraper/node.rb
77
+ - lib/skyscraper/node/base.rb
78
+ - lib/skyscraper/node/resource.rb
79
+ - lib/skyscraper/pages.rb
80
+ - lib/skyscraper/path.rb
81
+ - lib/skyscraper/path/base.rb
82
+ - lib/skyscraper/path/local.rb
83
+ - lib/skyscraper/path/remote.rb
84
+ - lib/skyscraper/results.rb
85
+ - lib/version.rb
86
+ - skyscraper.gemspec
87
+ - spec/skyscraper/skyscraper/base_spec.rb
88
+ - spec/skyscraper/skyscraper/config_spec.rb
89
+ - spec/skyscraper/skyscraper/document_spec.rb
90
+ - spec/skyscraper/skyscraper/field_spec.rb
91
+ - spec/skyscraper/skyscraper/node/base_spec.rb
92
+ - spec/skyscraper/skyscraper/node/resource_spec.rb
93
+ - spec/skyscraper/skyscraper/node_spec.rb
94
+ - spec/skyscraper/skyscraper/pages_spec.rb
95
+ - spec/skyscraper/skyscraper/path_spec.rb
96
+ - spec/skyscraper/skyscraper/results_spec.rb
97
+ - spec/skyscraper/skyscraper_spec.rb
98
+ - spec/spec_helper.rb
99
+ - spec/support/skyscraper_helpers.rb
100
+ - spec/test_files/encoding.html~
101
+ - spec/test_files/skyscraper-base.html
102
+ - spec/test_files/skyscraper-document.html
103
+ - spec/test_files/skyscraper-encoding.html
104
+ - spec/test_files/skyscraper-fetch-2.html
105
+ - spec/test_files/skyscraper-fetch.html
106
+ - spec/test_files/skyscraper-field.html
107
+ - spec/test_files/skyscraper-node-base-a.html
108
+ - spec/test_files/skyscraper-node-base-b.html
109
+ - spec/test_files/skyscraper-node-base-traversing.html
110
+ - spec/test_files/skyscraper-node-base.html
111
+ - spec/test_files/skyscraper-node-resource-b.html
112
+ - spec/test_files/skyscraper-node-resource-image.png
113
+ - spec/test_files/skyscraper-node-resource.html
114
+ - spec/test_files/skyscraper-pages.html
115
+ - spec/test_files/skyscraper.html
116
+ homepage: https://github.com/boooz/skyscraper
117
+ licenses: []
118
+ post_install_message:
119
+ rdoc_options: []
120
+ require_paths:
121
+ - lib
122
+ required_ruby_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ required_rubygems_version: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ requirements: []
135
+ rubyforge_project:
136
+ rubygems_version: 1.8.15
137
+ signing_key:
138
+ specification_version: 3
139
+ summary: Library that helps scraping data from websites in easy way
140
+ test_files:
141
+ - spec/skyscraper/skyscraper/base_spec.rb
142
+ - spec/skyscraper/skyscraper/config_spec.rb
143
+ - spec/skyscraper/skyscraper/document_spec.rb
144
+ - spec/skyscraper/skyscraper/field_spec.rb
145
+ - spec/skyscraper/skyscraper/node/base_spec.rb
146
+ - spec/skyscraper/skyscraper/node/resource_spec.rb
147
+ - spec/skyscraper/skyscraper/node_spec.rb
148
+ - spec/skyscraper/skyscraper/pages_spec.rb
149
+ - spec/skyscraper/skyscraper/path_spec.rb
150
+ - spec/skyscraper/skyscraper/results_spec.rb
151
+ - spec/skyscraper/skyscraper_spec.rb
152
+ - spec/spec_helper.rb
153
+ - spec/support/skyscraper_helpers.rb
154
+ - spec/test_files/encoding.html~
155
+ - spec/test_files/skyscraper-base.html
156
+ - spec/test_files/skyscraper-document.html
157
+ - spec/test_files/skyscraper-encoding.html
158
+ - spec/test_files/skyscraper-fetch-2.html
159
+ - spec/test_files/skyscraper-fetch.html
160
+ - spec/test_files/skyscraper-field.html
161
+ - spec/test_files/skyscraper-node-base-a.html
162
+ - spec/test_files/skyscraper-node-base-b.html
163
+ - spec/test_files/skyscraper-node-base-traversing.html
164
+ - spec/test_files/skyscraper-node-base.html
165
+ - spec/test_files/skyscraper-node-resource-b.html
166
+ - spec/test_files/skyscraper-node-resource-image.png
167
+ - spec/test_files/skyscraper-node-resource.html
168
+ - spec/test_files/skyscraper-pages.html
169
+ - spec/test_files/skyscraper.html