skyscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
@@ -0,0 +1,39 @@
1
+ #encoding: utf-8
2
+
3
+ class TestScraper
4
+ include Skyscraper
5
+
6
+ pages [path_to("skyscraper.html")] * 2
7
+ field :h1, "h1"
8
+ end
9
+
10
+ describe Skyscraper do
11
+ it "requires necessery libraries" do
12
+ require("open-uri").should == false
13
+ require("uri").should == false
14
+ require("nokogiri").should == false
15
+ require("active_support").should == false
16
+ end
17
+
18
+ it "should fetch remote page" do
19
+ Skyscraper::fetch("http://google.com").should be_an Skyscraper::Node::Base
20
+ end
21
+
22
+ it "static method fetch should works" do
23
+ Skyscraper::fetch(path_to("skyscraper.html")).first("h1").text.should == "Hello world"
24
+ end
25
+
26
+ it "should support utf-8 encoding by default" do
27
+ Skyscraper::fetch(path_to("skyscraper-encoding.html")).first(".utf-8").text.should == "ąśćżół"
28
+ end
29
+
30
+ it "should works when included" do
31
+ TestScraper.new.fetch[0][:h1].should == "Hello world"
32
+ TestScraper.new.fetch[1][:h1].should == "Hello world"
33
+ end
34
+
35
+ it "should allow to set variable for config in chaining" do
36
+ Skyscraper.config.foo = "bar"
37
+ Skyscraper.config.foo.should == "bar"
38
+ end
39
+ end
@@ -0,0 +1,3 @@
1
+ require "skyscraper"
2
+ require "support/skyscraper_helpers"
3
+ include SkyscraperHelpers
@@ -0,0 +1,9 @@
1
+ module SkyscraperHelpers
2
+ def path_to name
3
+ "#{File.expand_path('../../test_files', __FILE__)}/#{name}"
4
+ end
5
+
6
+ def remove_test_directory
7
+ `rm -rf /tmp/skyscraper_test`
8
+ end
9
+ end
@@ -0,0 +1,12 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Encoding</title>
5
+ </head>
6
+
7
+ <body>
8
+ <div class="utf-8">ąśćżół</div>
9
+ <div class="euc-jp">
10
+ </body>
11
+ </html>
12
+
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,12 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Encoding</title>
5
+ </head>
6
+
7
+ <body>
8
+ <div class="utf-8">ąśćżół</div>
9
+ <div class="euc-jp">ナ</div>
10
+ </body>
11
+ </html>
12
+
@@ -0,0 +1,11 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from A</h1>
9
+ <a href="b.html">B</a>
10
+ </body>
11
+ </html>
@@ -0,0 +1,31 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
31
+
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,11 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from A</h1>
9
+ <a href="skyscraper-node-base-b.html">b</a>
10
+ </body>
11
+ </html>
@@ -0,0 +1,10 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from B</h1>
9
+ </body>
10
+ </html>
@@ -0,0 +1,34 @@
1
+ <!doctype html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Demo</title>
6
+ </head>
7
+ <body>
8
+ <a href="http://jquery.com/">jQuery</a>
9
+ <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script>
10
+ <div class="parent-1">
11
+ <div class="parent-2">
12
+ <ul class="menu">
13
+ <li class="item-1">Item 1</li>
14
+ <li class="item-2">Item 2</li>
15
+ <li class="item-3">Item 3</li>
16
+ <li class="item-4">
17
+ <ul class="menu-4">
18
+ <li class="item-4-1">Item 4 1</li>
19
+ </ul>
20
+ </li>
21
+ </ul>
22
+ <div id="parent-3">
23
+ <p class="a">a</p>
24
+ <p class="a">a</p>
25
+ <p class="a">a</p>
26
+ <p class="a">a</p>
27
+ <p class="b">b</p>
28
+ <p class="b">b</p>
29
+ </div>
30
+ </div>
31
+ </div>
32
+ </body>
33
+ </html>
34
+
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="skyscraper-node-base-a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,10 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from B</h1>
9
+ </body>
10
+ </html>
@@ -0,0 +1,12 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello from A</h1>
9
+ <a href="skyscraper-node-resource-b.html">B</a>
10
+ <img src="skyscraper-node-resource-image.png">
11
+ </body>
12
+ </html>
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
@@ -0,0 +1,30 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <h1>Hello world</h1>
9
+ <ul class="menu">
10
+ <li>
11
+ <a href="a.html">A</a>
12
+ <a href="b.html">A</a>
13
+ <a href="c.html">A</a>
14
+ <a href="d.html">A</a>
15
+ </li>
16
+ </ul>
17
+ <ul class="menu-full">
18
+ <li>
19
+ <a href="http://google.com/a.html">A</a>
20
+ <a href="http://google.com/b.html">A</a>
21
+ <a href="http://google.com/c.html">A</a>
22
+ <a href="http://google.com/d.html">A</a>
23
+ </li>
24
+ </ul>
25
+
26
+ <div class="item">
27
+ <strong class="name">Name value</strong>
28
+ </div>
29
+ </body>
30
+ </html>
metadata ADDED
@@ -0,0 +1,169 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skyscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Adam Dratwinski
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &71707250 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *71707250
25
+ - !ruby/object:Gem::Dependency
26
+ name: rake
27
+ requirement: &71707040 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *71707040
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ requirement: &71706750 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *71706750
47
+ - !ruby/object:Gem::Dependency
48
+ name: actionpack
49
+ requirement: &71706490 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *71706490
58
+ description: Library that helps scraping data from websites in easy way
59
+ email:
60
+ - arboooz@gmail.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - .gitignore
66
+ - .rspec
67
+ - Gemfile
68
+ - LICENSE
69
+ - README.md
70
+ - Rakefile
71
+ - lib/skyscraper.rb
72
+ - lib/skyscraper/base.rb
73
+ - lib/skyscraper/config.rb
74
+ - lib/skyscraper/document.rb
75
+ - lib/skyscraper/field.rb
76
+ - lib/skyscraper/node.rb
77
+ - lib/skyscraper/node/base.rb
78
+ - lib/skyscraper/node/resource.rb
79
+ - lib/skyscraper/pages.rb
80
+ - lib/skyscraper/path.rb
81
+ - lib/skyscraper/path/base.rb
82
+ - lib/skyscraper/path/local.rb
83
+ - lib/skyscraper/path/remote.rb
84
+ - lib/skyscraper/results.rb
85
+ - lib/version.rb
86
+ - skyscraper.gemspec
87
+ - spec/skyscraper/skyscraper/base_spec.rb
88
+ - spec/skyscraper/skyscraper/config_spec.rb
89
+ - spec/skyscraper/skyscraper/document_spec.rb
90
+ - spec/skyscraper/skyscraper/field_spec.rb
91
+ - spec/skyscraper/skyscraper/node/base_spec.rb
92
+ - spec/skyscraper/skyscraper/node/resource_spec.rb
93
+ - spec/skyscraper/skyscraper/node_spec.rb
94
+ - spec/skyscraper/skyscraper/pages_spec.rb
95
+ - spec/skyscraper/skyscraper/path_spec.rb
96
+ - spec/skyscraper/skyscraper/results_spec.rb
97
+ - spec/skyscraper/skyscraper_spec.rb
98
+ - spec/spec_helper.rb
99
+ - spec/support/skyscraper_helpers.rb
100
+ - spec/test_files/encoding.html~
101
+ - spec/test_files/skyscraper-base.html
102
+ - spec/test_files/skyscraper-document.html
103
+ - spec/test_files/skyscraper-encoding.html
104
+ - spec/test_files/skyscraper-fetch-2.html
105
+ - spec/test_files/skyscraper-fetch.html
106
+ - spec/test_files/skyscraper-field.html
107
+ - spec/test_files/skyscraper-node-base-a.html
108
+ - spec/test_files/skyscraper-node-base-b.html
109
+ - spec/test_files/skyscraper-node-base-traversing.html
110
+ - spec/test_files/skyscraper-node-base.html
111
+ - spec/test_files/skyscraper-node-resource-b.html
112
+ - spec/test_files/skyscraper-node-resource-image.png
113
+ - spec/test_files/skyscraper-node-resource.html
114
+ - spec/test_files/skyscraper-pages.html
115
+ - spec/test_files/skyscraper.html
116
+ homepage: https://github.com/boooz/skyscraper
117
+ licenses: []
118
+ post_install_message:
119
+ rdoc_options: []
120
+ require_paths:
121
+ - lib
122
+ required_ruby_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ required_rubygems_version: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ requirements: []
135
+ rubyforge_project:
136
+ rubygems_version: 1.8.15
137
+ signing_key:
138
+ specification_version: 3
139
+ summary: Library that helps scraping data from websites in easy way
140
+ test_files:
141
+ - spec/skyscraper/skyscraper/base_spec.rb
142
+ - spec/skyscraper/skyscraper/config_spec.rb
143
+ - spec/skyscraper/skyscraper/document_spec.rb
144
+ - spec/skyscraper/skyscraper/field_spec.rb
145
+ - spec/skyscraper/skyscraper/node/base_spec.rb
146
+ - spec/skyscraper/skyscraper/node/resource_spec.rb
147
+ - spec/skyscraper/skyscraper/node_spec.rb
148
+ - spec/skyscraper/skyscraper/pages_spec.rb
149
+ - spec/skyscraper/skyscraper/path_spec.rb
150
+ - spec/skyscraper/skyscraper/results_spec.rb
151
+ - spec/skyscraper/skyscraper_spec.rb
152
+ - spec/spec_helper.rb
153
+ - spec/support/skyscraper_helpers.rb
154
+ - spec/test_files/encoding.html~
155
+ - spec/test_files/skyscraper-base.html
156
+ - spec/test_files/skyscraper-document.html
157
+ - spec/test_files/skyscraper-encoding.html
158
+ - spec/test_files/skyscraper-fetch-2.html
159
+ - spec/test_files/skyscraper-fetch.html
160
+ - spec/test_files/skyscraper-field.html
161
+ - spec/test_files/skyscraper-node-base-a.html
162
+ - spec/test_files/skyscraper-node-base-b.html
163
+ - spec/test_files/skyscraper-node-base-traversing.html
164
+ - spec/test_files/skyscraper-node-base.html
165
+ - spec/test_files/skyscraper-node-resource-b.html
166
+ - spec/test_files/skyscraper-node-resource-image.png
167
+ - spec/test_files/skyscraper-node-resource.html
168
+ - spec/test_files/skyscraper-pages.html
169
+ - spec/test_files/skyscraper.html