skyscraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +180 -0
- data/Rakefile +5 -0
- data/lib/skyscraper.rb +56 -0
- data/lib/skyscraper/base.rb +44 -0
- data/lib/skyscraper/config.rb +15 -0
- data/lib/skyscraper/document.rb +11 -0
- data/lib/skyscraper/field.rb +24 -0
- data/lib/skyscraper/node.rb +8 -0
- data/lib/skyscraper/node/base.rb +103 -0
- data/lib/skyscraper/node/resource.rb +57 -0
- data/lib/skyscraper/pages.rb +27 -0
- data/lib/skyscraper/path.rb +29 -0
- data/lib/skyscraper/path/base.rb +15 -0
- data/lib/skyscraper/path/local.rb +29 -0
- data/lib/skyscraper/path/remote.rb +32 -0
- data/lib/skyscraper/results.rb +93 -0
- data/lib/version.rb +3 -0
- data/skyscraper.gemspec +22 -0
- data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
- data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
- data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
- data/spec/skyscraper/skyscraper_spec.rb +39 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/support/skyscraper_helpers.rb +9 -0
- data/spec/test_files/encoding.html~ +12 -0
- data/spec/test_files/skyscraper-base.html +30 -0
- data/spec/test_files/skyscraper-document.html +30 -0
- data/spec/test_files/skyscraper-encoding.html +12 -0
- data/spec/test_files/skyscraper-fetch-2.html +11 -0
- data/spec/test_files/skyscraper-fetch.html +31 -0
- data/spec/test_files/skyscraper-field.html +30 -0
- data/spec/test_files/skyscraper-node-base-a.html +11 -0
- data/spec/test_files/skyscraper-node-base-b.html +10 -0
- data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
- data/spec/test_files/skyscraper-node-base.html +30 -0
- data/spec/test_files/skyscraper-node-resource-b.html +10 -0
- data/spec/test_files/skyscraper-node-resource-image.png +0 -0
- data/spec/test_files/skyscraper-node-resource.html +12 -0
- data/spec/test_files/skyscraper-pages.html +30 -0
- data/spec/test_files/skyscraper.html +30 -0
- metadata +169 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
|
3
|
+
class TestScraper
|
4
|
+
include Skyscraper
|
5
|
+
|
6
|
+
pages [path_to("skyscraper.html")] * 2
|
7
|
+
field :h1, "h1"
|
8
|
+
end
|
9
|
+
|
10
|
+
describe Skyscraper do
|
11
|
+
it "requires necessery libraries" do
|
12
|
+
require("open-uri").should == false
|
13
|
+
require("uri").should == false
|
14
|
+
require("nokogiri").should == false
|
15
|
+
require("active_support").should == false
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should fetch remote page" do
|
19
|
+
Skyscraper::fetch("http://google.com").should be_an Skyscraper::Node::Base
|
20
|
+
end
|
21
|
+
|
22
|
+
it "static method fetch should works" do
|
23
|
+
Skyscraper::fetch(path_to("skyscraper.html")).first("h1").text.should == "Hello world"
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should support utf-8 encoding by default" do
|
27
|
+
Skyscraper::fetch(path_to("skyscraper-encoding.html")).first(".utf-8").text.should == "ąśćżół"
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should works when included" do
|
31
|
+
TestScraper.new.fetch[0][:h1].should == "Hello world"
|
32
|
+
TestScraper.new.fetch[1][:h1].should == "Hello world"
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should allow to set variable for config in chaining" do
|
36
|
+
Skyscraper.config.foo = "bar"
|
37
|
+
Skyscraper.config.foo.should == "bar"
|
38
|
+
end
|
39
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
</head>
|
6
|
+
|
7
|
+
<body>
|
8
|
+
<h1>Hello world</h1>
|
9
|
+
<ul class="menu">
|
10
|
+
<li>
|
11
|
+
<a href="a.html">A</a>
|
12
|
+
<a href="b.html">A</a>
|
13
|
+
<a href="c.html">A</a>
|
14
|
+
<a href="d.html">A</a>
|
15
|
+
</li>
|
16
|
+
</ul>
|
17
|
+
<ul class="menu-full">
|
18
|
+
<li>
|
19
|
+
<a href="http://google.com/a.html">A</a>
|
20
|
+
<a href="http://google.com/b.html">A</a>
|
21
|
+
<a href="http://google.com/c.html">A</a>
|
22
|
+
<a href="http://google.com/d.html">A</a>
|
23
|
+
</li>
|
24
|
+
</ul>
|
25
|
+
|
26
|
+
<div class="item">
|
27
|
+
<strong class="name">Name value</strong>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
</head>
|
6
|
+
|
7
|
+
<body>
|
8
|
+
<h1>Hello world</h1>
|
9
|
+
<ul class="menu">
|
10
|
+
<li>
|
11
|
+
<a href="a.html">A</a>
|
12
|
+
<a href="b.html">A</a>
|
13
|
+
<a href="c.html">A</a>
|
14
|
+
<a href="d.html">A</a>
|
15
|
+
</li>
|
16
|
+
</ul>
|
17
|
+
<ul class="menu-full">
|
18
|
+
<li>
|
19
|
+
<a href="http://google.com/a.html">A</a>
|
20
|
+
<a href="http://google.com/b.html">A</a>
|
21
|
+
<a href="http://google.com/c.html">A</a>
|
22
|
+
<a href="http://google.com/d.html">A</a>
|
23
|
+
</li>
|
24
|
+
</ul>
|
25
|
+
|
26
|
+
<div class="item">
|
27
|
+
<strong class="name">Name value</strong>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
@@ -0,0 +1,31 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
</head>
|
6
|
+
|
7
|
+
<body>
|
8
|
+
<h1>Hello world</h1>
|
9
|
+
<ul class="menu">
|
10
|
+
<li>
|
11
|
+
<a href="a.html">A</a>
|
12
|
+
<a href="b.html">A</a>
|
13
|
+
<a href="c.html">A</a>
|
14
|
+
<a href="d.html">A</a>
|
15
|
+
</li>
|
16
|
+
</ul>
|
17
|
+
<ul class="menu-full">
|
18
|
+
<li>
|
19
|
+
<a href="http://google.com/a.html">A</a>
|
20
|
+
<a href="http://google.com/b.html">A</a>
|
21
|
+
<a href="http://google.com/c.html">A</a>
|
22
|
+
<a href="http://google.com/d.html">A</a>
|
23
|
+
</li>
|
24
|
+
</ul>
|
25
|
+
|
26
|
+
<div class="item">
|
27
|
+
<strong class="name">Name value</strong>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
31
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
</head>
|
6
|
+
|
7
|
+
<body>
|
8
|
+
<h1>Hello world</h1>
|
9
|
+
<ul class="menu">
|
10
|
+
<li>
|
11
|
+
<a href="a.html">A</a>
|
12
|
+
<a href="b.html">A</a>
|
13
|
+
<a href="c.html">A</a>
|
14
|
+
<a href="d.html">A</a>
|
15
|
+
</li>
|
16
|
+
</ul>
|
17
|
+
<ul class="menu-full">
|
18
|
+
<li>
|
19
|
+
<a href="http://google.com/a.html">A</a>
|
20
|
+
<a href="http://google.com/b.html">A</a>
|
21
|
+
<a href="http://google.com/c.html">A</a>
|
22
|
+
<a href="http://google.com/d.html">A</a>
|
23
|
+
</li>
|
24
|
+
</ul>
|
25
|
+
|
26
|
+
<div class="item">
|
27
|
+
<strong class="name">Name value</strong>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<title>Demo</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<a href="http://jquery.com/">jQuery</a>
|
9
|
+
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script>
|
10
|
+
<div class="parent-1">
|
11
|
+
<div class="parent-2">
|
12
|
+
<ul class="menu">
|
13
|
+
<li class="item-1">Item 1</li>
|
14
|
+
<li class="item-2">Item 2</li>
|
15
|
+
<li class="item-3">Item 3</li>
|
16
|
+
<li class="item-4">
|
17
|
+
<ul class="menu-4">
|
18
|
+
<li class="item-4-1">Item 4 1</li>
|
19
|
+
</ul>
|
20
|
+
</li>
|
21
|
+
</ul>
|
22
|
+
<div id="parent-3">
|
23
|
+
<p class="a">a</p>
|
24
|
+
<p class="a">a</p>
|
25
|
+
<p class="a">a</p>
|
26
|
+
<p class="a">a</p>
|
27
|
+
<p class="b">b</p>
|
28
|
+
<p class="b">b</p>
|
29
|
+
</div>
|
30
|
+
</div>
|
31
|
+
</div>
|
32
|
+
</body>
|
33
|
+
</html>
|
34
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
</head>
|
6
|
+
|
7
|
+
<body>
|
8
|
+
<h1>Hello world</h1>
|
9
|
+
<ul class="menu">
|
10
|
+
<li>
|
11
|
+
<a href="skyscraper-node-base-a.html">A</a>
|
12
|
+
<a href="b.html">A</a>
|
13
|
+
<a href="c.html">A</a>
|
14
|
+
<a href="d.html">A</a>
|
15
|
+
</li>
|
16
|
+
</ul>
|
17
|
+
<ul class="menu-full">
|
18
|
+
<li>
|
19
|
+
<a href="http://google.com/a.html">A</a>
|
20
|
+
<a href="http://google.com/b.html">A</a>
|
21
|
+
<a href="http://google.com/c.html">A</a>
|
22
|
+
<a href="http://google.com/d.html">A</a>
|
23
|
+
</li>
|
24
|
+
</ul>
|
25
|
+
|
26
|
+
<div class="item">
|
27
|
+
<strong class="name">Name value</strong>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
Binary file
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
</head>
|
6
|
+
|
7
|
+
<body>
|
8
|
+
<h1>Hello world</h1>
|
9
|
+
<ul class="menu">
|
10
|
+
<li>
|
11
|
+
<a href="a.html">A</a>
|
12
|
+
<a href="b.html">A</a>
|
13
|
+
<a href="c.html">A</a>
|
14
|
+
<a href="d.html">A</a>
|
15
|
+
</li>
|
16
|
+
</ul>
|
17
|
+
<ul class="menu-full">
|
18
|
+
<li>
|
19
|
+
<a href="http://google.com/a.html">A</a>
|
20
|
+
<a href="http://google.com/b.html">A</a>
|
21
|
+
<a href="http://google.com/c.html">A</a>
|
22
|
+
<a href="http://google.com/d.html">A</a>
|
23
|
+
</li>
|
24
|
+
</ul>
|
25
|
+
|
26
|
+
<div class="item">
|
27
|
+
<strong class="name">Name value</strong>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<!DOCTYPE HTML>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Title</title>
|
5
|
+
</head>
|
6
|
+
|
7
|
+
<body>
|
8
|
+
<h1>Hello world</h1>
|
9
|
+
<ul class="menu">
|
10
|
+
<li>
|
11
|
+
<a href="a.html">A</a>
|
12
|
+
<a href="b.html">A</a>
|
13
|
+
<a href="c.html">A</a>
|
14
|
+
<a href="d.html">A</a>
|
15
|
+
</li>
|
16
|
+
</ul>
|
17
|
+
<ul class="menu-full">
|
18
|
+
<li>
|
19
|
+
<a href="http://google.com/a.html">A</a>
|
20
|
+
<a href="http://google.com/b.html">A</a>
|
21
|
+
<a href="http://google.com/c.html">A</a>
|
22
|
+
<a href="http://google.com/d.html">A</a>
|
23
|
+
</li>
|
24
|
+
</ul>
|
25
|
+
|
26
|
+
<div class="item">
|
27
|
+
<strong class="name">Name value</strong>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
metadata
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: skyscraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Adam Dratwinski
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-05-17 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &71707250 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *71707250
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rake
|
27
|
+
requirement: &71707040 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *71707040
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: nokogiri
|
38
|
+
requirement: &71706750 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *71706750
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: actionpack
|
49
|
+
requirement: &71706490 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *71706490
|
58
|
+
description: Library that helps scraping data from websites in easy way
|
59
|
+
email:
|
60
|
+
- arboooz@gmail.com
|
61
|
+
executables: []
|
62
|
+
extensions: []
|
63
|
+
extra_rdoc_files: []
|
64
|
+
files:
|
65
|
+
- .gitignore
|
66
|
+
- .rspec
|
67
|
+
- Gemfile
|
68
|
+
- LICENSE
|
69
|
+
- README.md
|
70
|
+
- Rakefile
|
71
|
+
- lib/skyscraper.rb
|
72
|
+
- lib/skyscraper/base.rb
|
73
|
+
- lib/skyscraper/config.rb
|
74
|
+
- lib/skyscraper/document.rb
|
75
|
+
- lib/skyscraper/field.rb
|
76
|
+
- lib/skyscraper/node.rb
|
77
|
+
- lib/skyscraper/node/base.rb
|
78
|
+
- lib/skyscraper/node/resource.rb
|
79
|
+
- lib/skyscraper/pages.rb
|
80
|
+
- lib/skyscraper/path.rb
|
81
|
+
- lib/skyscraper/path/base.rb
|
82
|
+
- lib/skyscraper/path/local.rb
|
83
|
+
- lib/skyscraper/path/remote.rb
|
84
|
+
- lib/skyscraper/results.rb
|
85
|
+
- lib/version.rb
|
86
|
+
- skyscraper.gemspec
|
87
|
+
- spec/skyscraper/skyscraper/base_spec.rb
|
88
|
+
- spec/skyscraper/skyscraper/config_spec.rb
|
89
|
+
- spec/skyscraper/skyscraper/document_spec.rb
|
90
|
+
- spec/skyscraper/skyscraper/field_spec.rb
|
91
|
+
- spec/skyscraper/skyscraper/node/base_spec.rb
|
92
|
+
- spec/skyscraper/skyscraper/node/resource_spec.rb
|
93
|
+
- spec/skyscraper/skyscraper/node_spec.rb
|
94
|
+
- spec/skyscraper/skyscraper/pages_spec.rb
|
95
|
+
- spec/skyscraper/skyscraper/path_spec.rb
|
96
|
+
- spec/skyscraper/skyscraper/results_spec.rb
|
97
|
+
- spec/skyscraper/skyscraper_spec.rb
|
98
|
+
- spec/spec_helper.rb
|
99
|
+
- spec/support/skyscraper_helpers.rb
|
100
|
+
- spec/test_files/encoding.html~
|
101
|
+
- spec/test_files/skyscraper-base.html
|
102
|
+
- spec/test_files/skyscraper-document.html
|
103
|
+
- spec/test_files/skyscraper-encoding.html
|
104
|
+
- spec/test_files/skyscraper-fetch-2.html
|
105
|
+
- spec/test_files/skyscraper-fetch.html
|
106
|
+
- spec/test_files/skyscraper-field.html
|
107
|
+
- spec/test_files/skyscraper-node-base-a.html
|
108
|
+
- spec/test_files/skyscraper-node-base-b.html
|
109
|
+
- spec/test_files/skyscraper-node-base-traversing.html
|
110
|
+
- spec/test_files/skyscraper-node-base.html
|
111
|
+
- spec/test_files/skyscraper-node-resource-b.html
|
112
|
+
- spec/test_files/skyscraper-node-resource-image.png
|
113
|
+
- spec/test_files/skyscraper-node-resource.html
|
114
|
+
- spec/test_files/skyscraper-pages.html
|
115
|
+
- spec/test_files/skyscraper.html
|
116
|
+
homepage: https://github.com/boooz/skyscraper
|
117
|
+
licenses: []
|
118
|
+
post_install_message:
|
119
|
+
rdoc_options: []
|
120
|
+
require_paths:
|
121
|
+
- lib
|
122
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
123
|
+
none: false
|
124
|
+
requirements:
|
125
|
+
- - ! '>='
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
requirements: []
|
135
|
+
rubyforge_project:
|
136
|
+
rubygems_version: 1.8.15
|
137
|
+
signing_key:
|
138
|
+
specification_version: 3
|
139
|
+
summary: Library that helps scraping data from websites in easy way
|
140
|
+
test_files:
|
141
|
+
- spec/skyscraper/skyscraper/base_spec.rb
|
142
|
+
- spec/skyscraper/skyscraper/config_spec.rb
|
143
|
+
- spec/skyscraper/skyscraper/document_spec.rb
|
144
|
+
- spec/skyscraper/skyscraper/field_spec.rb
|
145
|
+
- spec/skyscraper/skyscraper/node/base_spec.rb
|
146
|
+
- spec/skyscraper/skyscraper/node/resource_spec.rb
|
147
|
+
- spec/skyscraper/skyscraper/node_spec.rb
|
148
|
+
- spec/skyscraper/skyscraper/pages_spec.rb
|
149
|
+
- spec/skyscraper/skyscraper/path_spec.rb
|
150
|
+
- spec/skyscraper/skyscraper/results_spec.rb
|
151
|
+
- spec/skyscraper/skyscraper_spec.rb
|
152
|
+
- spec/spec_helper.rb
|
153
|
+
- spec/support/skyscraper_helpers.rb
|
154
|
+
- spec/test_files/encoding.html~
|
155
|
+
- spec/test_files/skyscraper-base.html
|
156
|
+
- spec/test_files/skyscraper-document.html
|
157
|
+
- spec/test_files/skyscraper-encoding.html
|
158
|
+
- spec/test_files/skyscraper-fetch-2.html
|
159
|
+
- spec/test_files/skyscraper-fetch.html
|
160
|
+
- spec/test_files/skyscraper-field.html
|
161
|
+
- spec/test_files/skyscraper-node-base-a.html
|
162
|
+
- spec/test_files/skyscraper-node-base-b.html
|
163
|
+
- spec/test_files/skyscraper-node-base-traversing.html
|
164
|
+
- spec/test_files/skyscraper-node-base.html
|
165
|
+
- spec/test_files/skyscraper-node-resource-b.html
|
166
|
+
- spec/test_files/skyscraper-node-resource-image.png
|
167
|
+
- spec/test_files/skyscraper-node-resource.html
|
168
|
+
- spec/test_files/skyscraper-pages.html
|
169
|
+
- spec/test_files/skyscraper.html
|