skyscraper 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/skyscraper.rb +2 -2
- data/lib/skyscraper/node.rb +97 -4
- data/lib/skyscraper/resource.rb +55 -0
- data/lib/skyscraper/version.rb +1 -1
- data/skyscraper.gemspec +2 -2
- data/spec/skyscraper/skyscraper/node_spec.rb +85 -0
- data/spec/skyscraper/skyscraper/{node/resource_spec.rb → resource_spec.rb} +17 -17
- data/spec/skyscraper/skyscraper/results_spec.rb +1 -1
- data/spec/skyscraper/skyscraper_spec.rb +1 -1
- data/spec/test_files/{skyscraper-node-base-a.html → skyscraper-node-a.html} +1 -1
- data/spec/test_files/{skyscraper-node-base-b.html → skyscraper-node-b.html} +0 -0
- data/spec/test_files/{skyscraper-node-base-traversing.html → skyscraper-node-traversing.html} +0 -0
- data/spec/test_files/{skyscraper-node-base.html → skyscraper-node.html} +1 -1
- data/spec/test_files/{skyscraper-node-resource-b.html → skyscraper-resource-b.html} +0 -0
- data/spec/test_files/{skyscraper-node-resource-image.png → skyscraper-resource-image.png} +0 -0
- data/spec/test_files/{skyscraper-node-resource.html → skyscraper-resource.html} +2 -2
- metadata +34 -36
- data/lib/skyscraper/node/base.rb +0 -103
- data/lib/skyscraper/node/resource.rb +0 -57
- data/spec/skyscraper/skyscraper/node/base_spec.rb +0 -87
data/lib/skyscraper.rb
CHANGED
@@ -16,6 +16,7 @@ module Skyscraper
|
|
16
16
|
autoload :Node
|
17
17
|
autoload :Pages
|
18
18
|
autoload :Path
|
19
|
+
autoload :Resource
|
19
20
|
autoload :Results
|
20
21
|
|
21
22
|
mattr_accessor :defaults
|
@@ -24,7 +25,6 @@ module Skyscraper
|
|
24
25
|
limit: nil,
|
25
26
|
encoding: "utf-8",
|
26
27
|
download_path: "/tmp/skyscraper/:sequence/:file_name",
|
27
|
-
# reattempt_times: 1,
|
28
28
|
noise_errors: true,
|
29
29
|
skip_on_error: true
|
30
30
|
}
|
@@ -35,7 +35,7 @@ module Skyscraper
|
|
35
35
|
|
36
36
|
def self.fetch path, encoding = Skyscraper.config.encoding
|
37
37
|
document = Skyscraper::Document.load path, encoding
|
38
|
-
Node::Base.new document.css("html")
|
38
|
+
Node.new document.css("html")
|
39
39
|
end
|
40
40
|
|
41
41
|
def fetch
|
data/lib/skyscraper/node.rb
CHANGED
@@ -1,8 +1,101 @@
|
|
1
1
|
module Skyscraper
|
2
|
-
|
3
|
-
|
2
|
+
class Node
|
3
|
+
alias :original_class :class
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
attr_accessor :element
|
6
|
+
|
7
|
+
def initialize element
|
8
|
+
@element = element
|
9
|
+
end
|
10
|
+
|
11
|
+
def first selector
|
12
|
+
self.find(selector).first
|
13
|
+
end
|
14
|
+
|
15
|
+
def find selector
|
16
|
+
@element.css(selector).map do |element|
|
17
|
+
Node.new(element)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def children selector = nil
|
22
|
+
if selector
|
23
|
+
children = @element.css(selector)
|
24
|
+
else
|
25
|
+
children = @element.children
|
26
|
+
end
|
27
|
+
|
28
|
+
children.select do |element|
|
29
|
+
element.parent == @element and element.is_a?(Nokogiri::XML::Element)
|
30
|
+
end.map do |child|
|
31
|
+
Node.new(child)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def parent
|
36
|
+
if @element.parent.is_a? Nokogiri::XML::Element
|
37
|
+
Node.new @element.parent
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def have_parent?
|
42
|
+
self.parent.present?
|
43
|
+
end
|
44
|
+
|
45
|
+
def parents selector = nil
|
46
|
+
node = self
|
47
|
+
parents = []
|
48
|
+
|
49
|
+
while node.have_parent?
|
50
|
+
node = node.parent
|
51
|
+
parents << node
|
52
|
+
end
|
53
|
+
|
54
|
+
parents.select! do |item|
|
55
|
+
item.element.matches? selector
|
56
|
+
end if selector
|
57
|
+
|
58
|
+
parents
|
59
|
+
end
|
60
|
+
|
61
|
+
def siblings
|
62
|
+
self.parent.children.select do |node|
|
63
|
+
node.element != self.element
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def follow
|
68
|
+
if self.href
|
69
|
+
Skyscraper::fetch(self.uri)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
def html
|
74
|
+
@element.children.to_html
|
75
|
+
end
|
76
|
+
|
77
|
+
def class
|
78
|
+
@element.attribute("class").to_s
|
79
|
+
end
|
80
|
+
|
81
|
+
def download options = {}
|
82
|
+
Resource.new(self).download(options)
|
83
|
+
end
|
84
|
+
|
85
|
+
def uri
|
86
|
+
@element.document.path.full_path_for(self.href)
|
87
|
+
end
|
88
|
+
|
89
|
+
def method_missing name
|
90
|
+
@element.attribute(name.to_s).to_s
|
91
|
+
end
|
92
|
+
|
93
|
+
def text
|
94
|
+
@element.content.to_s.strip
|
95
|
+
end
|
96
|
+
|
97
|
+
def tag
|
98
|
+
@element.name
|
99
|
+
end
|
7
100
|
end
|
8
101
|
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Skyscraper
|
2
|
+
class Resource
|
3
|
+
def initialize node
|
4
|
+
@node = node
|
5
|
+
@path = extract_path_from_node(@node)
|
6
|
+
end
|
7
|
+
|
8
|
+
def download options = {}
|
9
|
+
name = options[:file_name] || @path.file_name
|
10
|
+
new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path, name)
|
11
|
+
temp_file = open(@path.full_path)
|
12
|
+
|
13
|
+
copy temp_file.path, new_file_path
|
14
|
+
new_file_path
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def copy from, to
|
20
|
+
create_path_if_not_exists to
|
21
|
+
`cp #{from} #{to}`
|
22
|
+
end
|
23
|
+
|
24
|
+
def create_path_if_not_exists path
|
25
|
+
`mkdir -p #{path}` unless File.directory?(path)
|
26
|
+
end
|
27
|
+
|
28
|
+
def replace_path_variables path, name
|
29
|
+
new_path = path.dup
|
30
|
+
new_path.gsub! /:file_name/, name
|
31
|
+
new_path.gsub! /:sequence/, get_sequence_number_for(new_path)
|
32
|
+
new_path
|
33
|
+
end
|
34
|
+
|
35
|
+
def get_sequence_number_for path
|
36
|
+
new_path = path.split(":sequence")[0]
|
37
|
+
if File.directory?(new_path)
|
38
|
+
entries = Dir.entries(new_path).select { |i| i =~ /^\d+$/ } || []
|
39
|
+
last = entries.sort.last.to_i
|
40
|
+
last += 1
|
41
|
+
last.to_s
|
42
|
+
else
|
43
|
+
"1"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def extract_path_from_node node
|
48
|
+
if href_or_src = node.href.present? ? node.href : node.src
|
49
|
+
node.element.document.path.path_for(href_or_src)
|
50
|
+
else
|
51
|
+
throw Exception.new("no href no src")
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/skyscraper/version.rb
CHANGED
data/skyscraper.gemspec
CHANGED
@@ -4,8 +4,8 @@ require File.expand_path('../lib/skyscraper/version', __FILE__)
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ["Adam Dratwinski"]
|
6
6
|
gem.email = ["arboooz@gmail.com"]
|
7
|
-
gem.summary = %q{
|
8
|
-
gem.description = %q{
|
7
|
+
gem.summary = %q{Easy to use DSL that helps scraping data from websites}
|
8
|
+
gem.description = %q{Easy to use DSL that helps scraping data from websites. Thanks to it, writing web crawlers would be very fast and intuitive. Traversing through html nodes and fetching all of the HTML attributes, would be possible. Just like in jQuery - you will find methods like parent, children, first, find, siblings etc. Furthermore, you are able to download images, web pages, and store all content in the database. Please visit my Github account for more details.}
|
9
9
|
gem.homepage = "https://github.com/boooz/skyscraper"
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split($\)
|
@@ -1,2 +1,87 @@
|
|
1
1
|
describe Skyscraper::Node do
|
2
|
+
describe "when is initialized" do
|
3
|
+
before(:each) do
|
4
|
+
@node = Skyscraper::fetch(path_to("skyscraper-node.html")).first("div.item")
|
5
|
+
end
|
6
|
+
|
7
|
+
it "should returns html code" do
|
8
|
+
@node.html.should include "<strong class=\"name\">Name value</strong>"
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should returns class name" do
|
12
|
+
@node.class.should == "item"
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should be auto converted to string with stripped tags" do
|
16
|
+
@node.text.should == "Name value"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should follow links" do
|
21
|
+
Skyscraper::fetch(path_to("skyscraper-node.html")).first("li a").follow.first("h1").text.should == "Hello from A"
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should deep follow links" do
|
25
|
+
Skyscraper::fetch(path_to("skyscraper-node.html")).first("li a").follow.first("a").follow.first("h1").text.should == "Hello from B"
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should download page" do
|
29
|
+
remove_test_directory
|
30
|
+
Skyscraper.config.download_path = "/tmp/skyscraper_test/nodes/:file_name"
|
31
|
+
file = Skyscraper::fetch(path_to("skyscraper-node.html")).first("li a").follow.first("a").download
|
32
|
+
File.exists?(file).should == true
|
33
|
+
end
|
34
|
+
describe "traversing" do
|
35
|
+
before(:each) do
|
36
|
+
@node = Skyscraper::fetch(path_to("skyscraper-node-traversing.html")).first(".menu")
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should find descendands items" do
|
40
|
+
result = @node.find("li")
|
41
|
+
result.length.should == 5
|
42
|
+
result.map(&:text).should include "Item 4 1"
|
43
|
+
end
|
44
|
+
|
45
|
+
it "should returns children of element with selector" do
|
46
|
+
node = Skyscraper::fetch(path_to("skyscraper-node-traversing.html")).first("#parent-3")
|
47
|
+
node.children(".a").length.should == 4
|
48
|
+
node.children(".b").length.should == 2
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should returns children of element without selector" do
|
52
|
+
result = @node.children
|
53
|
+
result.length.should == 4
|
54
|
+
result.map(&:to_s).should_not include "Item 4 1"
|
55
|
+
end
|
56
|
+
|
57
|
+
it "should returns first element" do
|
58
|
+
@node.first("li").class.should == "item-1"
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should returns parent of item" do
|
62
|
+
@node.parent.class.should == "parent-2"
|
63
|
+
end
|
64
|
+
|
65
|
+
it "should tells if element have parent" do
|
66
|
+
@node.have_parent?.should == true
|
67
|
+
@node.parents("html").first.have_parent?.should == false
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should returns parents of item" do
|
71
|
+
@node.parents.length.should == 4
|
72
|
+
end
|
73
|
+
|
74
|
+
it "should returns parents of item matched by selector" do
|
75
|
+
@node.parents("div").length.should == 2
|
76
|
+
end
|
77
|
+
|
78
|
+
it "should returns siblings of item" do
|
79
|
+
@node.first(".item-3").siblings.length.should == 3
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should returns node tag" do
|
83
|
+
@node.tag.should == "ul"
|
84
|
+
end
|
85
|
+
end
|
2
86
|
end
|
87
|
+
|
@@ -1,12 +1,12 @@
|
|
1
|
-
describe Skyscraper::Node::Resource do
|
1
|
+
describe Skyscraper::Resource do
|
2
2
|
def should_download_resource_to node, path, options = {}
|
3
|
-
resource = Skyscraper::Node::Resource.new(node)
|
3
|
+
resource = Skyscraper::Resource.new(node)
|
4
4
|
resource.download(options).should == path
|
5
5
|
end
|
6
6
|
|
7
7
|
before(:all) do
|
8
8
|
Skyscraper.config.download_path = "/tmp/skyscraper_test/:sequence/:file_name"
|
9
|
-
@node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("a")
|
9
|
+
@node = Skyscraper::fetch(path_to("skyscraper-resource.html")).first("a")
|
10
10
|
end
|
11
11
|
|
12
12
|
before(:each) do
|
@@ -15,29 +15,29 @@ describe Skyscraper::Node::Resource do
|
|
15
15
|
|
16
16
|
it "should create path if not exists when downloaded" do
|
17
17
|
File.directory?("/tmp/skyscraper_test/1").should == false
|
18
|
-
Skyscraper::Node::Resource.new(@node)
|
18
|
+
Skyscraper::Resource.new(@node)
|
19
19
|
File.directory?("/tmp/skyscraper_test/1").should == false
|
20
|
-
Skyscraper::Node::Resource.new(@node).download
|
20
|
+
Skyscraper::Resource.new(@node).download
|
21
21
|
File.directory?("/tmp/skyscraper_test/1").should == true
|
22
22
|
end
|
23
23
|
|
24
24
|
it "should not fail if path already exists" do
|
25
|
-
Skyscraper::Node::Resource.new(@node).download path: "/tmp/skyscraper_test/some_directory/:file_name"
|
25
|
+
Skyscraper::Resource.new(@node).download path: "/tmp/skyscraper_test/some_directory/:file_name"
|
26
26
|
File.directory?("/tmp/skyscraper_test/some_directory").should == true
|
27
|
-
Skyscraper::Node::Resource.new(@node).download
|
27
|
+
Skyscraper::Resource.new(@node).download
|
28
28
|
File.directory?("/tmp/skyscraper_test/some_directory").should == true
|
29
29
|
end
|
30
30
|
|
31
31
|
it "should have file name" do
|
32
|
-
resource = Skyscraper::Node::Resource.new(@node)
|
33
|
-
resource.download.should == "/tmp/skyscraper_test/1/skyscraper-node-resource-b.html"
|
32
|
+
resource = Skyscraper::Resource.new(@node)
|
33
|
+
resource.download.should == "/tmp/skyscraper_test/1/skyscraper-resource-b.html"
|
34
34
|
end
|
35
35
|
|
36
36
|
it "should create path with :sequence variable" do
|
37
37
|
download_to = "/tmp/skyscraper_test/sequences/:sequence/:file_name"
|
38
|
-
should_download_resource_to @node, "/tmp/skyscraper_test/sequences/1/skyscraper-node-resource-b.html", path: download_to
|
39
|
-
should_download_resource_to @node, "/tmp/skyscraper_test/sequences/2/skyscraper-node-resource-b.html", path: download_to
|
40
|
-
should_download_resource_to @node, "/tmp/skyscraper_test/sequences/3/skyscraper-node-resource-b.html", path: download_to
|
38
|
+
should_download_resource_to @node, "/tmp/skyscraper_test/sequences/1/skyscraper-resource-b.html", path: download_to
|
39
|
+
should_download_resource_to @node, "/tmp/skyscraper_test/sequences/2/skyscraper-resource-b.html", path: download_to
|
40
|
+
should_download_resource_to @node, "/tmp/skyscraper_test/sequences/3/skyscraper-resource-b.html", path: download_to
|
41
41
|
end
|
42
42
|
|
43
43
|
it "should create custom file name if provided" do
|
@@ -46,13 +46,13 @@ describe Skyscraper::Node::Resource do
|
|
46
46
|
end
|
47
47
|
|
48
48
|
it "should download resource" do
|
49
|
-
Skyscraper::Node::Resource.new(@node).download
|
50
|
-
File.exists?("/tmp/skyscraper_test/1/skyscraper-node-resource-b.html").should == true
|
49
|
+
Skyscraper::Resource.new(@node).download
|
50
|
+
File.exists?("/tmp/skyscraper_test/1/skyscraper-resource-b.html").should == true
|
51
51
|
end
|
52
52
|
|
53
53
|
it "should download image" do
|
54
|
-
image_node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("img")
|
55
|
-
Skyscraper::Node::Resource.new(image_node).download
|
56
|
-
File.exists?("/tmp/skyscraper_test/1/skyscraper-node-resource-image.png").should == true
|
54
|
+
image_node = Skyscraper::fetch(path_to("skyscraper-resource.html")).first("img")
|
55
|
+
Skyscraper::Resource.new(image_node).download
|
56
|
+
File.exists?("/tmp/skyscraper_test/1/skyscraper-resource-image.png").should == true
|
57
57
|
end
|
58
58
|
end
|
@@ -69,7 +69,7 @@ describe Skyscraper::Results do
|
|
69
69
|
@call_count = 0
|
70
70
|
callback = proc do |result, page|
|
71
71
|
result.should be_an_instance_of(Hash)
|
72
|
-
page.should be_an_instance_of(Skyscraper::Node::Base)
|
72
|
+
page.should be_an_instance_of(Skyscraper::Node)
|
73
73
|
@call_count += 1
|
74
74
|
end
|
75
75
|
|
@@ -16,7 +16,7 @@ describe Skyscraper do
|
|
16
16
|
end
|
17
17
|
|
18
18
|
it "should fetch remote page" do
|
19
|
-
Skyscraper::fetch("http://google.com").should be_an Skyscraper::Node::Base
|
19
|
+
Skyscraper::fetch("http://google.com").should be_an Skyscraper::Node
|
20
20
|
end
|
21
21
|
|
22
22
|
it "static method fetch should works" do
|
File without changes
|
data/spec/test_files/{skyscraper-node-base-traversing.html → skyscraper-node-traversing.html}
RENAMED
File without changes
|
File without changes
|
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: skyscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &77062930 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *77062930
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake
|
27
|
-
requirement: &
|
27
|
+
requirement: &77124830 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *77124830
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &77122840 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *77122840
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: actionpack
|
49
|
-
requirement: &
|
49
|
+
requirement: &77119590 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,12 +54,13 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
58
|
-
description:
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
57
|
+
version_requirements: *77119590
|
58
|
+
description: Easy to use DSL that helps scraping data from websites. Thanks to it,
|
59
|
+
writing web crawlers would be very fast and intuitive. Traversing through html nodes
|
60
|
+
and fetching all of the HTML attributes, would be possible. Just like in jQuery
|
61
|
+
- you will find methods like parent, children, first, find, siblings etc. Furthermore,
|
62
|
+
you are able to download images, web pages, and store all content in the database.
|
63
|
+
Please visit my Github account for more details.
|
63
64
|
email:
|
64
65
|
- arboooz@gmail.com
|
65
66
|
executables: []
|
@@ -78,13 +79,12 @@ files:
|
|
78
79
|
- lib/skyscraper/document.rb
|
79
80
|
- lib/skyscraper/field.rb
|
80
81
|
- lib/skyscraper/node.rb
|
81
|
-
- lib/skyscraper/node/base.rb
|
82
|
-
- lib/skyscraper/node/resource.rb
|
83
82
|
- lib/skyscraper/pages.rb
|
84
83
|
- lib/skyscraper/path.rb
|
85
84
|
- lib/skyscraper/path/base.rb
|
86
85
|
- lib/skyscraper/path/local.rb
|
87
86
|
- lib/skyscraper/path/remote.rb
|
87
|
+
- lib/skyscraper/resource.rb
|
88
88
|
- lib/skyscraper/results.rb
|
89
89
|
- lib/skyscraper/version.rb
|
90
90
|
- skyscraper.gemspec
|
@@ -92,11 +92,10 @@ files:
|
|
92
92
|
- spec/skyscraper/skyscraper/config_spec.rb
|
93
93
|
- spec/skyscraper/skyscraper/document_spec.rb
|
94
94
|
- spec/skyscraper/skyscraper/field_spec.rb
|
95
|
-
- spec/skyscraper/skyscraper/node/base_spec.rb
|
96
|
-
- spec/skyscraper/skyscraper/node/resource_spec.rb
|
97
95
|
- spec/skyscraper/skyscraper/node_spec.rb
|
98
96
|
- spec/skyscraper/skyscraper/pages_spec.rb
|
99
97
|
- spec/skyscraper/skyscraper/path_spec.rb
|
98
|
+
- spec/skyscraper/skyscraper/resource_spec.rb
|
100
99
|
- spec/skyscraper/skyscraper/results_spec.rb
|
101
100
|
- spec/skyscraper/skyscraper_spec.rb
|
102
101
|
- spec/spec_helper.rb
|
@@ -108,14 +107,14 @@ files:
|
|
108
107
|
- spec/test_files/skyscraper-fetch-2.html
|
109
108
|
- spec/test_files/skyscraper-fetch.html
|
110
109
|
- spec/test_files/skyscraper-field.html
|
111
|
-
- spec/test_files/skyscraper-node-base-a.html
|
112
|
-
- spec/test_files/skyscraper-node-base-b.html
|
113
|
-
- spec/test_files/skyscraper-node-base-traversing.html
|
114
|
-
- spec/test_files/skyscraper-node-base.html
|
115
|
-
- spec/test_files/skyscraper-node-resource-b.html
|
116
|
-
- spec/test_files/skyscraper-node-resource-image.png
|
117
|
-
- spec/test_files/skyscraper-node-resource.html
|
110
|
+
- spec/test_files/skyscraper-node-a.html
|
111
|
+
- spec/test_files/skyscraper-node-b.html
|
112
|
+
- spec/test_files/skyscraper-node-traversing.html
|
113
|
+
- spec/test_files/skyscraper-node.html
|
118
114
|
- spec/test_files/skyscraper-pages.html
|
115
|
+
- spec/test_files/skyscraper-resource-b.html
|
116
|
+
- spec/test_files/skyscraper-resource-image.png
|
117
|
+
- spec/test_files/skyscraper-resource.html
|
119
118
|
- spec/test_files/skyscraper.html
|
120
119
|
homepage: https://github.com/boooz/skyscraper
|
121
120
|
licenses: []
|
@@ -140,17 +139,16 @@ rubyforge_project:
|
|
140
139
|
rubygems_version: 1.8.15
|
141
140
|
signing_key:
|
142
141
|
specification_version: 3
|
143
|
-
summary:
|
142
|
+
summary: Easy to use DSL that helps scraping data from websites
|
144
143
|
test_files:
|
145
144
|
- spec/skyscraper/skyscraper/base_spec.rb
|
146
145
|
- spec/skyscraper/skyscraper/config_spec.rb
|
147
146
|
- spec/skyscraper/skyscraper/document_spec.rb
|
148
147
|
- spec/skyscraper/skyscraper/field_spec.rb
|
149
|
-
- spec/skyscraper/skyscraper/node/base_spec.rb
|
150
|
-
- spec/skyscraper/skyscraper/node/resource_spec.rb
|
151
148
|
- spec/skyscraper/skyscraper/node_spec.rb
|
152
149
|
- spec/skyscraper/skyscraper/pages_spec.rb
|
153
150
|
- spec/skyscraper/skyscraper/path_spec.rb
|
151
|
+
- spec/skyscraper/skyscraper/resource_spec.rb
|
154
152
|
- spec/skyscraper/skyscraper/results_spec.rb
|
155
153
|
- spec/skyscraper/skyscraper_spec.rb
|
156
154
|
- spec/spec_helper.rb
|
@@ -162,12 +160,12 @@ test_files:
|
|
162
160
|
- spec/test_files/skyscraper-fetch-2.html
|
163
161
|
- spec/test_files/skyscraper-fetch.html
|
164
162
|
- spec/test_files/skyscraper-field.html
|
165
|
-
- spec/test_files/skyscraper-node-base-a.html
|
166
|
-
- spec/test_files/skyscraper-node-base-b.html
|
167
|
-
- spec/test_files/skyscraper-node-base-traversing.html
|
168
|
-
- spec/test_files/skyscraper-node-base.html
|
169
|
-
- spec/test_files/skyscraper-node-resource-b.html
|
170
|
-
- spec/test_files/skyscraper-node-resource-image.png
|
171
|
-
- spec/test_files/skyscraper-node-resource.html
|
163
|
+
- spec/test_files/skyscraper-node-a.html
|
164
|
+
- spec/test_files/skyscraper-node-b.html
|
165
|
+
- spec/test_files/skyscraper-node-traversing.html
|
166
|
+
- spec/test_files/skyscraper-node.html
|
172
167
|
- spec/test_files/skyscraper-pages.html
|
168
|
+
- spec/test_files/skyscraper-resource-b.html
|
169
|
+
- spec/test_files/skyscraper-resource-image.png
|
170
|
+
- spec/test_files/skyscraper-resource.html
|
173
171
|
- spec/test_files/skyscraper.html
|
data/lib/skyscraper/node/base.rb
DELETED
@@ -1,103 +0,0 @@
|
|
1
|
-
module Skyscraper
|
2
|
-
module Node
|
3
|
-
class Base
|
4
|
-
alias :original_class :class
|
5
|
-
|
6
|
-
attr_accessor :element
|
7
|
-
|
8
|
-
def initialize element
|
9
|
-
@element = element
|
10
|
-
end
|
11
|
-
|
12
|
-
def first selector
|
13
|
-
self.find(selector).first
|
14
|
-
end
|
15
|
-
|
16
|
-
def find selector
|
17
|
-
@element.css(selector).map do |element|
|
18
|
-
Base.new(element)
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def children selector = nil
|
23
|
-
if selector
|
24
|
-
children = @element.css(selector)
|
25
|
-
else
|
26
|
-
children = @element.children
|
27
|
-
end
|
28
|
-
|
29
|
-
children.select do |element|
|
30
|
-
element.parent == @element and element.is_a?(Nokogiri::XML::Element)
|
31
|
-
end.map do |child|
|
32
|
-
Base.new(child)
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
def parent
|
37
|
-
if @element.parent.is_a? Nokogiri::XML::Element
|
38
|
-
Base.new @element.parent
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def have_parent?
|
43
|
-
self.parent.present?
|
44
|
-
end
|
45
|
-
|
46
|
-
def parents selector = nil
|
47
|
-
node = self
|
48
|
-
parents = []
|
49
|
-
|
50
|
-
while node.have_parent?
|
51
|
-
node = node.parent
|
52
|
-
parents << node
|
53
|
-
end
|
54
|
-
|
55
|
-
parents.select! do |item|
|
56
|
-
item.element.matches? selector
|
57
|
-
end if selector
|
58
|
-
|
59
|
-
parents
|
60
|
-
end
|
61
|
-
|
62
|
-
def siblings
|
63
|
-
self.parent.children.select do |node|
|
64
|
-
node.element != self.element
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
def follow
|
69
|
-
if self.href
|
70
|
-
Skyscraper::fetch(self.uri)
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def html
|
75
|
-
@element.children.to_html
|
76
|
-
end
|
77
|
-
|
78
|
-
def class
|
79
|
-
@element.attribute("class").to_s
|
80
|
-
end
|
81
|
-
|
82
|
-
def download options = {}
|
83
|
-
Resource.new(self).download(options)
|
84
|
-
end
|
85
|
-
|
86
|
-
def uri
|
87
|
-
@element.document.path.full_path_for(self.href)
|
88
|
-
end
|
89
|
-
|
90
|
-
def method_missing name
|
91
|
-
@element.attribute(name.to_s).to_s
|
92
|
-
end
|
93
|
-
|
94
|
-
def text
|
95
|
-
@element.content.to_s.strip
|
96
|
-
end
|
97
|
-
|
98
|
-
def tag
|
99
|
-
@element.name
|
100
|
-
end
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
@@ -1,57 +0,0 @@
|
|
1
|
-
module Skyscraper
|
2
|
-
module Node
|
3
|
-
class Resource
|
4
|
-
def initialize node
|
5
|
-
@node = node
|
6
|
-
@path = extract_path_from_node(@node)
|
7
|
-
end
|
8
|
-
|
9
|
-
def download options = {}
|
10
|
-
@name = options[:file_name] || @path.file_name
|
11
|
-
@new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path)
|
12
|
-
@temp_file = open(@path.full_path)
|
13
|
-
|
14
|
-
copy @temp_file.path, @new_file_path
|
15
|
-
@new_file_path
|
16
|
-
end
|
17
|
-
|
18
|
-
private
|
19
|
-
|
20
|
-
def copy from, to
|
21
|
-
create_path_if_not_exists to
|
22
|
-
`cp #{from} #{to}`
|
23
|
-
end
|
24
|
-
|
25
|
-
def create_path_if_not_exists path
|
26
|
-
`mkdir -p #{path}` unless File.directory?(path)
|
27
|
-
end
|
28
|
-
|
29
|
-
def replace_path_variables path
|
30
|
-
new_path = path.dup
|
31
|
-
new_path.gsub! /:file_name/, @name
|
32
|
-
new_path.gsub! /:sequence/, get_sequence_number_for(new_path)
|
33
|
-
new_path
|
34
|
-
end
|
35
|
-
|
36
|
-
def get_sequence_number_for path
|
37
|
-
new_path = path.split(":sequence")[0]
|
38
|
-
if File.directory?(new_path)
|
39
|
-
entries = Dir.entries(new_path).select { |i| i =~ /^\d+$/ } || []
|
40
|
-
last = entries.sort.last.to_i
|
41
|
-
last += 1
|
42
|
-
last.to_s
|
43
|
-
else
|
44
|
-
"1"
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def extract_path_from_node node
|
49
|
-
if href_or_src = node.href.present? ? node.href : node.src
|
50
|
-
node.element.document.path.path_for(href_or_src)
|
51
|
-
else
|
52
|
-
throw Exception.new("no href no src")
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
@@ -1,87 +0,0 @@
|
|
1
|
-
describe Skyscraper::Node::Base do
|
2
|
-
describe "when is initialized" do
|
3
|
-
before(:each) do
|
4
|
-
@node = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("div.item")
|
5
|
-
end
|
6
|
-
|
7
|
-
it "should returns html code" do
|
8
|
-
@node.html.should include "<strong class=\"name\">Name value</strong>"
|
9
|
-
end
|
10
|
-
|
11
|
-
it "should returns class name" do
|
12
|
-
@node.class.should == "item"
|
13
|
-
end
|
14
|
-
|
15
|
-
it "should be auto converted to string with stripped tags" do
|
16
|
-
@node.text.should == "Name value"
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
it "should follow links" do
|
21
|
-
Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("h1").text.should == "Hello from A"
|
22
|
-
end
|
23
|
-
|
24
|
-
it "should deep follow links" do
|
25
|
-
Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").follow.first("h1").text.should == "Hello from B"
|
26
|
-
end
|
27
|
-
|
28
|
-
it "should download page" do
|
29
|
-
remove_test_directory
|
30
|
-
Skyscraper.config.download_path = "/tmp/skyscraper_test/nodes/:file_name"
|
31
|
-
file = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").download
|
32
|
-
File.exists?(file).should == true
|
33
|
-
end
|
34
|
-
describe "traversing" do
|
35
|
-
before(:each) do
|
36
|
-
@node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first(".menu")
|
37
|
-
end
|
38
|
-
|
39
|
-
it "should find descendands items" do
|
40
|
-
result = @node.find("li")
|
41
|
-
result.length.should == 5
|
42
|
-
result.map(&:text).should include "Item 4 1"
|
43
|
-
end
|
44
|
-
|
45
|
-
it "should returns children of element with selector" do
|
46
|
-
node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first("#parent-3")
|
47
|
-
node.children(".a").length.should == 4
|
48
|
-
node.children(".b").length.should == 2
|
49
|
-
end
|
50
|
-
|
51
|
-
it "should returns children of element without selector" do
|
52
|
-
result = @node.children
|
53
|
-
result.length.should == 4
|
54
|
-
result.map(&:to_s).should_not include "Item 4 1"
|
55
|
-
end
|
56
|
-
|
57
|
-
it "should returns first element" do
|
58
|
-
@node.first("li").class.should == "item-1"
|
59
|
-
end
|
60
|
-
|
61
|
-
it "should returns parent of item" do
|
62
|
-
@node.parent.class.should == "parent-2"
|
63
|
-
end
|
64
|
-
|
65
|
-
it "should tells if element have parent" do
|
66
|
-
@node.have_parent?.should == true
|
67
|
-
@node.parents("html").first.have_parent?.should == false
|
68
|
-
end
|
69
|
-
|
70
|
-
it "should returns parents of item" do
|
71
|
-
@node.parents.length.should == 4
|
72
|
-
end
|
73
|
-
|
74
|
-
it "should returns parents of item matched by selector" do
|
75
|
-
@node.parents("div").length.should == 2
|
76
|
-
end
|
77
|
-
|
78
|
-
it "should returns siblings of item" do
|
79
|
-
@node.first(".item-3").siblings.length.should == 3
|
80
|
-
end
|
81
|
-
|
82
|
-
it "should returns node tag" do
|
83
|
-
@node.tag.should == "ul"
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|