skyscraper 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -16,6 +16,7 @@ module Skyscraper
16
16
  autoload :Node
17
17
  autoload :Pages
18
18
  autoload :Path
19
+ autoload :Resource
19
20
  autoload :Results
20
21
 
21
22
  mattr_accessor :defaults
@@ -24,7 +25,6 @@ module Skyscraper
24
25
  limit: nil,
25
26
  encoding: "utf-8",
26
27
  download_path: "/tmp/skyscraper/:sequence/:file_name",
27
- # reattempt_times: 1,
28
28
  noise_errors: true,
29
29
  skip_on_error: true
30
30
  }
@@ -35,7 +35,7 @@ module Skyscraper
35
35
 
# Loads the document found at +path+ (decoded with +encoding+, which defaults
# to the configured encoding) and wraps the result of its "html" CSS query
# in a Node.
def self.fetch path, encoding = Skyscraper.config.encoding
  html_root = Skyscraper::Document.load(path, encoding).css("html")
  Node.new html_root
end
40
40
 
41
41
  def fetch
@@ -1,8 +1,101 @@
module Skyscraper
  # Thin wrapper around a parsed HTML element that offers jQuery-like
  # traversal (find, children, parent, parents, siblings) and exposes every
  # HTML attribute as a reader method through +method_missing+.
  #
  # The wrapped object is expected to answer +css+, +children+, +parent+,
  # +attribute+, +content+, +name+ and +document+; element checks are done
  # against Nokogiri::XML::Element.
  class Node
    # #class is redefined below to return the HTML "class" attribute, so the
    # real Ruby class stays reachable under this alias.
    alias :original_class :class

    attr_accessor :element

    # element - the parsed element this node wraps.
    def initialize element
      @element = element
    end

    # Returns the first descendant matching the CSS +selector+, or nil.
    def first selector
      find(selector).first
    end

    # Returns every descendant matching the CSS +selector+ as Node objects.
    def find selector
      @element.css(selector).map { |element| Node.new(element) }
    end

    # Returns the direct element children as Node objects. When +selector+ is
    # given, only direct children matching it are returned.
    def children selector = nil
      candidates = selector ? @element.css(selector) : @element.children

      # css() also yields deeper descendants and children() also yields text
      # nodes, so keep only real elements whose parent is this element.
      candidates.select do |element|
        element.parent == @element && element.is_a?(Nokogiri::XML::Element)
      end.map { |child| Node.new(child) }
    end

    # Returns the parent wrapped in a Node, or nil when the parent is not an
    # element (e.g. for the root node).
    def parent
      Node.new(@element.parent) if @element.parent.is_a?(Nokogiri::XML::Element)
    end

    # True when this node has an element parent.
    def have_parent?
      !parent.nil?
    end

    # Returns all ancestors, nearest first. When +selector+ is given, only
    # ancestors whose element matches it are kept.
    def parents selector = nil
      ancestors = []
      node = self

      while (node = node.parent)
        ancestors << node
      end

      ancestors.select! { |item| item.element.matches? selector } if selector
      ancestors
    end

    # Returns the other element children of this node's parent. A node
    # without a parent has no siblings, so an empty array is returned instead
    # of failing on nil.
    def siblings
      container = parent
      return [] unless container

      container.children.reject { |node| node.element == element }
    end

    # Fetches the page this node links to and returns its root Node.
    # Returns nil when the node has no href: attribute readers always return
    # a String, so emptiness (not truthiness) is what has to be checked.
    def follow
      Skyscraper::fetch(uri) unless href.strip.empty?
    end

    # Returns the inner HTML of the element.
    def html
      @element.children.to_html
    end

    # Returns the HTML "class" attribute as a String. This shadows
    # Object#class; use #original_class for the Ruby class.
    def class
      @element.attribute("class").to_s
    end

    # Downloads the resource this node points to; see Resource#download for
    # the supported +options+. Returns whatever Resource#download returns.
    def download options = {}
      Resource.new(self).download(options)
    end

    # Resolves this node's href through the owning document's path object.
    def uri
      @element.document.path.full_path_for(href)
    end

    # Any unknown zero-argument message is treated as an HTML attribute
    # lookup and returns the attribute value as a String ("" when absent).
    # Calls that pass arguments cannot be attribute reads, so they are handed
    # to +super+ and surface as a regular NoMethodError.
    def method_missing name, *args
      return super unless args.empty?

      @element.attribute(name.to_s).to_s
    end

    # Returns the element's text content with surrounding whitespace removed.
    def text
      @element.content.to_s.strip
    end

    # Returns the element's tag name.
    def tag
      @element.name
    end
  end
end
@@ -0,0 +1,55 @@
require "fileutils"

module Skyscraper
  # Downloads the file a node points to (its href, or its src when there is
  # no href) into a path built from a template such as
  # "/tmp/skyscraper/:sequence/:file_name".
  class Resource
    # node - object answering +href+, +src+ and +element+ (whose document
    #        path resolves the reference through +path_for+).
    #
    # Raises ArgumentError when the node has neither an href nor a src.
    def initialize node
      @node = node
      @path = extract_path_from_node(@node)
    end

    # Copies the resource to its destination and returns the destination path.
    #
    # options - Hash of overrides:
    #           :file_name - name to store the file under (defaults to the
    #                        file name of the source path)
    #           :path      - destination template (defaults to
    #                        Skyscraper.config.download_path)
    def download options = {}
      name = options[:file_name] || @path.file_name
      new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path, name)

      # NOTE(review): Kernel#open is kept so that whatever the host app has
      # loaded (e.g. open-uri) still decides how the source is opened; confirm
      # remote paths on Ruby >= 3.0. The block form closes the handle for us.
      open(@path.full_path) do |source|
        copy source, new_file_path
      end

      new_file_path
    end

    private

    # Streams +source+ (any readable IO) into the file at +to+, creating the
    # destination directory first. No shell is involved, so paths containing
    # spaces or shell metacharacters are safe.
    def copy source, to
      create_path_if_not_exists to
      source.binmode if source.respond_to?(:binmode)
      # A destination that is a directory receives the file under its source name.
      target = File.directory?(to) ? File.join(to, File.basename(@path.full_path.to_s)) : to
      IO.copy_stream(source, target)
    end

    # Makes sure the directory that will contain +path+ exists. A +path+
    # ending in "/" is itself treated as the directory.
    def create_path_if_not_exists path
      directory = path.end_with?("/") ? path : File.dirname(path)
      FileUtils.mkdir_p(directory)
    end

    # Expands the :file_name and :sequence placeholders of +path+.
    # Block-form gsub is used so backslashes in +name+ are taken literally.
    def replace_path_variables path, name
      with_name = path.gsub(":file_name") { name }
      with_name.gsub(":sequence") { get_sequence_number_for(with_name) }
    end

    # Returns the next free sequence number (as a String) for the directory
    # that precedes ":sequence" in +path+: one more than the highest
    # all-digit entry, compared numerically, or "1" when there is none.
    def get_sequence_number_for path
      base = path.split(":sequence")[0].to_s
      return "1" unless File.directory?(base)

      highest = Dir.entries(base).grep(/\A\d+\z/).map(&:to_i).max
      (highest.to_i + 1).to_s
    end

    # Resolves the node's href (preferred) or src into a path object.
    def extract_path_from_node node
      reference = [node.href, node.src].map(&:to_s).find { |value| !value.strip.empty? }
      raise ArgumentError, "no href no src" unless reference

      node.element.document.path.path_for(reference)
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Skyscraper
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -4,8 +4,8 @@ require File.expand_path('../lib/skyscraper/version', __FILE__)
4
4
  Gem::Specification.new do |gem|
5
5
  gem.authors = ["Adam Dratwinski"]
6
6
  gem.email = ["arboooz@gmail.com"]
7
- gem.summary = %q{Library that helps scraping data from websites in easy way}
8
- gem.description = %q{Library that helps scraping data from websites in easy way. Skyscraper allows you to traversing through html nodes, similary to jquery, it provides methods like parent, children, first, find, siblings etc. Thanks to Skyscraper you can fetch all HTML attributes on any node. Furthermore it's allow to download images, webpages, and store content in the database. Please visit Github account for more details.}
7
+ gem.summary = %q{Easy to use DSL that helps scraping data from websites}
8
+ gem.description = %q{Easy to use DSL that helps scraping data from websites. Thanks to it, writing web crawlers would be very fast and intuitive. Traversing through html nodes and fetching all of the HTML attributes, would be possible. Just like in jQuery - you will find methods like parent, children, first, find, siblings etc. Furthermore, you are able to download images, web pages, and store all content in the database. Please visit my Github account for more details.}
9
9
  gem.homepage = "https://github.com/boooz/skyscraper"
10
10
 
11
11
  gem.files = `git ls-files`.split($\)
@@ -1,2 +1,87 @@
1
1
  describe Skyscraper::Node do
2
+ describe "when is initialized" do
3
+ before(:each) do
4
+ @node = Skyscraper::fetch(path_to("skyscraper-node.html")).first("div.item")
5
+ end
6
+
7
+ it "should returns html code" do
8
+ @node.html.should include "<strong class=\"name\">Name value</strong>"
9
+ end
10
+
11
+ it "should returns class name" do
12
+ @node.class.should == "item"
13
+ end
14
+
15
+ it "should be auto converted to string with stripped tags" do
16
+ @node.text.should == "Name value"
17
+ end
18
+ end
19
+
20
+ it "should follow links" do
21
+ Skyscraper::fetch(path_to("skyscraper-node.html")).first("li a").follow.first("h1").text.should == "Hello from A"
22
+ end
23
+
24
+ it "should deep follow links" do
25
+ Skyscraper::fetch(path_to("skyscraper-node.html")).first("li a").follow.first("a").follow.first("h1").text.should == "Hello from B"
26
+ end
27
+
28
+ it "should download page" do
29
+ remove_test_directory
30
+ Skyscraper.config.download_path = "/tmp/skyscraper_test/nodes/:file_name"
31
+ file = Skyscraper::fetch(path_to("skyscraper-node.html")).first("li a").follow.first("a").download
32
+ File.exists?(file).should == true
33
+ end
34
+ describe "traversing" do
35
+ before(:each) do
36
+ @node = Skyscraper::fetch(path_to("skyscraper-node-traversing.html")).first(".menu")
37
+ end
38
+
39
+ it "should find descendands items" do
40
+ result = @node.find("li")
41
+ result.length.should == 5
42
+ result.map(&:text).should include "Item 4 1"
43
+ end
44
+
45
+ it "should returns children of element with selector" do
46
+ node = Skyscraper::fetch(path_to("skyscraper-node-traversing.html")).first("#parent-3")
47
+ node.children(".a").length.should == 4
48
+ node.children(".b").length.should == 2
49
+ end
50
+
51
+ it "should returns children of element without selector" do
52
+ result = @node.children
53
+ result.length.should == 4
54
+ result.map(&:to_s).should_not include "Item 4 1"
55
+ end
56
+
57
+ it "should returns first element" do
58
+ @node.first("li").class.should == "item-1"
59
+ end
60
+
61
+ it "should returns parent of item" do
62
+ @node.parent.class.should == "parent-2"
63
+ end
64
+
65
+ it "should tells if element have parent" do
66
+ @node.have_parent?.should == true
67
+ @node.parents("html").first.have_parent?.should == false
68
+ end
69
+
70
+ it "should returns parents of item" do
71
+ @node.parents.length.should == 4
72
+ end
73
+
74
+ it "should returns parents of item matched by selector" do
75
+ @node.parents("div").length.should == 2
76
+ end
77
+
78
+ it "should returns siblings of item" do
79
+ @node.first(".item-3").siblings.length.should == 3
80
+ end
81
+
82
+ it "should returns node tag" do
83
+ @node.tag.should == "ul"
84
+ end
85
+ end
2
86
  end
87
+
@@ -1,12 +1,12 @@
1
- describe Skyscraper::Node::Resource do
1
+ describe Skyscraper::Resource do
2
2
  def should_download_resource_to node, path, options = {}
3
- resource = Skyscraper::Node::Resource.new(node)
3
+ resource = Skyscraper::Resource.new(node)
4
4
  resource.download(options).should == path
5
5
  end
6
6
 
7
7
  before(:all) do
8
8
  Skyscraper.config.download_path = "/tmp/skyscraper_test/:sequence/:file_name"
9
- @node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("a")
9
+ @node = Skyscraper::fetch(path_to("skyscraper-resource.html")).first("a")
10
10
  end
11
11
 
12
12
  before(:each) do
@@ -15,29 +15,29 @@ describe Skyscraper::Node::Resource do
15
15
 
16
16
  it "should create path if not exists when downloaded" do
17
17
  File.directory?("/tmp/skyscraper_test/1").should == false
18
- Skyscraper::Node::Resource.new(@node)
18
+ Skyscraper::Resource.new(@node)
19
19
  File.directory?("/tmp/skyscraper_test/1").should == false
20
- Skyscraper::Node::Resource.new(@node).download
20
+ Skyscraper::Resource.new(@node).download
21
21
  File.directory?("/tmp/skyscraper_test/1").should == true
22
22
  end
23
23
 
24
24
  it "should not fail if path already exists" do
25
- Skyscraper::Node::Resource.new(@node).download path: "/tmp/skyscraper_test/some_directory/:file_name"
25
+ Skyscraper::Resource.new(@node).download path: "/tmp/skyscraper_test/some_directory/:file_name"
26
26
  File.directory?("/tmp/skyscraper_test/some_directory").should == true
27
- Skyscraper::Node::Resource.new(@node).download
27
+ Skyscraper::Resource.new(@node).download
28
28
  File.directory?("/tmp/skyscraper_test/some_directory").should == true
29
29
  end
30
30
 
31
31
  it "should have file name" do
32
- resource = Skyscraper::Node::Resource.new(@node)
33
- resource.download.should == "/tmp/skyscraper_test/1/skyscraper-node-resource-b.html"
32
+ resource = Skyscraper::Resource.new(@node)
33
+ resource.download.should == "/tmp/skyscraper_test/1/skyscraper-resource-b.html"
34
34
  end
35
35
 
36
36
  it "should create path with :sequence variable" do
37
37
  download_to = "/tmp/skyscraper_test/sequences/:sequence/:file_name"
38
- should_download_resource_to @node, "/tmp/skyscraper_test/sequences/1/skyscraper-node-resource-b.html", path: download_to
39
- should_download_resource_to @node, "/tmp/skyscraper_test/sequences/2/skyscraper-node-resource-b.html", path: download_to
40
- should_download_resource_to @node, "/tmp/skyscraper_test/sequences/3/skyscraper-node-resource-b.html", path: download_to
38
+ should_download_resource_to @node, "/tmp/skyscraper_test/sequences/1/skyscraper-resource-b.html", path: download_to
39
+ should_download_resource_to @node, "/tmp/skyscraper_test/sequences/2/skyscraper-resource-b.html", path: download_to
40
+ should_download_resource_to @node, "/tmp/skyscraper_test/sequences/3/skyscraper-resource-b.html", path: download_to
41
41
  end
42
42
 
43
43
  it "should create custom file name if provided" do
@@ -46,13 +46,13 @@ describe Skyscraper::Node::Resource do
46
46
  end
47
47
 
48
48
  it "should download resource" do
49
- Skyscraper::Node::Resource.new(@node).download
50
- File.exists?("/tmp/skyscraper_test/1/skyscraper-node-resource-b.html").should == true
49
+ Skyscraper::Resource.new(@node).download
50
+ File.exists?("/tmp/skyscraper_test/1/skyscraper-resource-b.html").should == true
51
51
  end
52
52
 
53
53
  it "should download image" do
54
- image_node = Skyscraper::fetch(path_to("skyscraper-node-resource.html")).first("img")
55
- Skyscraper::Node::Resource.new(image_node).download
56
- File.exists?("/tmp/skyscraper_test/1/skyscraper-node-resource-image.png").should == true
54
+ image_node = Skyscraper::fetch(path_to("skyscraper-resource.html")).first("img")
55
+ Skyscraper::Resource.new(image_node).download
56
+ File.exists?("/tmp/skyscraper_test/1/skyscraper-resource-image.png").should == true
57
57
  end
58
58
  end
@@ -69,7 +69,7 @@ describe Skyscraper::Results do
69
69
  @call_count = 0
70
70
  callback = proc do |result, page|
71
71
  result.should be_an_instance_of(Hash)
72
- page.should be_an_instance_of(Skyscraper::Node::Base)
72
+ page.should be_an_instance_of(Skyscraper::Node)
73
73
  @call_count += 1
74
74
  end
75
75
 
@@ -16,7 +16,7 @@ describe Skyscraper do
16
16
  end
17
17
 
18
18
  it "should fetch remote page" do
19
- Skyscraper::fetch("http://google.com").should be_an Skyscraper::Node::Base
19
+ Skyscraper::fetch("http://google.com").should be_an Skyscraper::Node
20
20
  end
21
21
 
22
22
  it "static method fetch should works" do
@@ -6,6 +6,6 @@
6
6
 
7
7
  <body>
8
8
  <h1>Hello from A</h1>
9
- <a href="skyscraper-node-base-b.html">b</a>
9
+ <a href="skyscraper-node-b.html">b</a>
10
10
  </body>
11
11
  </html>
@@ -8,7 +8,7 @@
8
8
  <h1>Hello world</h1>
9
9
  <ul class="menu">
10
10
  <li>
11
- <a href="skyscraper-node-base-a.html">A</a>
11
+ <a href="skyscraper-node-a.html">A</a>
12
12
  <a href="b.html">A</a>
13
13
  <a href="c.html">A</a>
14
14
  <a href="d.html">A</a>
@@ -6,7 +6,7 @@
6
6
 
7
7
  <body>
8
8
  <h1>Hello from A</h1>
9
- <a href="skyscraper-node-resource-b.html">B</a>
10
- <img src="skyscraper-node-resource-image.png">
9
+ <a href="skyscraper-resource-b.html">B</a>
10
+ <img src="skyscraper-resource-image.png">
11
11
  </body>
12
12
  </html>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: skyscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-17 00:00:00.000000000 Z
12
+ date: 2012-05-21 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &71541210 !ruby/object:Gem::Requirement
16
+ requirement: &77062930 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *71541210
24
+ version_requirements: *77062930
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake
27
- requirement: &71603060 !ruby/object:Gem::Requirement
27
+ requirement: &77124830 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *71603060
35
+ version_requirements: *77124830
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &71600890 !ruby/object:Gem::Requirement
38
+ requirement: &77122840 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *71600890
46
+ version_requirements: *77122840
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: actionpack
49
- requirement: &71599790 !ruby/object:Gem::Requirement
49
+ requirement: &77119590 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,12 +54,13 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *71599790
58
- description: Library that helps scraping data from websites in easy way. Skyscraper
59
- allows you to traversing through html nodes, similary to jquery, it provides methods
60
- like parent, children, first, find, siblings etc. Thanks to Skyscraper you can fetch
61
- all HTML attributes on any node. Furthermore it's allow to download images, webpages,
62
- and store content in the database. Please visit Github account for more details.
57
+ version_requirements: *77119590
58
+ description: Easy to use DSL that helps scraping data from websites. Thanks to it,
59
+ writing web crawlers would be very fast and intuitive. Traversing through html nodes
60
+ and fetching all of the HTML attributes, would be possible. Just like in jQuery
61
+ - you will find methods like parent, children, first, find, siblings etc. Furthermore,
62
+ you are able to download images, web pages, and store all content in the database.
63
+ Please visit my Github account for more details.
63
64
  email:
64
65
  - arboooz@gmail.com
65
66
  executables: []
@@ -78,13 +79,12 @@ files:
78
79
  - lib/skyscraper/document.rb
79
80
  - lib/skyscraper/field.rb
80
81
  - lib/skyscraper/node.rb
81
- - lib/skyscraper/node/base.rb
82
- - lib/skyscraper/node/resource.rb
83
82
  - lib/skyscraper/pages.rb
84
83
  - lib/skyscraper/path.rb
85
84
  - lib/skyscraper/path/base.rb
86
85
  - lib/skyscraper/path/local.rb
87
86
  - lib/skyscraper/path/remote.rb
87
+ - lib/skyscraper/resource.rb
88
88
  - lib/skyscraper/results.rb
89
89
  - lib/skyscraper/version.rb
90
90
  - skyscraper.gemspec
@@ -92,11 +92,10 @@ files:
92
92
  - spec/skyscraper/skyscraper/config_spec.rb
93
93
  - spec/skyscraper/skyscraper/document_spec.rb
94
94
  - spec/skyscraper/skyscraper/field_spec.rb
95
- - spec/skyscraper/skyscraper/node/base_spec.rb
96
- - spec/skyscraper/skyscraper/node/resource_spec.rb
97
95
  - spec/skyscraper/skyscraper/node_spec.rb
98
96
  - spec/skyscraper/skyscraper/pages_spec.rb
99
97
  - spec/skyscraper/skyscraper/path_spec.rb
98
+ - spec/skyscraper/skyscraper/resource_spec.rb
100
99
  - spec/skyscraper/skyscraper/results_spec.rb
101
100
  - spec/skyscraper/skyscraper_spec.rb
102
101
  - spec/spec_helper.rb
@@ -108,14 +107,14 @@ files:
108
107
  - spec/test_files/skyscraper-fetch-2.html
109
108
  - spec/test_files/skyscraper-fetch.html
110
109
  - spec/test_files/skyscraper-field.html
111
- - spec/test_files/skyscraper-node-base-a.html
112
- - spec/test_files/skyscraper-node-base-b.html
113
- - spec/test_files/skyscraper-node-base-traversing.html
114
- - spec/test_files/skyscraper-node-base.html
115
- - spec/test_files/skyscraper-node-resource-b.html
116
- - spec/test_files/skyscraper-node-resource-image.png
117
- - spec/test_files/skyscraper-node-resource.html
110
+ - spec/test_files/skyscraper-node-a.html
111
+ - spec/test_files/skyscraper-node-b.html
112
+ - spec/test_files/skyscraper-node-traversing.html
113
+ - spec/test_files/skyscraper-node.html
118
114
  - spec/test_files/skyscraper-pages.html
115
+ - spec/test_files/skyscraper-resource-b.html
116
+ - spec/test_files/skyscraper-resource-image.png
117
+ - spec/test_files/skyscraper-resource.html
119
118
  - spec/test_files/skyscraper.html
120
119
  homepage: https://github.com/boooz/skyscraper
121
120
  licenses: []
@@ -140,17 +139,16 @@ rubyforge_project:
140
139
  rubygems_version: 1.8.15
141
140
  signing_key:
142
141
  specification_version: 3
143
- summary: Library that helps scraping data from websites in easy way
142
+ summary: Easy to use DSL that helps scraping data from websites
144
143
  test_files:
145
144
  - spec/skyscraper/skyscraper/base_spec.rb
146
145
  - spec/skyscraper/skyscraper/config_spec.rb
147
146
  - spec/skyscraper/skyscraper/document_spec.rb
148
147
  - spec/skyscraper/skyscraper/field_spec.rb
149
- - spec/skyscraper/skyscraper/node/base_spec.rb
150
- - spec/skyscraper/skyscraper/node/resource_spec.rb
151
148
  - spec/skyscraper/skyscraper/node_spec.rb
152
149
  - spec/skyscraper/skyscraper/pages_spec.rb
153
150
  - spec/skyscraper/skyscraper/path_spec.rb
151
+ - spec/skyscraper/skyscraper/resource_spec.rb
154
152
  - spec/skyscraper/skyscraper/results_spec.rb
155
153
  - spec/skyscraper/skyscraper_spec.rb
156
154
  - spec/spec_helper.rb
@@ -162,12 +160,12 @@ test_files:
162
160
  - spec/test_files/skyscraper-fetch-2.html
163
161
  - spec/test_files/skyscraper-fetch.html
164
162
  - spec/test_files/skyscraper-field.html
165
- - spec/test_files/skyscraper-node-base-a.html
166
- - spec/test_files/skyscraper-node-base-b.html
167
- - spec/test_files/skyscraper-node-base-traversing.html
168
- - spec/test_files/skyscraper-node-base.html
169
- - spec/test_files/skyscraper-node-resource-b.html
170
- - spec/test_files/skyscraper-node-resource-image.png
171
- - spec/test_files/skyscraper-node-resource.html
163
+ - spec/test_files/skyscraper-node-a.html
164
+ - spec/test_files/skyscraper-node-b.html
165
+ - spec/test_files/skyscraper-node-traversing.html
166
+ - spec/test_files/skyscraper-node.html
172
167
  - spec/test_files/skyscraper-pages.html
168
+ - spec/test_files/skyscraper-resource-b.html
169
+ - spec/test_files/skyscraper-resource-image.png
170
+ - spec/test_files/skyscraper-resource.html
173
171
  - spec/test_files/skyscraper.html
@@ -1,103 +0,0 @@
1
- module Skyscraper
2
- module Node
3
- class Base
4
- alias :original_class :class
5
-
6
- attr_accessor :element
7
-
8
- def initialize element
9
- @element = element
10
- end
11
-
12
- def first selector
13
- self.find(selector).first
14
- end
15
-
16
- def find selector
17
- @element.css(selector).map do |element|
18
- Base.new(element)
19
- end
20
- end
21
-
22
- def children selector = nil
23
- if selector
24
- children = @element.css(selector)
25
- else
26
- children = @element.children
27
- end
28
-
29
- children.select do |element|
30
- element.parent == @element and element.is_a?(Nokogiri::XML::Element)
31
- end.map do |child|
32
- Base.new(child)
33
- end
34
- end
35
-
36
- def parent
37
- if @element.parent.is_a? Nokogiri::XML::Element
38
- Base.new @element.parent
39
- end
40
- end
41
-
42
- def have_parent?
43
- self.parent.present?
44
- end
45
-
46
- def parents selector = nil
47
- node = self
48
- parents = []
49
-
50
- while node.have_parent?
51
- node = node.parent
52
- parents << node
53
- end
54
-
55
- parents.select! do |item|
56
- item.element.matches? selector
57
- end if selector
58
-
59
- parents
60
- end
61
-
62
- def siblings
63
- self.parent.children.select do |node|
64
- node.element != self.element
65
- end
66
- end
67
-
68
- def follow
69
- if self.href
70
- Skyscraper::fetch(self.uri)
71
- end
72
- end
73
-
74
- def html
75
- @element.children.to_html
76
- end
77
-
78
- def class
79
- @element.attribute("class").to_s
80
- end
81
-
82
- def download options = {}
83
- Resource.new(self).download(options)
84
- end
85
-
86
- def uri
87
- @element.document.path.full_path_for(self.href)
88
- end
89
-
90
- def method_missing name
91
- @element.attribute(name.to_s).to_s
92
- end
93
-
94
- def text
95
- @element.content.to_s.strip
96
- end
97
-
98
- def tag
99
- @element.name
100
- end
101
- end
102
- end
103
- end
@@ -1,57 +0,0 @@
1
- module Skyscraper
2
- module Node
3
- class Resource
4
- def initialize node
5
- @node = node
6
- @path = extract_path_from_node(@node)
7
- end
8
-
9
- def download options = {}
10
- @name = options[:file_name] || @path.file_name
11
- @new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path)
12
- @temp_file = open(@path.full_path)
13
-
14
- copy @temp_file.path, @new_file_path
15
- @new_file_path
16
- end
17
-
18
- private
19
-
20
- def copy from, to
21
- create_path_if_not_exists to
22
- `cp #{from} #{to}`
23
- end
24
-
25
- def create_path_if_not_exists path
26
- `mkdir -p #{path}` unless File.directory?(path)
27
- end
28
-
29
- def replace_path_variables path
30
- new_path = path.dup
31
- new_path.gsub! /:file_name/, @name
32
- new_path.gsub! /:sequence/, get_sequence_number_for(new_path)
33
- new_path
34
- end
35
-
36
- def get_sequence_number_for path
37
- new_path = path.split(":sequence")[0]
38
- if File.directory?(new_path)
39
- entries = Dir.entries(new_path).select { |i| i =~ /^\d+$/ } || []
40
- last = entries.sort.last.to_i
41
- last += 1
42
- last.to_s
43
- else
44
- "1"
45
- end
46
- end
47
-
48
- def extract_path_from_node node
49
- if href_or_src = node.href.present? ? node.href : node.src
50
- node.element.document.path.path_for(href_or_src)
51
- else
52
- throw Exception.new("no href no src")
53
- end
54
- end
55
- end
56
- end
57
- end
@@ -1,87 +0,0 @@
1
- describe Skyscraper::Node::Base do
2
- describe "when is initialized" do
3
- before(:each) do
4
- @node = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("div.item")
5
- end
6
-
7
- it "should returns html code" do
8
- @node.html.should include "<strong class=\"name\">Name value</strong>"
9
- end
10
-
11
- it "should returns class name" do
12
- @node.class.should == "item"
13
- end
14
-
15
- it "should be auto converted to string with stripped tags" do
16
- @node.text.should == "Name value"
17
- end
18
- end
19
-
20
- it "should follow links" do
21
- Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("h1").text.should == "Hello from A"
22
- end
23
-
24
- it "should deep follow links" do
25
- Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").follow.first("h1").text.should == "Hello from B"
26
- end
27
-
28
- it "should download page" do
29
- remove_test_directory
30
- Skyscraper.config.download_path = "/tmp/skyscraper_test/nodes/:file_name"
31
- file = Skyscraper::fetch(path_to("skyscraper-node-base.html")).first("li a").follow.first("a").download
32
- File.exists?(file).should == true
33
- end
34
- describe "traversing" do
35
- before(:each) do
36
- @node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first(".menu")
37
- end
38
-
39
- it "should find descendands items" do
40
- result = @node.find("li")
41
- result.length.should == 5
42
- result.map(&:text).should include "Item 4 1"
43
- end
44
-
45
- it "should returns children of element with selector" do
46
- node = Skyscraper::fetch(path_to("skyscraper-node-base-traversing.html")).first("#parent-3")
47
- node.children(".a").length.should == 4
48
- node.children(".b").length.should == 2
49
- end
50
-
51
- it "should returns children of element without selector" do
52
- result = @node.children
53
- result.length.should == 4
54
- result.map(&:to_s).should_not include "Item 4 1"
55
- end
56
-
57
- it "should returns first element" do
58
- @node.first("li").class.should == "item-1"
59
- end
60
-
61
- it "should returns parent of item" do
62
- @node.parent.class.should == "parent-2"
63
- end
64
-
65
- it "should tells if element have parent" do
66
- @node.have_parent?.should == true
67
- @node.parents("html").first.have_parent?.should == false
68
- end
69
-
70
- it "should returns parents of item" do
71
- @node.parents.length.should == 4
72
- end
73
-
74
- it "should returns parents of item matched by selector" do
75
- @node.parents("div").length.should == 2
76
- end
77
-
78
- it "should returns siblings of item" do
79
- @node.first(".item-3").siblings.length.should == 3
80
- end
81
-
82
- it "should returns node tag" do
83
- @node.tag.should == "ul"
84
- end
85
- end
86
- end
87
-