skyscraper 0.0.5 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec CHANGED
@@ -1 +1 @@
1
- --color
1
+ --color --fail-fast --format nested
@@ -2,6 +2,7 @@ require "open-uri"
2
2
  require "uri"
3
3
  require "nokogiri"
4
4
  require "active_support/core_ext"
5
+ require "net/http"
5
6
 
6
7
  include ActiveSupport
7
8
 
@@ -38,6 +39,7 @@ module Skyscraper
38
39
  Node.new document.css("html")
39
40
  end
40
41
 
42
+
41
43
  def fetch
42
44
  self.class.send(:base).fetch
43
45
  end
@@ -3,9 +3,29 @@ module Skyscraper
3
3
  attr_accessor :path
4
4
 
5
5
  def self.load path, encoding = 'utf-8'
6
- document = Skyscraper::Document.parse open(path), nil, encoding
6
+ document = Skyscraper::Document.parse open_from_path(path), nil, encoding
7
7
  document.path = Skyscraper::Path.factory(path)
8
8
  document
9
9
  end
10
+
11
+ def self.load_post path, params = {}, encoding = 'utf-8'
12
+ file = Net::HTTP.post_form(URI.parse(path), params).body
13
+
14
+ document = Skyscraper::Document.parse file
15
+ document.path = path
16
+ document
17
+ end
18
+
19
+ def self.open_from_path path
20
+ begin
21
+ file = open(path)
22
+ rescue RuntimeError
23
+ https_path = path.gsub /http:/, "https:"
24
+ file = open(https_path) unless https_path == path
25
+ end
26
+
27
+ file
28
+ end
29
+
10
30
  end
11
31
  end
@@ -97,5 +97,20 @@ module Skyscraper
97
97
  def tag
98
98
  @element.name
99
99
  end
100
+
101
+ def submit params = {}
102
+ raise Skyscraper::LocalFormException if @element.document.path.local?
103
+ raise Skyscraper::NotActionException if self.action.blank?
104
+
105
+ path = @element.document.path.full_path_for(self.action)
106
+ document = Skyscraper::Document.load_post path, params
107
+
108
+ Node.new(document.css("html"))
109
+ end
110
+ end
111
+
112
+ class LocalFormException < Exception
113
+ end
114
+ class NotActionException < Exception
100
115
  end
101
116
  end
@@ -5,6 +5,14 @@ module Skyscraper
5
5
  Path::factory(self.full_path_for(path))
6
6
  end
7
7
 
8
+ def local?
9
+ self.is_a? Skyscraper::Path::Local
10
+ end
11
+
12
+ def remote?
13
+ self.is_a? Skyscraper::Path::Remote
14
+ end
15
+
8
16
  private
9
17
 
10
18
  def get_file_name path
@@ -40,8 +40,9 @@ module Skyscraper
40
40
  result[field.name] = field.find_in_document document
41
41
  end
42
42
 
43
- call_callbacks @after_each, result, document
43
+ callback_result = call_callbacks @after_each, result, document
44
44
  results << result
45
+ break if callback_result === false
45
46
  sleep @delay[:sleep] if (i+1) % @delay[:after] == 0
46
47
 
47
48
  rescue SocketError, Errno::ENOENT
@@ -82,9 +83,11 @@ module Skyscraper
82
83
  end
83
84
 
84
85
  def call_callbacks callbacks, *args
86
+ result = true
85
87
  callbacks.each do |callback|
86
- callback.call(*args)
88
+ result = callback.call(*args)
87
89
  end
90
+ result
88
91
  end
89
92
  end
90
93
 
@@ -1,3 +1,3 @@
1
1
  module Skyscraper
2
- VERSION = "0.0.5"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -1,3 +1,5 @@
1
+ require "spec_helper"
2
+
1
3
  describe Skyscraper::Base do
2
4
 
3
5
  it "should set pages" do
@@ -1,3 +1,5 @@
1
+ require "spec_helper"
2
+
1
3
  describe Skyscraper::Config do
2
4
  it "should set variable on initialize" do
3
5
  config = Skyscraper::Config.new foo: "bar"
@@ -1,4 +1,5 @@
1
1
  #encoding: utf-8
2
+ require "spec_helper"
2
3
 
3
4
  describe Skyscraper::Document do
4
5
  it "should support utf-8 encoding by default in remote pages" do
@@ -11,4 +12,11 @@ describe Skyscraper::Document do
11
12
  document = Skyscraper::Document::load(path_to("skyscraper-document.html"))
12
13
  document.path.should be_an Skyscraper::Path::Base
13
14
  end
15
+
16
+ describe "when is opening" do
17
+ it "should handle https redirects" do
18
+ Skyscraper::Document.open_from_path("http://github.com").should be_an_instance_of Tempfile
19
+ end
20
+ end
21
+
14
22
  end
@@ -1,3 +1,5 @@
1
+ require "spec_helper"
2
+
1
3
  describe Skyscraper::Field do
2
4
  before(:all) do
3
5
  @page = Skyscraper::fetch(path_to("skyscraper-field.html"))
@@ -1,3 +1,5 @@
1
+ require "spec_helper"
2
+
1
3
  describe Skyscraper::Node do
2
4
  describe "when is initialized" do
3
5
  before(:each) do
@@ -83,5 +85,32 @@ describe Skyscraper::Node do
83
85
  @node.tag.should == "ul"
84
86
  end
85
87
  end
88
+
89
+ describe "Submit post data" do
90
+ it "should submit form post data" do
91
+ node = Skyscraper::fetch("http://www.balticplaza.eu/kontakt").first("#new_inquiry")
92
+ submited_page = node.submit(:"inquiry[name]" => "Example name")
93
+ submited_page.first("#inquiry_name").value.should == "Example name"
94
+ end
95
+
96
+ it "should throws an LocalFormException" do
97
+ lambda do
98
+ node = Skyscraper::fetch(path_to("skyscraper-node-form.html"))
99
+ node.first("form").submit
100
+ end.should raise_error Skyscraper::LocalFormException
101
+ end
102
+
103
+ it "should throws NotActionException" do
104
+ Skyscraper::Path::Local.any_instance.stub(:local? => false)
105
+
106
+ lambda do
107
+ Skyscraper::fetch(path_to("skyscraper-node-traversing.html")).first(".menu").submit
108
+ end.should raise_error Skyscraper::NotActionException
109
+ end
110
+
111
+ it "should handle GET form method" do
112
+ pending "todo"
113
+ end
114
+ end
86
115
  end
87
116
 
@@ -1,3 +1,5 @@
1
+ require "spec_helper"
2
+
1
3
  describe Skyscraper::Pages do
2
4
  it "should set convert string to items array" do
3
5
  Skyscraper::Pages.new("http://google.com").items.should == ["http://google.com"]
@@ -6,6 +6,14 @@ describe Skyscraper::Path do
6
6
  @path = Skyscraper::Path.factory("http://google.com/index.php?q=e")
7
7
  end
8
8
 
9
+ it "local? method should returns false" do
10
+ @path.local?.should == false
11
+ end
12
+
13
+ it "remote? method should returns true" do
14
+ @path.remote?.should == true
15
+ end
16
+
9
17
  it "should returns domain" do
10
18
  @path.domain.should == "http://google.com"
11
19
  end
@@ -65,6 +73,14 @@ describe Skyscraper::Path do
65
73
  @path.folder.should == "/var/www/files/"
66
74
  end
67
75
 
76
+ it "local? method should returns true" do
77
+ @path.local?.should == true
78
+ end
79
+
80
+ it "remote? method should returns false" do
81
+ @path.remote?.should == false
82
+ end
83
+
68
84
  it "should returns file name" do
69
85
  @path.file_name.should == "file.ext"
70
86
  end
@@ -1,3 +1,5 @@
1
+ require "spec_helper"
2
+
1
3
  describe Skyscraper::Resource do
2
4
  def should_download_resource_to node, path, options = {}
3
5
  resource = Skyscraper::Resource.new(node)
@@ -1,3 +1,5 @@
1
+ require "spec_helper"
2
+
1
3
  describe Skyscraper::Results do
2
4
  def fetch options = {}
3
5
  options.reverse_merge! fields: {}, options: {}
@@ -72,7 +74,7 @@ describe Skyscraper::Results do
72
74
  page.should be_an_instance_of(Skyscraper::Node)
73
75
  @call_count += 1
74
76
  end
75
-
77
+
76
78
  results = fetch path: [path_to("skyscraper-fetch.html")] * 10, options: { after_each: [callback] }
77
79
 
78
80
  @call_count.should == 10
@@ -117,6 +119,21 @@ describe Skyscraper::Results do
117
119
  results[0][:h1].should == "Hello world"
118
120
  end
119
121
 
122
+ it "should stops when after each callback returns false" do
123
+ counter = 0
124
+
125
+ callback = proc do
126
+ if counter == 1
127
+ counter = 0
128
+ false
129
+ else
130
+ counter += 1
131
+ end
132
+ end
133
+
134
+ results = fetch path: [path_to("skyscraper-fetch.html")] * 10, fields: { h1: "h1" }, options: { after_each: [callback] }
135
+ results.length.should == 2
136
+ end
120
137
  end
121
138
 
122
139
  describe "errors" do
@@ -1,5 +1,7 @@
1
1
  #encoding: utf-8
2
2
 
3
+ require "spec_helper"
4
+
3
5
  class TestScraper
4
6
  include Skyscraper
5
7
 
@@ -27,6 +29,10 @@ describe Skyscraper do
27
29
  Skyscraper::fetch(path_to("skyscraper-encoding.html")).first(".utf-8").text.should == "ąśćżół"
28
30
  end
29
31
 
32
+ it "should handle http -> https redirects" do
33
+ Skyscraper::fetch("http://github.com").first("title").text.should =~ /GitHub/
34
+ end
35
+
30
36
  it "should works when included" do
31
37
  TestScraper.new.fetch[0][:h1].should == "Hello world"
32
38
  TestScraper.new.fetch[1][:h1].should == "Hello world"
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <title>Title</title>
5
+ </head>
6
+
7
+ <body>
8
+ <form method="post">
9
+
10
+ </form>
11
+ </body>
12
+ </html>
13
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: skyscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-21 00:00:00.000000000 Z
12
+ date: 2012-05-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &77062930 !ruby/object:Gem::Requirement
16
+ requirement: &78066490 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *77062930
24
+ version_requirements: *78066490
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake
27
- requirement: &77124830 !ruby/object:Gem::Requirement
27
+ requirement: &78128330 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *77124830
35
+ version_requirements: *78128330
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &77122840 !ruby/object:Gem::Requirement
38
+ requirement: &78126140 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *77122840
46
+ version_requirements: *78126140
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: actionpack
49
- requirement: &77119590 !ruby/object:Gem::Requirement
49
+ requirement: &78123100 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *77119590
57
+ version_requirements: *78123100
58
58
  description: Easy to use DSL that helps scraping data from websites. Thanks to it,
59
59
  writing web crawlers would be very fast and intuitive. Traversing through html nodes
60
60
  and fetching all of the HTML attributes, would be possible. Just like in jQuery
@@ -109,6 +109,7 @@ files:
109
109
  - spec/test_files/skyscraper-field.html
110
110
  - spec/test_files/skyscraper-node-a.html
111
111
  - spec/test_files/skyscraper-node-b.html
112
+ - spec/test_files/skyscraper-node-form.html
112
113
  - spec/test_files/skyscraper-node-traversing.html
113
114
  - spec/test_files/skyscraper-node.html
114
115
  - spec/test_files/skyscraper-pages.html
@@ -162,6 +163,7 @@ test_files:
162
163
  - spec/test_files/skyscraper-field.html
163
164
  - spec/test_files/skyscraper-node-a.html
164
165
  - spec/test_files/skyscraper-node-b.html
166
+ - spec/test_files/skyscraper-node-form.html
165
167
  - spec/test_files/skyscraper-node-traversing.html
166
168
  - spec/test_files/skyscraper-node.html
167
169
  - spec/test_files/skyscraper-pages.html