skyscraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +180 -0
- data/Rakefile +5 -0
- data/lib/skyscraper.rb +56 -0
- data/lib/skyscraper/base.rb +44 -0
- data/lib/skyscraper/config.rb +15 -0
- data/lib/skyscraper/document.rb +11 -0
- data/lib/skyscraper/field.rb +24 -0
- data/lib/skyscraper/node.rb +8 -0
- data/lib/skyscraper/node/base.rb +103 -0
- data/lib/skyscraper/node/resource.rb +57 -0
- data/lib/skyscraper/pages.rb +27 -0
- data/lib/skyscraper/path.rb +29 -0
- data/lib/skyscraper/path/base.rb +15 -0
- data/lib/skyscraper/path/local.rb +29 -0
- data/lib/skyscraper/path/remote.rb +32 -0
- data/lib/skyscraper/results.rb +93 -0
- data/lib/version.rb +3 -0
- data/skyscraper.gemspec +22 -0
- data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
- data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
- data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
- data/spec/skyscraper/skyscraper_spec.rb +39 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/support/skyscraper_helpers.rb +9 -0
- data/spec/test_files/encoding.html~ +12 -0
- data/spec/test_files/skyscraper-base.html +30 -0
- data/spec/test_files/skyscraper-document.html +30 -0
- data/spec/test_files/skyscraper-encoding.html +12 -0
- data/spec/test_files/skyscraper-fetch-2.html +11 -0
- data/spec/test_files/skyscraper-fetch.html +31 -0
- data/spec/test_files/skyscraper-field.html +30 -0
- data/spec/test_files/skyscraper-node-base-a.html +11 -0
- data/spec/test_files/skyscraper-node-base-b.html +10 -0
- data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
- data/spec/test_files/skyscraper-node-base.html +30 -0
- data/spec/test_files/skyscraper-node-resource-b.html +10 -0
- data/spec/test_files/skyscraper-node-resource-image.png +0 -0
- data/spec/test_files/skyscraper-node-resource.html +12 -0
- data/spec/test_files/skyscraper-pages.html +30 -0
- data/spec/test_files/skyscraper.html +30 -0
- metadata +169 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Adam Dratwinski
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
# Skyscraper
|
2
|
+
|
3
|
+
## Installation
|
4
|
+
|
5
|
+
**Skyscraper** installation is simple, just run:
|
6
|
+
|
7
|
+
`gem install skyscraper`
|
8
|
+
|
9
|
+
or add the following entry to your Gemfile:
|
10
|
+
|
11
|
+
`gem "skyscraper"`
|
12
|
+
|
13
|
+
if you want to use it in your rails project.
|
14
|
+
|
15
|
+
## Finding nodes by CSS Selectors
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first("title").text
|
19
|
+
# => "Ruby on Rails"
|
20
|
+
|
21
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".copyright p").text
|
22
|
+
# => "\\"Rails\\", \\"Ruby on Rails\\", and the Rails logo are registered trademarks of David Heinemeier Hansson. All rights reserved."
|
23
|
+
```
|
24
|
+
|
25
|
+
You can use this thanks to **Nokogiri#css** method.
|
26
|
+
|
27
|
+
## Reading HTML attributes
|
28
|
+
```ruby
|
29
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".announce").class
|
30
|
+
# => "announce"
|
31
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first("img").height
|
32
|
+
# => "112"
|
33
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".copyright").style
|
34
|
+
# => "margin-top: 20px"
|
35
|
+
|
36
|
+
```
|
37
|
+
|
38
|
+
> ## Notice!
|
39
|
+
>The **Skyscraper::Node::Base#class** method is overridden; to access the original **class** method, please call Skyscraper::Node::Base#original_class
|
40
|
+
|
41
|
+
You can find list of all available methods in [Reading attributes Section](wiki/reading_attributes)
|
42
|
+
|
43
|
+
## Using Skyscraper as included module
|
44
|
+
|
45
|
+
Fetching content from multiple pages and storing it in an ActiveRecord database is a common problem. You can do this quickly by using **Skyscraper** as an included module.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
class Sample
|
49
|
+
include Skyscraper
|
50
|
+
|
51
|
+
settings limit: 10, delay: { after: 5, time: 1 }, encoding: "utf-8"
|
52
|
+
|
53
|
+
pages ["http://google.com", "https://github.com", "http://rubyonrails.org"]
|
54
|
+
# pages method also accepts blocks as argument, then you can use Skyscraper::fetch method inside to get list of pages from website more dynamically
|
55
|
+
|
56
|
+
field :html, "html", :html
|
57
|
+
field :title, "title" do |node|
|
58
|
+
"'#{node.text}'"
|
59
|
+
end
|
60
|
+
field :first_link, "body" do |node|
|
61
|
+
"'#{node.first("a").href}'"
|
62
|
+
end
|
63
|
+
field :first_image, "img", :download
|
64
|
+
|
65
|
+
# field method takes following arguments:
|
66
|
+
# field_name => name that the record will have in the results table
|
67
|
+
# selector => CSS selector of the element to fetch, so it can even look like "tag #id.some_class"
|
68
|
+
# optionally symbol with the node method or block, if nothing is provided, text method on the node is fired
|
69
|
+
|
70
|
+
after_each do |result|
|
71
|
+
page = Page.new
|
72
|
+
page.title = result[:title]
|
73
|
+
page.html = result[:html]
|
74
|
+
page.first_link = result[:first_link]
|
75
|
+
page.first_image_path = result[:first_image]
|
76
|
+
page.save
|
77
|
+
end
|
78
|
+
|
79
|
+
after_all do
|
80
|
+
puts "Job done"
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
Sample.new.fetch #this will run above code applying provided callbacks and returns array with results
|
85
|
+
```
|
86
|
+
You will find more details in [Including section](wiki/Including).
|
87
|
+
|
88
|
+
## Traversing
|
89
|
+
|
90
|
+
Traversing through **Skyscraper** nodes is very similar to the way **jQuery** provides.
|
91
|
+
|
92
|
+
```ruby
|
93
|
+
>> Skyscraper::fetch("https://github.com").first(".top-nav").find("li").map(&:html)
|
94
|
+
# => ["<a href="https://github.com/plans">Signup and Pricing</a>", "<a href="https://github.com/explore">Explore GitHub</a>", "<a href="https://github.com/features">Features</a>", "<a href="https://github.com/blog">Blog</a>", "<a href="https://github.com/login">Login</a>"]
|
95
|
+
```
|
96
|
+
|
97
|
+
Of course you can write the same code in the easier way:
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
>> Skyscraper::fetch("https://github.com").find(".top-nav li").map(&:html)
|
101
|
+
# => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
|
102
|
+
```
|
103
|
+
|
104
|
+
or even:
|
105
|
+
|
106
|
+
```ruby
|
107
|
+
>> Skyscraper::fetch("https://github.com").find(".top-nav li a").map(&:content)
|
108
|
+
# => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
|
109
|
+
```
|
110
|
+
|
111
|
+
Read more about traversing in [Traversing section](wiki/traversing)
|
112
|
+
|
113
|
+
## Following
|
114
|
+
|
115
|
+
You can quickly follow a node element if it has an **href** attribute:
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
>> Skyscraper::fetch("https://github.com").first(".top-nav li a").follow.first("title").html
|
119
|
+
# => "Plans & Pricing · GitHub"
|
120
|
+
```
|
121
|
+
|
122
|
+
This example visits the first menu item on the github.com page and then fetches its title.
|
123
|
+
|
124
|
+
## Downloading
|
125
|
+
|
126
|
+
When a node element has a **src** or **href** attribute, you can easily download it:
|
127
|
+
|
128
|
+
```ruby
|
129
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
|
130
|
+
# => "/tmp/skyscraper/1/rails.png"
|
131
|
+
```
|
132
|
+
|
133
|
+
You can also provide a download path and a new file name as arguments. The default path can also be set in the configuration.
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
|
137
|
+
# => "/tmp/test/1/rails.png"
|
138
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
|
139
|
+
# => "/tmp/test/2/rails.png"
|
140
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/my_file.png")
|
141
|
+
# => "/tmp/test/my_file.png"
|
142
|
+
>> Skyscraper.config.download_path = "/tmp/test/my_path_from_config/:file_name"
|
143
|
+
# => "/tmp/test/my_path_from_config/:file_name"
|
144
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
|
145
|
+
# => "/tmp/test/my_path_from_config/rails.png"
|
146
|
+
```
|
147
|
+
|
148
|
+
\#download method returns path to saved file.
|
149
|
+
|
150
|
+
## Configuration
|
151
|
+
|
152
|
+
Please visit [Configuration section](wiki/configuration) to get all details of **Skyscraper** configuration.
|
153
|
+
|
154
|
+
## Testing
|
155
|
+
|
156
|
+
Please consider that you can fetch not only remote sites but also local files. This can be very helpful when you prefer TDD coding.
|
157
|
+
|
158
|
+
## Other topics
|
159
|
+
|
160
|
+
* [Fetching from sites with large amount of pages](wiki/fetching_large_pages) - dealing with limits, delays and other stuff
|
161
|
+
|
162
|
+
## Requirements
|
163
|
+
|
164
|
+
**Skyscraper** requires Ruby 1.9 or newer. It also depends on the Nokogiri, Open-Uri, Uri and Actionpack libraries.
|
165
|
+
|
166
|
+
## What is being considered for addition?
|
167
|
+
|
168
|
+
* POST requests support
|
169
|
+
* Reattempt fetching on errors
|
170
|
+
* Redirects support
|
171
|
+
* Testing mode - downloading only small amount of records, and showing how they would look in database
|
172
|
+
* Ruby < 1.9 versions support
|
173
|
+
* Redis, ActiveRecord cache and storage
|
174
|
+
* Ruby on Rails generators
|
175
|
+
|
176
|
+
Please don't hesitate to send me a comment about the above or any other functionality that might be added.
|
177
|
+
|
178
|
+
## Contributors
|
179
|
+
|
180
|
+
Here I will post a list of contributors who help to write documentation and fix bugs.
|
data/Rakefile
ADDED
data/lib/skyscraper.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require "open-uri"
|
2
|
+
require "uri"
|
3
|
+
require "nokogiri"
|
4
|
+
require "active_support/core_ext"
|
5
|
+
|
6
|
+
include ActiveSupport
|
7
|
+
|
8
|
+
# Top-level namespace of the library, also usable as a mixin.
#
# Used directly, `Skyscraper.fetch` loads a document and wraps its root in a
# Node::Base. Included into a class (via ActiveSupport::Concern), the class
# gains the DSL of Skyscraper::Base through ClassMethods#method_missing, and
# instances gain #fetch.
module Skyscraper
  extend ActiveSupport::Autoload
  extend ActiveSupport::Concern

  autoload :Base
  autoload :Config
  autoload :Document
  autoload :Field
  autoload :Node
  autoload :Pages
  autoload :Path
  autoload :Results

  # Library-wide default settings, exposed as Skyscraper.defaults.
  mattr_accessor :defaults
  @@defaults = {
    delay: { sleep: 0, after: 1 },
    limit: nil,
    encoding: "utf-8",
    download_path: "/tmp/skyscraper/:sequence/:file_name",
    # reattempt_times: 1,
    noise_errors: true,
    skip_on_error: true
  }

  # Returns the memoized global configuration.
  #
  # The defaults hash is copied (shallowly) so that writing to the global
  # config does not silently rewrite Skyscraper.defaults; this matches how
  # Skyscraper::Base#initialize builds its fallback config. Nested values
  # such as the :delay hash are still shared with the defaults.
  def self.config
    @config ||= Skyscraper::Config.new(@@defaults.dup)
  end

  # Loads the document at +path+ using +encoding+ (the configured encoding
  # by default) and returns a Node::Base wrapping the result of
  # `document.css("html")`.
  def self.fetch(path, encoding = Skyscraper.config.encoding)
    document = Skyscraper::Document.load(path, encoding)
    Node::Base.new(document.css("html"))
  end

  # Instance-level entry point for classes that include Skyscraper: runs the
  # fetch of the class-level Skyscraper::Base object.
  def fetch
    # `base` is private on the class, hence `send`.
    self.class.send(:base).fetch
  end

  module ClassMethods
    # Forwards unknown class-level calls (pages, field, settings, ...) to the
    # class's Skyscraper::Base instance; anything Base does not understand is
    # handed to the regular method_missing chain.
    def method_missing(method, *args, &block)
      if base.respond_to?(method)
        base.send(method, *args, &block)
      else
        super
      end
    end

    # Keeps respond_to? consistent with the forwarding done in method_missing.
    def respond_to_missing?(method, include_private = false)
      base.respond_to?(method) || super
    end

    private

    # Lazily built Skyscraper::Base, constructed with the global config.
    def base
      @base ||= Skyscraper::Base.new(Skyscraper.config)
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Skyscraper
  # Holds the state built up by the scraping DSL (pages, fields, callbacks,
  # settings) and delegates the actual fetching to a Results object.
  class Base
    attr_accessor :fields, :pages_object, :config, :results

    # config - configuration object; when nil, a new Skyscraper::Config is
    #          built from a copy of Skyscraper.defaults.
    def initialize(config = nil)
      @config        = config || Skyscraper::Config.new(Skyscraper.defaults.dup)
      @fields        = []
      @fetch_options = {}
      @pages_object  = Pages.new
      @results       = Results.new(self)
    end

    # Passes the page list (or a block producing it) to the Pages object.
    def pages(options = {}, &block)
      @pages_object.set(options, &block)
    end

    # Registers a field to extract, replacing any previously registered field
    # with the same name.
    #
    # name      - name of the field
    # selector  - CSS selector handed to the Field
    # attribute - optional node method name handed to the Field
    # block     - optional callback handed to the Field
    #
    # Returns the array of registered fields.
    def field(name, selector, attribute = nil, &block)
      @fields.reject! { |existing| existing.name == name }
      # The attribute used to be dropped here, so `field :x, "sel", :html`
      # never reached Field#attribute.
      @fields << Field.new(name: name, selector: selector, attribute: attribute, callback: block)
    end

    # Registers a callback on the Results object via add_after_each.
    def after_each(&block)
      @results.add_after_each(&block)
    end

    # Registers a callback on the Results object via add_after_all.
    def after_all(&block)
      @results.add_after_all(&block)
    end

    # Writes every key/value pair of +options+ into the config through its
    # "key=" writer.
    def settings(options = {})
      options.each_pair do |key, val|
        @config.public_send("#{key}=", val)
      end
    end

    # Delegates to Results#fetch.
    def fetch
      @results.fetch
    end

    # Delegates to Results#continue.
    def continue
      @results.continue
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Skyscraper
  # Thin dynamic wrapper around a settings hash: `config.foo` reads
  # settings[:foo] and `config.foo = x` writes it.
  class Config
    # settings - hash used as the backing store. It is kept by reference,
    #            not copied.
    def initialize(settings = {})
      @settings = settings
    end

    # Implements the dynamic readers and writers.
    #
    # A name ending in "=" stores +value+ under the name without that
    # trailing "="; any other name returns the stored value (nil when the
    # key is absent).
    def method_missing(name, value = nil)
      key = name.to_s
      if key.end_with?("=")
        @settings.merge!(key.chomp("=").to_sym => value)
      else
        @settings[name]
      end
    end

    # Reports writers and already-stored keys as supported, so respond_to?
    # agrees with method_missing without claiming arbitrary names such as
    # implicit-conversion hooks.
    def respond_to_missing?(name, include_private = false)
      name.to_s.end_with?("=") || @settings.key?(name) || super
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Skyscraper
  # Nokogiri HTML document that also remembers where it was loaded from.
  class Document < Nokogiri::HTML::Document
    # Path object produced by Skyscraper::Path.factory for the loaded source.
    attr_accessor :path

    # Opens +path+, parses its content as HTML with the given +encoding+ and
    # returns the document with #path set.
    #
    # The block form of `open` is used so the underlying IO is closed as soon
    # as parsing finishes instead of being left to the garbage collector.
    #
    # NOTE(review): `open` here is Kernel#open, which interprets a path
    # starting with "|" as a command to run; do not pass untrusted input.
    # NOTE(review): opening URLs this way relies on open-uri patching
    # Kernel#open, which newer Rubies no longer do - confirm the supported
    # Ruby versions before upgrading.
    def self.load(path, encoding = 'utf-8')
      document = open(path) do |io|
        Skyscraper::Document.parse(io, nil, encoding)
      end
      document.path = Skyscraper::Path.factory(path)
      document
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Skyscraper
  # One value to extract from a document: a CSS selector plus a rule for
  # turning the first matching node into a value.
  class Field
    attr_accessor :name, :selector, :callback, :attribute, :value

    # options - hash read for the :name, :selector, :attribute and :callback
    #           keys.
    def initialize(options = {})
      @name, @selector, @attribute, @callback =
        options.values_at(:name, :selector, :attribute, :callback)
    end

    # Looks up the first node matching the selector in +document+ and stores
    # (and returns) the extracted value. Precedence: the callback's result,
    # then the result of sending the attribute name to the node, then the
    # node's text.
    def find_in_document(document)
      node = document.first(@selector)

      @value =
        if @callback
          @callback.call(node)
        elsif @attribute
          node.send(@attribute)
        else
          node.text
        end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module Skyscraper
  module Node
    # Wrapper around a Nokogiri element offering jQuery-like traversal and
    # attribute access through method_missing.
    class Base
      # #class is overridden below to return the HTML "class" attribute; the
      # Ruby class stays reachable through #original_class.
      alias :original_class :class

      attr_accessor :element

      # element - the wrapped Nokogiri object.
      def initialize(element)
        @element = element
      end

      # Returns the first node matching +selector+, or nil when none matches.
      def first(selector)
        find(selector).first
      end

      # Returns every node matching the CSS +selector+, each wrapped in Base.
      def find(selector)
        @element.css(selector).map { |element| Base.new(element) }
      end

      # Returns the direct child elements, wrapped in Base. With a +selector+
      # only direct children matching it are returned.
      def children(selector = nil)
        candidates = selector ? @element.css(selector) : @element.children

        # css() also yields deeper descendants and children() yields
        # non-element nodes, so keep element nodes directly below us only.
        candidates.select do |element|
          element.parent == @element && element.is_a?(Nokogiri::XML::Element)
        end.map { |child| Base.new(child) }
      end

      # Returns the wrapped parent element, or nil when the parent is not a
      # Nokogiri::XML::Element.
      def parent
        Base.new(@element.parent) if @element.parent.is_a?(Nokogiri::XML::Element)
      end

      # True when #parent returns a node.
      def have_parent?
        !parent.nil?
      end

      # Returns all ancestors, nearest first; with a +selector+ only the
      # ancestors matching it.
      def parents(selector = nil)
        node = self
        ancestors = []

        while node.have_parent?
          node = node.parent
          ancestors << node
        end

        ancestors.select! { |item| item.element.matches?(selector) } if selector
        ancestors
      end

      # Returns the other child elements of this node's parent. A node
      # without a parent has no siblings, so an empty array is returned
      # instead of failing on nil.
      def siblings
        return [] unless have_parent?

        parent.children.reject { |node| node.element == element }
      end

      # Fetches the document this node's href points to. Returns nil when the
      # node has no href; the attribute reader yields "" in that case, which
      # is truthy, so emptiness is checked explicitly.
      def follow
        Skyscraper::fetch(uri) unless self.href.empty?
      end

      # Inner HTML of the element.
      def html
        @element.children.to_html
      end

      # Value of the HTML "class" attribute ("" when absent).
      def class
        @element.attribute("class").to_s
      end

      # Downloads the resource this node refers to; see Resource#download.
      def download(options = {})
        Resource.new(self).download(options)
      end

      # Full path for this node's href, resolved by the owning document's
      # path object.
      def uri
        @element.document.path.full_path_for(self.href)
      end

      # Any other zero-argument call is treated as an attribute read and
      # returns the attribute value as a String ("" when absent).
      def method_missing(name)
        @element.attribute(name.to_s).to_s
      end

      # Text content of the element with surrounding whitespace stripped.
      def text
        @element.content.to_s.strip
      end

      # Tag name of the element.
      def tag
        @element.name
      end
    end
  end
end
|