skyscraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +180 -0
- data/Rakefile +5 -0
- data/lib/skyscraper.rb +56 -0
- data/lib/skyscraper/base.rb +44 -0
- data/lib/skyscraper/config.rb +15 -0
- data/lib/skyscraper/document.rb +11 -0
- data/lib/skyscraper/field.rb +24 -0
- data/lib/skyscraper/node.rb +8 -0
- data/lib/skyscraper/node/base.rb +103 -0
- data/lib/skyscraper/node/resource.rb +57 -0
- data/lib/skyscraper/pages.rb +27 -0
- data/lib/skyscraper/path.rb +29 -0
- data/lib/skyscraper/path/base.rb +15 -0
- data/lib/skyscraper/path/local.rb +29 -0
- data/lib/skyscraper/path/remote.rb +32 -0
- data/lib/skyscraper/results.rb +93 -0
- data/lib/version.rb +3 -0
- data/skyscraper.gemspec +22 -0
- data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
- data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
- data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
- data/spec/skyscraper/skyscraper_spec.rb +39 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/support/skyscraper_helpers.rb +9 -0
- data/spec/test_files/encoding.html~ +12 -0
- data/spec/test_files/skyscraper-base.html +30 -0
- data/spec/test_files/skyscraper-document.html +30 -0
- data/spec/test_files/skyscraper-encoding.html +12 -0
- data/spec/test_files/skyscraper-fetch-2.html +11 -0
- data/spec/test_files/skyscraper-fetch.html +31 -0
- data/spec/test_files/skyscraper-field.html +30 -0
- data/spec/test_files/skyscraper-node-base-a.html +11 -0
- data/spec/test_files/skyscraper-node-base-b.html +10 -0
- data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
- data/spec/test_files/skyscraper-node-base.html +30 -0
- data/spec/test_files/skyscraper-node-resource-b.html +10 -0
- data/spec/test_files/skyscraper-node-resource-image.png +0 -0
- data/spec/test_files/skyscraper-node-resource.html +12 -0
- data/spec/test_files/skyscraper-pages.html +30 -0
- data/spec/test_files/skyscraper.html +30 -0
- metadata +169 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Adam Dratwinski
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
# Skyscraper
|
2
|
+
|
3
|
+
## Installation
|
4
|
+
|
5
|
+
**Skyscraper** installation is simple, just run:
|
6
|
+
|
7
|
+
`gem install skyscraper`
|
8
|
+
|
9
|
+
or add the following entry to your Gemfile:
|
10
|
+
|
11
|
+
`gem "skyscraper"`
|
12
|
+
|
13
|
+
if you want to use it in your rails project.
|
14
|
+
|
15
|
+
## Finding nodes by CSS Selectors
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first("title").text
|
19
|
+
# => "Ruby on Rails"
|
20
|
+
|
21
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".copyright p").text
|
22
|
+
# => "\\"Rails\\", \\"Ruby on Rails\\", and the Rails logo are registered trademarks of David Heinemeier Hansson. All rights reserved."
|
23
|
+
```
|
24
|
+
|
25
|
+
You can use this thanks to **Nokogiri#css** method.
|
26
|
+
|
27
|
+
## Reading HTML attributes
|
28
|
+
```ruby
|
29
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".announce").class
|
30
|
+
# => "announce"
|
31
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first("img").height
|
32
|
+
# => "112"
|
33
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".copyright").style
|
34
|
+
# => "margin-top: 20px"
|
35
|
+
|
36
|
+
```
|
37
|
+
|
38
|
+
> ## Notice!
|
39
|
+
>The **Skyscraper::Node::Base#class** method is overridden; to access the original **class** method, please call Skyscraper::Node::Base#original_class
|
40
|
+
|
41
|
+
You can find list of all available methods in [Reading attributes Section](wiki/reading_attributes)
|
42
|
+
|
43
|
+
## Using Skyscraper as included module
|
44
|
+
|
45
|
+
Fetching content from multiple pages and storing it in an ActiveRecord database is a common problem. You can do this quickly by using **Skyscraper** as an included module.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
class Sample
|
49
|
+
include Skyscraper
|
50
|
+
|
51
|
+
settings limit: 10, delay: { after: 5, time: 1 }, encoding: "utf-8"
|
52
|
+
|
53
|
+
pages ["http://google.com", "https://github.com", "http://rubyonrails.org"]
|
54
|
+
# pages method also accepts blocks as argument, then you can use Skyscraper::fetch method inside to get list of pages from website more dynamically
|
55
|
+
|
56
|
+
field :html, "html", :html
|
57
|
+
field :title, "title" do |node|
|
58
|
+
"'#{node.text}'"
|
59
|
+
end
|
60
|
+
field :first_link, "body" do |node|
|
61
|
+
"'#{node.first("a").href}'"
|
62
|
+
end
|
63
|
+
field :first_image, "img", :download
|
64
|
+
|
65
|
+
# field method takes following arguments:
|
66
|
+
# field_name => name that the record will have in the results table
|
67
|
+
# selector => CSS selector of the element to fetch, so it can even look like "tag #id.some_class"
|
68
|
+
# optionally symbol with the node method or block, if nothing is provided, text method on the node is fired
|
69
|
+
|
70
|
+
after_each do |result|
|
71
|
+
page = Page.new
|
72
|
+
page.title = result[:title]
|
73
|
+
page.html = result[:html]
|
74
|
+
page.first_link = result[:first_link]
|
75
|
+
page.first_image_path = result[:first_image]
|
76
|
+
page.save
|
77
|
+
end
|
78
|
+
|
79
|
+
after_all do
|
80
|
+
puts "Job done"
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
Sample.new.fetch #this will run above code applying provided callbacks and returns array with results
|
85
|
+
```
|
86
|
+
You will find more details in [Including section](wiki/Including).
|
87
|
+
|
88
|
+
## Traversing
|
89
|
+
|
90
|
+
Traversing through **Skyscraper** nodes is very similar to the way **jQuery** provides.
|
91
|
+
|
92
|
+
```ruby
|
93
|
+
>> Skyscraper::fetch("https://github.com").first(".top-nav").find("li").map(&:html)
|
94
|
+
# => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
|
95
|
+
```
|
96
|
+
|
97
|
+
Of course you can write the same code in the easier way:
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
>> Skyscraper::fetch("https://github.com").find(".top-nav li").map(&:html)
|
101
|
+
# => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
|
102
|
+
```
|
103
|
+
|
104
|
+
or even:
|
105
|
+
|
106
|
+
```ruby
|
107
|
+
>> Skyscraper::fetch("https://github.com").find(".top-nav li a").map(&:content)
|
108
|
+
# => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
|
109
|
+
```
|
110
|
+
|
111
|
+
Read more about traversing in [Traversing section](wiki/traversing)
|
112
|
+
|
113
|
+
## Following
|
114
|
+
|
115
|
+
You can quickly follow a node element if it has an **href** attribute:
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
>> Skyscraper::fetch("https://github.com").first(".top-nav li a").follow.first("title").html
|
119
|
+
# => "Plans & Pricing · GitHub"
|
120
|
+
```
|
121
|
+
|
122
|
+
This example visits the first menu item on the github.com page and then fetches its title.
|
123
|
+
|
124
|
+
## Downloading
|
125
|
+
|
126
|
+
When a node element has a **src** or **href** attribute, you can easily download it:
|
127
|
+
|
128
|
+
```ruby
|
129
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
|
130
|
+
# => "/tmp/skyscraper/1/rails.png"
|
131
|
+
```
|
132
|
+
|
133
|
+
You can also provide the download path and a new file name as arguments. The default path can also be set in the configuration.
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
|
137
|
+
# => "/tmp/test/1/rails.png"
|
138
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
|
139
|
+
# => "/tmp/test/2/rails.png"
|
140
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/my_file.png")
|
141
|
+
# => "/tmp/test/my_file.png"
|
142
|
+
>> Skyscraper.config.download_path = "/tmp/test/my_path_from_config/:file_name"
|
143
|
+
# => "/tmp/test/my_path_from_config/:file_name"
|
144
|
+
>> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
|
145
|
+
# => "/tmp/test/my_path_from_config/rails.png"
|
146
|
+
```
|
147
|
+
|
148
|
+
The \#download method returns the path to the saved file.
|
149
|
+
|
150
|
+
## Configuration
|
151
|
+
|
152
|
+
Please visit [Configuration section](wiki/configuration) to get all details of **Skyscraper** configuration.
|
153
|
+
|
154
|
+
## Testing
|
155
|
+
|
156
|
+
Please consider that you can fetch not only remote sites but also local files. This can be very helpful when you prefer TDD coding.
|
157
|
+
|
158
|
+
## Other topics
|
159
|
+
|
160
|
+
* [Fetching from sites with large amount of pages](wiki/fetching_large_pages) - dealing with limits, delays and other stuff
|
161
|
+
|
162
|
+
## Requirements
|
163
|
+
|
164
|
+
**Skyscraper** requires Ruby 1.9 or newer. It also depends on the Nokogiri, Open-URI, URI and Actionpack libraries.
|
165
|
+
|
166
|
+
## What is being considered for addition?
|
167
|
+
|
168
|
+
* POST requests support
|
169
|
+
* Reattempt fetching on errors
|
170
|
+
* Redirects support
|
171
|
+
* Testing mode - downloading only small amount of records, and showing how they would look in database
|
172
|
+
* Ruby < 1.9 versions support
|
173
|
+
* Redis, ActiveRecord cache and storage
|
174
|
+
* Ruby on Rails generators
|
175
|
+
|
176
|
+
Please don't hesitate to post me a comment about above or other functionality that might be added.
|
177
|
+
|
178
|
+
## Contributors
|
179
|
+
|
180
|
+
Here I will post a list of contributors who help to create documentation and bug fixes.
|
data/Rakefile
ADDED
data/lib/skyscraper.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require "open-uri"
|
2
|
+
require "uri"
|
3
|
+
require "nokogiri"
|
4
|
+
require "active_support/core_ext"
|
5
|
+
|
6
|
+
include ActiveSupport
|
7
|
+
|
8
|
+
# Top-level namespace of the gem and the mixin that exposes the scraping DSL.
#
# Used directly, Skyscraper.fetch loads one document and returns a node
# wrapper around its "html" selection. Included into a class (through
# ActiveSupport::Concern), it adds an instance-level #fetch and forwards
# DSL calls made on the class to a per-class Skyscraper::Base.
module Skyscraper
  extend ActiveSupport::Autoload
  extend ActiveSupport::Concern

  autoload :Base
  autoload :Config
  autoload :Document
  autoload :Field
  autoload :Node
  autoload :Pages
  autoload :Path
  autoload :Results

  # Library-wide default settings, readable and writable as Skyscraper.defaults.
  mattr_accessor :defaults
  @@defaults = {
    delay: { sleep: 0, after: 1 },
    limit: nil,
    encoding: "utf-8",
    download_path: "/tmp/skyscraper/:sequence/:file_name",
    # reattempt_times: 1,
    noise_errors: true,
    skip_on_error: true
  }

  # Returns the memoized global configuration.
  #
  # The Config is built on the defaults hash itself (no copy is made here).
  def self.config
    @config ||= Skyscraper::Config.new(@@defaults)
  end

  # Loads the document at +path+ and wraps its "html" selection in a node.
  #
  # path     - location handed to Skyscraper::Document.load.
  # encoding - encoding handed to the loader; defaults to the configured one.
  #
  # Returns a Skyscraper::Node::Base.
  def self.fetch path, encoding = Skyscraper.config.encoding
    document = Skyscraper::Document.load(path, encoding)
    Node::Base.new(document.css("html"))
  end

  # Runs the fetch defined by the including class's DSL.
  def fetch
    # +base+ is private on the class, hence +send+.
    self.class.send(:base).fetch
  end

  # Class-level half of the mixin: DSL calls such as +pages+, +field+ or
  # +settings+ are answered by the per-class Skyscraper::Base.
  module ClassMethods
    # Forwards DSL calls to the underlying Skyscraper::Base.
    #
    # Only methods that Base answers publicly are delegated; anything else
    # takes the regular NoMethodError path on the class itself.
    def method_missing method, *args, &block
      return super unless base.respond_to?(method)

      base.public_send(method, *args, &block)
    end

    # Keeps respond_to? in line with what method_missing delegates.
    def respond_to_missing? method, include_private = false
      base.respond_to?(method) || super
    end

    private

    # Per-class Skyscraper::Base, created on first use with the global config.
    def base
      @base ||= Skyscraper::Base.new(Skyscraper.config)
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Skyscraper
  # Holds one scraping definition: its configuration, the pages to visit,
  # the fields to extract and the Results object that runs the fetch.
  class Base
    attr_accessor :fields, :pages_object, :config, :results

    # config - configuration object; when nil, a new Config is built from a
    #          copy of Skyscraper.defaults.
    def initialize config = nil
      @config = (config || Skyscraper::Config.new(Skyscraper.defaults.dup))
      @fields = []
      @fetch_options = {}
      @pages_object = Pages.new
      @results = Results.new self
    end

    # Hands the page list (or a block producing it) to the Pages object.
    def pages options = {}, &block
      @pages_object.set(options, &block)
    end

    # Declares a field to extract, replacing any earlier field of the same name.
    #
    # name      - key identifying the field.
    # selector  - CSS selector used to locate the node.
    # attribute - optional node method to read the value with.
    # block     - optional callback stored on the field.
    #
    # Returns the list of fields.
    def field name, selector, attribute = nil, &block
      @fields.reject! { |existing| existing.name == name }
      # The attribute must be handed over too, otherwise the field always
      # falls back to reading the node's text.
      @fields << Field.new(name: name, selector: selector, attribute: attribute, callback: block)
    end

    # Registers a callback on the Results object via add_after_each.
    def after_each &block
      @results.add_after_each(&block)
    end

    # Registers a callback on the Results object via add_after_all.
    def after_all &block
      @results.add_after_all(&block)
    end

    # Writes each key/value pair of +options+ to the configuration.
    def settings options = {}
      options.each_pair { |key, val| @config.send("#{key}=", val) }
    end

    # Delegates to Results#fetch.
    def fetch
      @results.fetch
    end

    # Delegates to Results#continue.
    def continue
      @results.continue
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Skyscraper
  # Settings container whose keys are read and written as methods:
  # +config.limit+ reads the :limit key, +config.limit = 5+ writes it.
  class Config
    # settings - Hash used as the backing store; it is kept by reference.
    def initialize settings = {}
      @settings = settings
    end

    # Treats a name ending in "=" as a write and any other name as a read.
    #
    # A read of a key that was never set returns nil.
    def method_missing name, value = nil
      key = name.to_s

      if key.end_with?("=")
        @settings.merge!(key.delete("=").to_sym => value)
      else
        @settings[name]
      end
    end

    # Reports setters and keys that are currently stored as supported.
    def respond_to_missing? name, include_private = false
      name.to_s.end_with?("=") || @settings.key?(name) || super
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Skyscraper
  # Nokogiri HTML document that remembers the location it was loaded from.
  class Document < Nokogiri::HTML::Document
    # Object returned by Skyscraper::Path.factory for the loaded location.
    attr_accessor :path

    # Opens +path+, parses its content as HTML and records the location.
    #
    # path     - location passed to +open+ and to Skyscraper::Path.factory.
    # encoding - encoding passed to the parser (default 'utf-8').
    #
    # Returns the parsed Skyscraper::Document.
    #
    # NOTE(review): +open+ is Kernel#open; fetching remote paths this way
    # relies on open-uri's patch of it, which newer Rubies dropped - confirm
    # the supported Ruby versions. Kernel#open should also not be given
    # untrusted paths.
    def self.load path, encoding = 'utf-8'
      # Block form so the opened IO is closed as soon as parsing is done.
      document = open(path) do |io|
        Skyscraper::Document.parse(io, nil, encoding)
      end
      document.path = Skyscraper::Path.factory(path)
      document
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Skyscraper
  # One named value to extract from a document: where to find it (selector)
  # and how to read it (callback, node method, or the node's text).
  class Field
    attr_accessor :name, :selector, :callback, :attribute, :value

    # options - Hash with the keys :name, :selector, :attribute and :callback.
    def initialize options = {}
      @name = options[:name]
      @selector = options[:selector]
      @attribute = options[:attribute]
      @callback = options[:callback]
    end

    # Locates the first node matching the selector and stores its value.
    #
    # document - object answering +first(selector)+.
    #
    # The callback wins over the attribute; with neither, the node's text
    # is used. Returns the stored value.
    def find_in_document document
      first_node = document.first(@selector)

      @value =
        if @callback
          @callback.call(first_node)
        elsif @attribute
          # public_send so that a name which is also a private Kernel method
          # (e.g. :loop, :open, :format) reaches the node's own lookup
          # instead of invoking Kernel.
          first_node.public_send(@attribute)
        else
          first_node.text
        end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module Skyscraper
  module Node
    # Wrapper around a parsed element that adds traversal helpers and lets
    # HTML attributes be read as methods (node.href, node.class, ...).
    class Base
      # #class is redefined below to read the HTML attribute, so the
      # original Object#class stays reachable under this name.
      alias :original_class :class

      attr_accessor :element

      # element - the wrapped element (or node set) from the parsed document.
      def initialize element
        @element = element
      end

      # Returns the first descendant matching +selector+, or nil.
      def first selector
        find(selector).first
      end

      # Returns every descendant matching +selector+, each wrapped in a Base.
      def find selector
        @element.css(selector).map { |match| Base.new(match) }
      end

      # Returns the direct element children, optionally limited to those
      # matching +selector+.
      def children selector = nil
        candidates = selector ? @element.css(selector) : @element.children

        direct = candidates.select do |candidate|
          # css() also yields deeper descendants and children() also yields
          # non-element nodes; keep only direct element children.
          candidate.parent == @element && candidate.is_a?(Nokogiri::XML::Element)
        end

        direct.map { |child| Base.new(child) }
      end

      # Returns the wrapped parent, or nil when the parent is not an element.
      def parent
        Base.new(@element.parent) if @element.parent.is_a?(Nokogiri::XML::Element)
      end

      # True when #parent returns a node.
      def have_parent?
        !parent.nil?
      end

      # Returns all ancestors from nearest to farthest, optionally limited
      # to those matching +selector+.
      def parents selector = nil
        ancestors = []
        node = self

        while node.have_parent?
          node = node.parent
          ancestors << node
        end

        ancestors.select! { |item| item.element.matches?(selector) } if selector
        ancestors
      end

      # Returns the other element children of this node's parent; an empty
      # list when the node has no element parent.
      def siblings
        owner = parent
        return [] unless owner

        owner.children.reject { |node| node.element == element }
      end

      # Fetches the page this node's href points to.
      #
      # Returns nil when the node has no href value.
      def follow
        # href is read through method_missing and is always a String, so
        # emptiness (not truthiness) tells whether the attribute is set.
        return if href.empty?

        Skyscraper::fetch(uri)
      end

      # Returns the inner HTML of the element.
      def html
        @element.children.to_html
      end

      # Returns the value of the HTML +class+ attribute as a String.
      # Use #original_class for the Ruby class.
      def class
        @element.attribute("class").to_s
      end

      # Downloads the resource this node points to; see Resource#download.
      def download options = {}
        Resource.new(self).download(options)
      end

      # Resolves this node's href against the path of its document.
      def uri
        @element.document.path.full_path_for(href)
      end

      # Reads the HTML attribute called +name+; "" when it is not set.
      def method_missing name
        @element.attribute(name.to_s).to_s
      end

      # Reports attributes that are present on the element as supported.
      def respond_to_missing? name, include_private = false
        (@element.respond_to?(:key?) && @element.key?(name.to_s)) || super
      end

      # Returns the element's text content with surrounding whitespace removed.
      def text
        @element.content.to_s.strip
      end

      # Returns the element's tag name.
      def tag
        @element.name
      end
    end
  end
end
|