skyscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in skyscraper.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Adam Dratwinski
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,180 @@
1
+ # Skyscraper
2
+
3
+ ## Installation
4
+
5
+ **Skyscraper** installation is simple, just run:
6
+
7
+ `gem install skyscraper`
8
+
9
+ or add the following entry to your Gemfile:
10
+
11
+ `gem "skyscraper"`
12
+
13
+ if you want to use it in your rails project.
14
+
15
+ ## Finding nodes by CSS Selectors
16
+
17
+ ```ruby
18
+ >> Skyscraper::fetch("http://rubyonrails.org").first("title").text
19
+ # => "Ruby on Rails"
20
+
21
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".copyright p").text
22
+ # => "\\"Rails\\", \\"Ruby on Rails\\", and the Rails logo are registered trademarks of David Heinemeier Hansson. All rights reserved."
23
+ ```
24
+
25
+ You can use this thanks to **Nokogiri#css** method.
26
+
27
+ ## Reading HTML attributes
28
+ ```ruby
29
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".announce").class
30
+ # => "announce"
31
+ >> Skyscraper::fetch("http://rubyonrails.org").first("img").height
32
+ # => "112"
33
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".copyright").style
34
+ # => "margin-top: 20px"
35
+
36
+ ```
37
+
38
+ > ## Notice!
39
+ >The **Skyscraper::Node::Base#class** method is overridden; to access the original **class** method, call **Skyscraper::Node::Base#original_class**.
40
+
41
+ You can find list of all available methods in [Reading attributes Section](wiki/reading_attributes)
42
+
43
+ ## Using Skyscraper as included module
44
+
45
+ Fetching content from multiple pages and storing it in an ActiveRecord database is a common task. You can do this quickly by using **Skyscraper** as an included module.
46
+
47
+ ```ruby
48
+ class Sample
49
+ include Skyscraper
50
+
51
+ settings limit: 10, delay: { after: 5, time: 1 }, encoding: "utf-8"
52
+
53
+ pages ["http://google.com", "https://github.com", "http://rubyonrails.org"]
54
+ # the pages method also accepts a block, inside which you can call Skyscraper::fetch to build the list of pages dynamically from a website
55
+
56
+ field :html, "html", :html
57
+ field :title, "title" do |node|
58
+ "'#{node.text}'"
59
+ end
60
+ field :first_link, "body" do |node|
61
+ "'#{node.first("a").href}'"
62
+ end
63
+ field :first_image, "img", :download
64
+
65
+ # field method takes following arguments:
66
+ # field_name => name that the record will have in the results table
67
+ # selector => CSS selector of the element to fetch, so it can even look like "tag #id.some_class"
68
+ # optionally, a symbol naming a node method, or a block; if neither is provided, the text method is called on the node
69
+
70
+ after_each do |result|
71
+ page = Page.new
72
+ page.title = result[:title]
73
+ page.html = result[:html]
74
+ page.first_link = result[:first_link]
75
+ page.first_image_path = result[:first_image]
76
+ page.save
77
+ end
78
+
79
+ after_all do
80
+ puts "Job done"
81
+ end
82
+ end
83
+
84
+ Sample.new.fetch # runs the code above, applies the provided callbacks and returns an array of results
85
+ ```
86
+ You will find more details in [Including section](wiki/Including).
87
+
88
+ ## Traversing
89
+
90
+ Traversing through **Skyscraper** nodes works much like traversing in **jQuery**.
91
+
92
+ ```ruby
93
+ >> Skyscraper::fetch("https://github.com").first(".top-nav").find("li").map(&:html)
94
+ # => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
95
+ ```
96
+
97
+ Of course, you can write the same code in an easier way:
98
+
99
+ ```ruby
100
+ >> Skyscraper::fetch("https://github.com").find(".top-nav li").map(&:html)
101
+ # => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
102
+ ```
103
+
104
+ or even:
105
+
106
+ ```ruby
107
+ >> Skyscraper::fetch("https://github.com").find(".top-nav li a").map(&:content)
108
+ # => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
109
+ ```
110
+
111
+ Read more about traversing in [Traversing section](wiki/traversing)
112
+
113
+ ## Following
114
+
115
+ You can quickly follow a node element if it has an **href** attribute:
116
+
117
+ ```ruby
118
+ >> Skyscraper::fetch("https://github.com").first(".top-nav li a").follow.first("title").html
119
+ # => "Plans &amp; Pricing · GitHub"
120
+ ```
121
+
122
+ This example visits the first menu item on the github.com page and then fetches the title of the linked page.
123
+
124
+ ## Downloading
125
+
126
+ When a node element has a **src** or **href** attribute, you can easily download the resource it points to:
127
+
128
+ ```ruby
129
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
130
+ # => "/tmp/skyscraper/1/rails.png"
131
+ ```
132
+
133
+ You can also pass the download path and a new file name as arguments. The default path can be set in the configuration.
134
+
135
+ ```ruby
136
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
137
+ # => "/tmp/test/1/rails.png"
138
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
139
+ # => "/tmp/test/2/rails.png"
140
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/my_file.png")
141
+ # => "/tmp/test/my_file.png"
142
+ >> Skyscraper.config.download_path = "/tmp/test/my_path_from_config/:file_name"
143
+ # => "/tmp/test/my_path_from_config/:file_name"
144
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
145
+ # => "/tmp/test/my_path_from_config/rails.png"
146
+ ```
147
+
148
+ The \#download method returns the path to the saved file.
149
+
150
+ ## Configuration
151
+
152
+ Please visit [Configuration section](wiki/configuration) to get all details of **Skyscraper** configuration.
153
+
154
+ ## Testing
155
+
156
+ Keep in mind that you can fetch not only remote sites but also local files. This can be very helpful if you prefer TDD.
157
+
158
+ ## Other topics
159
+
160
+ * [Fetching from sites with large amount of pages](wiki/fetching_large_pages) - dealing with limits, delays and other stuff
161
+
162
+ ## Requirements
163
+
164
+ **Skyscraper** requires Ruby 1.9 or newer. It also depends on the Nokogiri, Open-Uri, Uri and Actionpack libraries.
165
+
166
+ ## What is being considered for addition?
167
+
168
+ * POST requests support
169
+ * Reattempt fetching on errors
170
+ * Redirects support
171
+ * Testing mode - downloading only small amount of records, and showing how they would look in database
172
+ * Ruby < 1.9 versions support
173
+ * Redis, ActiveRecord cache and storage
174
+ * Ruby on Rails generators
175
+
176
+ Please don't hesitate to send me a comment about the items above or about any other functionality that might be added.
177
+
178
+ ## Contributors
179
+
180
+ Here I will post a list of contributors who help to write documentation and fix bugs.
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task default: :spec
data/lib/skyscraper.rb ADDED
@@ -0,0 +1,56 @@
1
+ require "open-uri"
2
+ require "uri"
3
+ require "nokogiri"
4
+ require "active_support/core_ext"
5
+
6
+ include ActiveSupport
7
+
8
# Entry point of the gem.
#
# It can be used directly through Skyscraper.fetch, or included into a class.
# When included, ActiveSupport::Concern extends the class with ClassMethods,
# which forwards the scraping DSL to a per-class Skyscraper::Base instance.
module Skyscraper
  extend ActiveSupport::Autoload
  extend ActiveSupport::Concern

  autoload :Base
  autoload :Config
  autoload :Document
  autoload :Field
  autoload :Node
  autoload :Pages
  autoload :Path
  autoload :Results

  # Library-wide default settings, exposed as Skyscraper.defaults.
  mattr_accessor :defaults
  @@defaults = {
    delay: { sleep: 0, after: 1 },
    limit: nil,
    encoding: "utf-8",
    download_path: "/tmp/skyscraper/:sequence/:file_name",
    # reattempt_times: 1,
    noise_errors: true,
    skip_on_error: true
  }

  # Returns the memoized global configuration.
  #
  # The configuration wraps a copy of the defaults: Config writes settings
  # into the hash it was given, so wrapping @@defaults itself would let every
  # `Skyscraper.config.x = ...` silently rewrite the defaults that
  # Skyscraper::Base#initialize falls back to.
  def self.config
    @config ||= Skyscraper::Config.new @@defaults.dup
  end

  # Loads the document found at +path+ and returns a Node::Base wrapping the
  # result of its "html" CSS query.
  #
  # path     - location passed on to Skyscraper::Document.load.
  # encoding - document encoding; defaults to the configured encoding.
  def self.fetch(path, encoding = Skyscraper.config.encoding)
    document = Skyscraper::Document.load path, encoding
    Node::Base.new document.css("html")
  end

  # Runs the scraping job defined on the including class and returns whatever
  # Skyscraper::Base#fetch returns.
  def fetch
    # `base` is private on the class, hence the explicit send.
    self.class.send(:base).fetch
  end

  # Class-level DSL added to every class that includes Skyscraper.
  module ClassMethods
    # Forwards DSL calls to the class's Skyscraper::Base instance. Messages
    # that Base does not understand are reported as a NoMethodError on the
    # class itself.
    def method_missing(method, *args, &block)
      return super unless base.respond_to?(method)

      base.public_send method, *args, &block
    end

    # Keeps respond_to? consistent with method_missing.
    def respond_to_missing?(method, include_private = false)
      base.respond_to?(method) || super
    end

    private

    # Lazily built Base instance; it is created with the global configuration.
    def base
      @base ||= Skyscraper::Base.new Skyscraper.config
    end
  end
end
@@ -0,0 +1,44 @@
1
module Skyscraper
  # Holds one scraping definition: the pages object, the fields to extract,
  # the configuration and the results object that fetch/continue delegate to.
  class Base
    attr_accessor :fields, :pages_object, :config, :results

    # config - configuration object; when nil, a new Config is built from a
    #          copy of Skyscraper.defaults.
    def initialize(config = nil)
      @config = config || Skyscraper::Config.new(Skyscraper.defaults.dup)
      @fields = []
      @fetch_options = {}
      @pages_object = Pages.new
      @results = Results.new self
    end

    # Passes the options (and optional block) on to Pages#set.
    def pages(options = {}, &block)
      @pages_object.set options, &block
    end

    # Registers a field, replacing any previously registered field that has
    # the same name.
    #
    # name      - key of the field.
    # selector  - CSS selector handed to the Field.
    # attribute - optional symbol naming the node method that yields the value.
    # block     - optional callback handed to the Field.
    #
    # Returns the list of registered fields.
    def field(name, selector, attribute = nil, &block)
      @fields.reject! { |existing| existing.name == name }
      # The attribute must be handed over as well; it used to be dropped here,
      # which made every symbol-style field behave as if none was given.
      @fields << Field.new(name: name, selector: selector, attribute: attribute, callback: block)
    end

    # Registers a callback through Results#add_after_each.
    def after_each(&block)
      @results.add_after_each(&block)
    end

    # Registers a callback through Results#add_after_all.
    def after_all(&block)
      @results.add_after_all(&block)
    end

    # Writes each key/value pair of +options+ to the configuration by calling
    # the matching "key=" writer.
    def settings(options = {})
      options.each_pair do |key, val|
        @config.public_send "#{key}=", val
      end
    end

    # Delegates to Results#fetch.
    def fetch
      @results.fetch
    end

    # Delegates to Results#continue.
    def continue
      @results.continue
    end
  end
end
@@ -0,0 +1,15 @@
1
module Skyscraper
  # Thin dynamic wrapper around a settings hash: `config.foo` reads
  # settings[:foo] and `config.foo = value` writes it.
  class Config
    # settings - hash of initial settings. It is stored as-is (not copied),
    #            so writes through this object are visible in the hash.
    def initialize(settings = {})
      @settings = settings
    end

    # Handles dynamic readers and writers.
    #
    # A message ending in "=" stores +value+ under the symbol formed from the
    # rest of the name; any other message returns the stored setting, or nil
    # when it is not set.
    def method_missing(name, value = nil)
      key = name.to_s

      if key.end_with?("=")
        @settings.merge! key.delete("=").to_sym => value
      else
        @settings[name]
      end
    end

    # Keeps respond_to? in line with method_missing: writers are always
    # available, readers only for settings that are present. It deliberately
    # does not answer true for everything, so that duck-typing probes such as
    # respond_to?(:to_ary) keep returning false.
    def respond_to_missing?(name, include_private = false)
      name.to_s.end_with?("=") || @settings.key?(name) || super
    end
  end
end
@@ -0,0 +1,11 @@
1
module Skyscraper
  # A Nokogiri HTML document that also remembers where it was loaded from.
  class Document < Nokogiri::HTML::Document
    # Path object built by Skyscraper::Path.factory for the loaded location.
    attr_accessor :path

    # Opens +path+, parses its content and returns the document with #path set.
    #
    # path     - location handed to Kernel#open.
    # encoding - encoding passed to the parser (default 'utf-8').
    #
    # NOTE(review): Kernel#open treats a path beginning with "|" as a command
    # to run, so +path+ should not come from untrusted input. Opening URLs
    # through Kernel#open also depends on open-uri's patch, which newer Ruby
    # versions no longer provide -- confirm the supported Ruby range.
    def self.load(path, encoding = 'utf-8')
      # Block form so the underlying file/connection is closed once parsing
      # is done; the IO used to be left open.
      document = open(path) do |io|
        Skyscraper::Document.parse io, nil, encoding
      end
      document.path = Skyscraper::Path.factory(path)
      document
    end
  end
end
@@ -0,0 +1,24 @@
1
module Skyscraper
  # Describes one value to extract from a document: which node to look up and
  # how to turn that node into a value.
  class Field
    attr_accessor :name, :selector, :callback, :attribute, :value

    # options - hash with the keys :name, :selector, :attribute and :callback.
    def initialize(options = {})
      @name = options[:name]
      @selector = options[:selector]
      @attribute = options[:attribute]
      @callback = options[:callback]
    end

    # Looks up the first node matching the selector and computes the value.
    #
    # Precedence: the callback (called with the node) wins over the attribute
    # (called as a method on the node); with neither, the node's text is used.
    #
    # document - object responding to #first(selector).
    #
    # Stores the result in #value and returns it.
    def find_in_document(document)
      first_node = document.first(@selector)

      @value =
        if @callback
          @callback.call(first_node)
        elsif @attribute
          # public_send, not send: names such as :loop, :open or :select are
          # private Kernel methods, and send would run those instead of
          # letting the node resolve the name itself.
          first_node.public_send(@attribute)
        else
          first_node.text
        end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Skyscraper
  # Namespace for the node classes; Base and Resource are registered for
  # autoloading through ActiveSupport::Autoload.
  module Node
    extend ActiveSupport::Autoload

    [:Base, :Resource].each { |const_name| autoload const_name }
  end
end
@@ -0,0 +1,103 @@
1
module Skyscraper
  module Node
    # Wraps a parsed element and adds traversal helpers. Any message that is
    # not defined here is answered with the element's attribute of that name.
    class Base
      # #class is redefined below to return the "class" attribute, so the
      # real class stays reachable under this name.
      alias :original_class :class

      attr_accessor :element

      # element - the wrapped element.
      def initialize(element)
        @element = element
      end

      # Returns the first node matching the CSS selector, or nil.
      def first(selector)
        find(selector).first
      end

      # Returns an array with one node per element matching the CSS selector.
      def find(selector)
        @element.css(selector).map { |match| Base.new(match) }
      end

      # Returns the direct child elements, optionally limited to those
      # matching +selector+.
      def children(selector = nil)
        candidates = selector ? @element.css(selector) : @element.children

        # css also returns deeper descendants and children includes non-element
        # nodes, so keep only element nodes whose parent is this element.
        direct = candidates.select do |candidate|
          candidate.parent == @element && candidate.is_a?(Nokogiri::XML::Element)
        end

        direct.map { |child| Base.new(child) }
      end

      # Returns the parent node, or nil when the parent is not an element.
      def parent
        Base.new @element.parent if @element.parent.is_a? Nokogiri::XML::Element
      end

      # True when #parent returns a node.
      def have_parent?
        !parent.nil?
      end

      # Returns all ancestors, nearest first, optionally limited to those
      # whose element matches +selector+.
      def parents(selector = nil)
        node = self
        parents = []

        while node.have_parent?
          node = node.parent
          parents << node
        end

        parents.select! { |item| item.element.matches? selector } if selector

        parents
      end

      # Returns the other child elements of this node's parent. A node without
      # an element parent has no siblings, so the result is an empty array
      # (this used to raise NoMethodError on nil).
      def siblings
        return [] unless have_parent?

        parent.children.reject { |node| node.element == element }
      end

      # Fetches the document this node's href points to and returns its root
      # node; returns nil when the node has no href.
      def follow
        # A missing attribute reads as "", which is truthy in Ruby, so the
        # emptiness has to be checked explicitly.
        return if href.empty?

        Skyscraper::fetch(uri)
      end

      # Returns the HTML of the element's children.
      def html
        @element.children.to_html
      end

      # Returns the element's "class" attribute as a string. Use
      # #original_class for the Ruby class.
      def class
        @element.attribute("class").to_s
      end

      # Downloads the resource through Node::Resource and returns what
      # Resource#download returns.
      def download(options = {})
        Resource.new(self).download(options)
      end

      # Returns the href resolved by the document path's #full_path_for.
      def uri
        @element.document.path.full_path_for(href)
      end

      # Answers unknown messages with the attribute of the same name, as a
      # string ("" when the attribute is absent). Attribute readers take no
      # arguments, so a call with arguments is a genuine NoMethodError.
      #
      # NOTE(review): respond_to_missing? is deliberately not defined. Every
      # name would qualify, and code that probes respond_to? (for example
      # ActiveSupport's blank?, which checks for :empty?) would then treat
      # nodes as empty -- confirm before adding one.
      def method_missing(name, *args)
        return super unless args.empty?

        @element.attribute(name.to_s).to_s
      end

      # Returns the element's content with surrounding whitespace removed.
      def text
        @element.content.to_s.strip
      end

      # Returns the element's tag name.
      def tag
        @element.name
      end
    end
  end
end