skyscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in skyscraper.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Adam Dratwinski
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,180 @@
1
+ # Skyscraper
2
+
3
+ ## Installation
4
+
5
+ **Skyscraper** installation is simple, just run:
6
+
7
+ `gem install skyscraper`
8
+
9
+ or add the following entry to your Gemfile:
10
+
11
+ `gem "skyscraper"`
12
+
13
+ if you want to use it in your rails project.
14
+
15
+ ## Finding nodes by CSS Selectors
16
+
17
+ ```ruby
18
+ >> Skyscraper::fetch("http://rubyonrails.org").first("title").text
19
+ # => "Ruby on Rails"
20
+
21
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".copyright p").text
22
+ # => "\\"Rails\\", \\"Ruby on Rails\\", and the Rails logo are registered trademarks of David Heinemeier Hansson. All rights reserved."
23
+ ```
24
+
25
+ You can use this thanks to **Nokogiri#css** method.
26
+
27
+ ## Reading HTML attributes
28
+ ```ruby
29
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".announce").class
30
+ # => "announce"
31
+ >> Skyscraper::fetch("http://rubyonrails.org").first("img").height
32
+ # => "112"
33
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".copyright").style
34
+ # => "margin-top: 20px"
35
+
36
+ ```
37
+
38
+ > ## Notice!
39
+ >**Skyscraper::Node::Base#class** method is overridden; to access the original **class** method, please call Skyscraper::Node::Base#original_class
40
+
41
+ You can find list of all available methods in [Reading attributes Section](wiki/reading_attributes)
42
+
43
+ ## Using Skyscraper as included module
44
+
45
+ Fetching content from multiple pages and storing it in an Active Record database is a common problem. You can do this quickly by using **Skyscraper** as an included module.
46
+
47
+ ```ruby
48
+ class Sample
49
+ include Skyscraper
50
+
51
+ settings limit: 10, delay: { after: 5, time: 1 }, encoding: "utf-8"
52
+
53
+ pages ["http://google.com", "https://github.com", "http://rubyonrails.org"]
54
+ # pages method also accepts blocks as argument, then you can use Skyscraper::fetch method inside to get list of pages from website more dynamically
55
+
56
+ field :html, "html", :html
57
+ field :title, "title" do |node|
58
+ "'#{node.text}'"
59
+ end
60
+ field :first_link, "body" do |node|
61
+ "'#{node.first("a").href}'"
62
+ end
63
+ field :first_image, "img", :download
64
+
65
+ # field method takes following arguments:
66
+ # field_name => name that the record will have in the results table
67
+ # selector => css selector of fetching element, so it can even looks like "tag #id.some_class"
68
+ # optionally symbol with the node method or block, if nothing is provided, text method on the node is fired
69
+
70
+ after_each do |result|
71
+ page = Page.new
72
+ page.title = result[:title]
73
+ page.html = result[:html]
74
+ page.first_link = result[:first_link]
75
+ page.first_image_path = result[:first_image]
76
+ page.save
77
+ end
78
+
79
+ after_all do
80
+ puts "Job done"
81
+ end
82
+ end
83
+
84
+ Sample.new.fetch #this will run above code applying provided callbacks and returns array with results
85
+ ```
86
+ You will find more details in [Including section](wiki/Including).
87
+
88
+ ## Traversing
89
+
90
+ Traversing through **Skyscraper** nodes is very similar to the way **jQuery** provides.
91
+
92
+ ```ruby
93
+ >> Skyscraper::fetch("https://github.com").first(".top-nav").find("li").map(&:html)
94
+ # => ["<a href="https://github.com/plans">Signup and Pricing</a>", "<a href="https://github.com/explore">Explore GitHub</a>", "<a href="https://github.com/features">Features</a>", "<a href="https://github.com/blog">Blog</a>", "<a href="https://github.com/login">Login</a>"]
95
+ ```
96
+
97
+ Of course you can write the same code in the easier way:
98
+
99
+ ```ruby
100
+ >> Skyscraper::fetch("https://github.com").find(".top-nav li").map(&:html)
101
+ # => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
102
+ ```
103
+
104
+ or even:
105
+
106
+ ```ruby
107
+ >> Skyscraper::fetch("https://github.com").find(".top-nav li a").map(&:content)
108
+ # => ["<a href=\\"https://github.com/plans\\">Signup and Pricing</a>", "<a href=\\"https://github.com/explore\\">Explore GitHub</a>", "<a href=\\"https://github.com/features\\">Features</a>", "<a href=\\"https://github.com/blog\\">Blog</a>", "<a href=\\"https://github.com/login\\">Login</a>"]
109
+ ```
110
+
111
+ Read more about traversing in [Traversing section](wiki/traversing)
112
+
113
+ ## Following
114
+
115
+ You can quickly follow a node element if it has an **href** attribute:
116
+
117
+ ```ruby
118
+ >> Skyscraper::fetch("https://github.com").first(".top-nav li a").follow.first("title").html
119
+ # => "Plans &amp; Pricing · GitHub"
120
+ ```
121
+
122
+ This example visits the first menu item on the github.com page and then fetches its title.
123
+
124
+ ## Downloading
125
+
126
+ When a node element has a **src** or **href** attribute, you can easily download it:
127
+
128
+ ```ruby
129
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
130
+ # => "/tmp/skyscraper/1/rails.png"
131
+ ```
132
+
133
+ You can also provide the download path and a new file name in the arguments. The default path can also be set in the configuration.
134
+
135
+ ```ruby
136
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
137
+ # => "/tmp/test/1/rails.png"
138
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/:sequence/:file_name")
139
+ # => "/tmp/test/2/rails.png"
140
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download(path: "/tmp/test/my_file.png")
141
+ # => "/tmp/test/my_file.png"
142
+ >> Skyscraper.config.download_path = "/tmp/test/my_path_from_config/:file_name"
143
+ # => "/tmp/test/my_path_from_config/:file_name"
144
+ >> Skyscraper::fetch("http://rubyonrails.org").first(".message img").download
145
+ # => "/tmp/test/my_path_from_config/rails.png"
146
+ ```
147
+
148
+ \#download method returns path to saved file.
149
+
150
+ ## Configuration
151
+
152
+ Please visit [Configuration section](wiki/configuration) to get all details of **Skyscraper** configuration.
153
+
154
+ ## Testing
155
+
156
+ Please consider that you can fetch not only remote sites but also local files. This can be very helpful when you prefer TDD coding.
157
+
158
+ ## Other topics
159
+
160
+ * [Fetching from sites with large amount of pages](wiki/fetching_large_pages) - dealing with limits, delays and other stuff
161
+
162
+ ## Requirements
163
+
164
+ **Skyscraper** requires Ruby 1.9 or newer. It also depends on the Nokogiri, Open-Uri, Uri and Actionpack libraries.
165
+
166
+ ## What is being considered for addition?
167
+
168
+ * POST requests support
169
+ * Reattempt fetching on errors
170
+ * Redirects support
171
+ * Testing mode - downloading only small amount of records, and showing how they would look in database
172
+ * Ruby < 1.9 versions support
173
+ * Redis, ActiveRecord cache and storage
174
+ * Ruby on Rails generators
175
+
176
+ Please don't hesitate to post me a comment about above or other functionality that might be added.
177
+
178
+ ## Contributors
179
+
180
+ Here I will post a list of contributors who help to write documentation and create bug fixes.
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task default: :spec
data/lib/skyscraper.rb ADDED
@@ -0,0 +1,56 @@
1
+ require "open-uri"
2
+ require "uri"
3
+ require "nokogiri"
4
+ require "active_support/core_ext"
5
+
6
+ include ActiveSupport
7
+
# Top-level namespace of the gem.
#
# It can be used directly (Skyscraper.fetch / Skyscraper.config) or included
# into a class, in which case the class-level DSL calls (pages, field,
# settings, after_each, ...) are forwarded to a per-class Skyscraper::Base.
module Skyscraper
  extend ActiveSupport::Autoload
  extend ActiveSupport::Concern

  autoload :Base
  autoload :Config
  autoload :Document
  autoload :Field
  autoload :Node
  autoload :Pages
  autoload :Path
  autoload :Results

  # Library-wide default settings, readable and writable through
  # Skyscraper.defaults.
  mattr_accessor :defaults
  @@defaults = {
    delay: { sleep: 0, after: 1 },
    limit: nil,
    encoding: "utf-8",
    download_path: "/tmp/skyscraper/:sequence/:file_name",
    # reattempt_times: 1,
    noise_errors: true,
    skip_on_error: true
  }

  # Returns the memoized global configuration object.
  #
  # NOTE: the Config is built around the defaults hash itself (not a copy),
  # so values assigned through the returned object are also written into
  # Skyscraper.defaults.
  def self.config
    @config ||= Skyscraper::Config.new @@defaults
  end

  # Loads the document found at +path+ and wraps the result of its "html"
  # CSS query in a Node::Base.
  #
  # path     - location passed on to Skyscraper::Document.load.
  # encoding - encoding passed on to the loader; defaults to the configured
  #            encoding.
  def self.fetch(path, encoding = Skyscraper.config.encoding)
    document = Skyscraper::Document.load path, encoding
    Node::Base.new document.css("html")
  end

  # Runs the fetch defined by the class-level DSL of the including class.
  # The class-level +base+ helper is private, hence the +send+.
  def fetch
    self.class.send(:base).fetch
  end

  # Methods added to the class that includes Skyscraper.
  module ClassMethods
    # Forwards calls that belong to the public API of Skyscraper::Base to the
    # per-class base object; anything else goes through the regular
    # NoMethodError path instead of being blindly sent to the base object.
    def method_missing(method, *args, &block)
      return super unless Skyscraper::Base.public_method_defined?(method)

      base.public_send(method, *args, &block)
    end

    # Keeps respond_to? in agreement with method_missing. The check is done
    # against the Base class so that probing does not instantiate anything.
    def respond_to_missing?(method, include_private = false)
      Skyscraper::Base.public_method_defined?(method) || super
    end

    private

    # Lazily created Skyscraper::Base instance backing the class-level DSL.
    def base
      @base ||= Skyscraper::Base.new Skyscraper.config
    end
  end
end
@@ -0,0 +1,44 @@
module Skyscraper
  # Collects the scraping definition (pages, fields, callbacks, settings) and
  # delegates the actual work to a Results object.
  class Base
    attr_accessor :fields, :pages_object, :config, :results

    # config - configuration object to use; when nil, a new Config is built
    #          from a copy of Skyscraper.defaults.
    def initialize(config = nil)
      @config = (config || Skyscraper::Config.new(Skyscraper.defaults.dup))
      @fields = []
      @fetch_options = {}
      @pages_object = Pages.new
      @results = Results.new self
    end

    # Hands the page list (or a block producing it) over to the Pages object.
    def pages(options = {}, &block)
      @pages_object.set options, &block
    end

    # Registers a field, replacing any previously registered field that has
    # the same name.
    #
    # name      - key under which the value is stored.
    # selector  - CSS selector of the element to read.
    # attribute - optional node method to call on the matched element.
    # block     - optional callback receiving the matched node.
    #
    # Returns the list of registered fields.
    def field(name, selector, attribute = nil, &block)
      @fields.reject! { |existing| existing.name == name }
      # The attribute has to be passed on explicitly; without it the field
      # always falls back to the node text when no block is given.
      @fields << Field.new(name: name, selector: selector, attribute: attribute, callback: block)
    end

    # Registers a callback on the Results object via add_after_each.
    def after_each(&block)
      @results.add_after_each(&block)
    end

    # Registers a callback on the Results object via add_after_all.
    def after_all(&block)
      @results.add_after_all(&block)
    end

    # Copies every key/value pair of +options+ into the configuration by
    # calling the matching "key=" writer.
    def settings(options = {})
      options.each_pair do |key, val|
        @config.send "#{key}=", val
      end
    end

    # Delegates to Results#fetch.
    def fetch
      @results.fetch
    end

    # Delegates to Results#continue.
    def continue
      @results.continue
    end
  end
end
@@ -0,0 +1,15 @@
module Skyscraper
  # Thin wrapper around a settings hash that exposes every key as a reader
  # (config.limit) and a writer (config.limit = 10).
  class Config
    # settings - hash of initial settings. The hash is stored as given, so
    #            writers modify the caller's hash.
    def initialize(settings = {})
      @settings = settings
    end

    # Names ending in "=" store +value+ under the name without the "=";
    # any other name reads the setting and yields nil when it is not set.
    def method_missing(name, value = nil)
      key = name.to_s

      if key.end_with?("=")
        @settings.merge! key.delete("=").to_sym => value
      else
        @settings[name]
      end
    end

    # Reports writers and already-present settings as supported, so that
    # respond_to? agrees with what method_missing can meaningfully answer.
    def respond_to_missing?(name, include_private = false)
      key = name.to_s
      key.end_with?("=") || @settings.key?(key.to_sym) || super
    end
  end
end
@@ -0,0 +1,11 @@
module Skyscraper
  # Nokogiri HTML document that remembers where it was loaded from.
  class Document < Nokogiri::HTML::Document
    # Path object describing the document location (set by .load).
    attr_accessor :path

    # Opens +path+, parses its content with the given encoding and attaches
    # a Skyscraper::Path built from +path+ to the resulting document.
    #
    # path     - location handed to +open+ and to Skyscraper::Path.factory.
    # encoding - encoding name passed to the parser (default 'utf-8').
    #
    # Returns the parsed Skyscraper::Document.
    def self.load(path, encoding = 'utf-8')
      # NOTE(review): Kernel#open is used on the raw path. Depending on the
      # Ruby version it may treat a leading "|" as a command to run, and
      # remote URLs only work where open-uri hooks into Kernel#open, so
      # confirm before feeding untrusted paths.
      source = open(path)

      begin
        document = Skyscraper::Document.parse source, nil, encoding
      ensure
        # The original never released the handle returned by open.
        source.close if source.respond_to?(:close) && !source.closed?
      end

      document.path = Skyscraper::Path.factory(path)
      document
    end
  end
end
@@ -0,0 +1,24 @@
module Skyscraper
  # Describes a single value to extract from a document: which element to
  # look up and how to turn it into a value.
  class Field
    attr_accessor :name, :selector, :callback, :attribute, :value

    # options - hash with the keys :name, :selector, :attribute and
    #           :callback; missing keys leave the attribute nil.
    def initialize(options = {})
      @name = options[:name]
      @selector = options[:selector]
      @attribute = options[:attribute]
      @callback = options[:callback]
    end

    # Looks up the first node matching the selector in +document+ and stores
    # the extracted value in +value+.
    #
    # Precedence: the callback (called with the node) wins over the
    # attribute (called as a method on the node); with neither, the node
    # text is used.
    #
    # Returns the extracted value.
    def find_in_document(document)
      first_node = document.first(@selector)

      @value =
        if @callback
          @callback.call(first_node)
        elsif @attribute
          # public_send keeps attribute names such as :open or :select from
          # being routed to private Kernel methods of the same name.
          first_node.public_send(@attribute)
        else
          first_node.text
        end
    end
  end
end
@@ -0,0 +1,8 @@
module Skyscraper
  # Namespace for the node wrappers; its constants are loaded lazily through
  # ActiveSupport::Autoload.
  module Node
    extend ActiveSupport::Autoload

    [:Base, :Resource].each do |constant|
      autoload constant
    end
  end
end
@@ -0,0 +1,103 @@
module Skyscraper
  module Node
    # Wrapper around a parsed element offering jQuery-like traversal and
    # reading of HTML attributes as plain method calls (node.href).
    class Base
      # #class is redefined below to return the HTML class attribute, so the
      # original Object#class stays reachable under this name.
      alias :original_class :class

      attr_accessor :element

      # element - the wrapped object; it must answer the calls made in this
      #           class (css, children, parent, attribute, ...).
      def initialize(element)
        @element = element
      end

      # Returns the first descendant matching +selector+, or nil.
      def first(selector)
        find(selector).first
      end

      # Returns all descendants matching +selector+, wrapped in Base.
      def find(selector)
        @element.css(selector).map { |element| Base.new(element) }
      end

      # Returns the direct element children, optionally limited to those
      # matching +selector+.
      def children(selector = nil)
        candidates = selector ? @element.css(selector) : @element.children

        # css also yields deeper descendants and children also yields
        # non-element nodes, so both are filtered out here.
        direct = candidates.select do |element|
          element.parent == @element && element.is_a?(Nokogiri::XML::Element)
        end

        direct.map { |child| Base.new(child) }
      end

      # Returns the wrapped parent element, or nil when the parent is not an
      # element.
      def parent
        up = @element.parent
        Base.new(up) if up.is_a?(Nokogiri::XML::Element)
      end

      # True when #parent returns a node.
      def have_parent?
        !parent.nil?
      end

      # Returns all ancestors from the nearest one outwards, optionally
      # limited to those whose element matches +selector+.
      def parents(selector = nil)
        ancestors = []
        node = self

        while (node = node.parent)
          ancestors << node
        end

        ancestors.select! { |item| item.element.matches? selector } if selector
        ancestors
      end

      # Returns the other element children of this node's parent; an empty
      # array when the node has no parent.
      def siblings
        up = parent
        return [] unless up

        up.children.reject { |node| node.element == element }
      end

      # Fetches the document this node links to. Returns nil when the node
      # has no href value (a missing attribute reads as an empty string,
      # which is truthy in Ruby and therefore needs an explicit check).
      def follow
        Skyscraper.fetch(uri) unless self.href.empty?
      end

      # Inner HTML of the element.
      def html
        @element.children.to_html
      end

      # Value of the HTML class attribute (not the Ruby class; see
      # #original_class).
      def class
        @element.attribute("class").to_s
      end

      # Downloads the resource this node points to; +options+ are passed on
      # to Resource#download.
      def download(options = {})
        Resource.new(self).download(options)
      end

      # Full location of the href value, resolved through the path object of
      # the owning document.
      def uri
        @element.document.path.full_path_for(self.href)
      end

      # Reads any other zero-argument call as an HTML attribute lookup and
      # returns its value as a string ("" when the attribute is missing).
      # Calls with arguments are not attribute reads and raise NoMethodError.
      def method_missing(name, *args)
        return super unless args.empty?

        @element.attribute(name.to_s).to_s
      end

      # Only attributes actually present on the element are advertised, so
      # implicit conversions (to_ary in Array#flatten or puts, to_str, ...)
      # are not answered with attribute strings.
      def respond_to_missing?(name, include_private = false)
        !@element.attribute(name.to_s).nil? || super
      end

      # Text content of the element with surrounding whitespace removed.
      def text
        @element.content.to_s.strip
      end

      # Tag name of the element.
      def tag
        @element.name
      end
    end
  end
end