skyscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
@@ -0,0 +1,57 @@
1
require "fileutils"

module Skyscraper
  module Node
    # Wraps a node that references an external resource through its +href+ or
    # +src+ attribute and copies that resource into a local directory.
    class Resource
      # node - object responding to +href+, +src+ and +element+; the element's
      #        document path is used to resolve the reference.
      #
      # Raises ArgumentError when the node has neither an href nor a src.
      def initialize(node)
        @node = node
        @path = extract_path_from_node(@node)
      end

      # Copies the referenced resource into the target directory.
      #
      # options - Hash:
      #           :file_name - value substituted for the ":file_name" placeholder
      #                        (defaults to the file name of the resource path).
      #           :path      - target directory, may contain the ":file_name" and
      #                        ":sequence" placeholders (defaults to
      #                        Skyscraper.config.download_path).
      #
      # Returns the target path with all placeholders substituted.
      def download(options = {})
        @name = options[:file_name] || @path.file_name
        @new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path)
        # NOTE(review): relies on Kernel#open; for remote paths this presumably
        # depends on open-uri being loaded elsewhere - confirm for the Ruby
        # versions that have to be supported.
        @temp_file = open(@path.full_path)

        begin
          copy @temp_file.path, @new_file_path
        ensure
          # Release the handle even when the copy fails.
          @temp_file.close unless @temp_file.closed?
        end

        @new_file_path
      end

      private

      # Copies the file +from+ into the directory +to+, creating the directory
      # first when it does not exist yet. FileUtils is used instead of a shell
      # command so that names taken from scraped pages are never interpreted by
      # a shell (spaces, quotes, ";" ...).
      def copy(from, to)
        # expand_path keeps "~" working, which the shell used to expand.
        target = File.expand_path(to)
        create_path_if_not_exists target
        FileUtils.cp from, target
      end

      # Creates +path+ (including missing parents) unless it already is a directory.
      def create_path_if_not_exists(path)
        FileUtils.mkdir_p(path) unless File.directory?(path)
      end

      # Returns a copy of +path+ with ":file_name" replaced by the current file
      # name and ":sequence" replaced by the next free sequence number.
      def replace_path_variables(path)
        with_name = path.gsub(/:file_name/, @name)
        with_name.gsub(/:sequence/, get_sequence_number_for(with_name))
      end

      # Returns, as a String, the next sequence number for the directory that
      # precedes ":sequence" in +path+: one more than the highest purely numeric
      # entry, or "1" when the directory is missing or has no numeric entries.
      def get_sequence_number_for(path)
        parent = path.split(":sequence")[0]
        return "1" unless File.directory?(parent)

        # Compare numerically: as strings "9" would sort after "10".
        numbers = Dir.entries(parent).select { |entry| entry =~ /\A\d+\z/ }.map(&:to_i)
        (numbers.max.to_i + 1).to_s
      end

      # Resolves the node's href (or, when that is blank, its src) against the
      # path of the document the node belongs to.
      #
      # Raises ArgumentError when neither attribute is available. The original
      # `throw` also surfaced as an ArgumentError (uncaught throw), so callers
      # rescuing that class keep working.
      def extract_path_from_node(node)
        href_or_src = node.href.present? ? node.href : node.src
        raise ArgumentError, "no href no src" unless href_or_src

        node.element.document.path.path_for(href_or_src)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require "open-uri"
2
+
3
module Skyscraper
  # Flat, ordered list of pages with an internal cursor.
  class Pages
    attr_accessor :items

    # value - a single page or a (possibly nested) Array of pages.
    # block - optional; when given it is called with the Skyscraper module and
    #         its result is used instead of +value+.
    def initialize(value = nil, &block)
      set(value, &block)
    end

    # Replaces the page list and rewinds the cursor.
    #
    # The pages are flattened into a new Array, so an Array passed in by the
    # caller is never modified.
    #
    # Returns self.
    def set(value = nil, &block)
      source = block ? block.call(Skyscraper) : value
      # Wrapping before flattening handles scalars and nested arrays alike
      # without touching +source+ itself.
      @items = [source].flatten
      reset
      self
    end

    # Advances the cursor and returns the page at the new position, or nil once
    # the list is exhausted.
    def next
      @items[@current += 1]
    end

    # Rewinds the cursor so that the following #next returns the first page.
    def reset
      @current = -1
    end
  end
end
@@ -0,0 +1,29 @@
1
module Skyscraper
  # Namespace of the path classes plus helpers that decide which class
  # represents a given location.
  module Path
    extend ActiveSupport::Autoload

    autoload :Base
    autoload :Local
    autoload :Remote

    class << self
      # Builds a Path::Remote for locations that look remote (see .remote?) and
      # a Path::Local for everything else.
      def factory(path)
        if remote?(path)
          Path::Remote.new(path)
        else
          Path::Local.new(path)
        end
      end

      # Returns true when +path+, read as an http(s) URL, names a host.
      #
      # Locations without an explicit http/https scheme are prefixed with
      # "http://" before parsing, so "example.com/page" counts as remote while
      # "/home/user/page.html" does not.
      def remote?(path)
        candidate = path =~ %r{\A(http|https)://} ? path : "http://#{path}"
        host = URI.parse(candidate).host
        # "http:///abs/path" may be reported with an empty-string host rather
        # than nil; an empty string is truthy, so test for emptiness explicitly.
        !(host.nil? || host.empty?)
      end

      # Returns true when +path+ starts with "/".
      def absolute?(path)
        path.start_with?("/")
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Skyscraper
  module Path
    # Behaviour shared by the concrete path classes.
    class Base
      # Resolves +path+ through #full_path_for (called on self) and wraps the
      # result in a path object built by Path.factory.
      def path_for(path)
        resolved = full_path_for(path)
        Path.factory(resolved)
      end

      private

      # Returns the segment after the last "/" of +path+, or nil when +path+
      # ends with "/".
      def get_file_name(path)
        return nil if path.end_with?("/")

        path.split("/").last
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module Skyscraper
  module Path
    # Path of a file on the local file system.
    class Local < Path::Base
      attr_accessor :base, :folder, :file_name, :full_path

      # path - location of the file, e.g. "/var/www/index.html".
      def initialize(path)
        @folder    = get_folder(path)
        @full_path = path
        @file_name = get_file_name(path)
        @base      = @folder
      end

      # Returns +href+ unchanged when it is absolute, otherwise +href+ appended
      # to the folder of this path.
      def full_path_for(href)
        Path.absolute?(href) ? href : "#{@folder}#{href}"
      end

      def to_s
        full_path
      end

      private

      # Returns everything up to and including the last "/" of +path+, or an
      # empty String when +path+ contains no "/".
      #
      # Unlike a regexp match this also works for paths such as "/file.html"
      # (which used to raise NoMethodError on nil) and keeps the leading
      # segment of relative paths.
      def get_folder(path)
        last_slash = path.rindex("/")
        last_slash ? path[0..last_slash] : ""
      end
    end
  end
end
+
@@ -0,0 +1,32 @@
1
module Skyscraper
  module Path
    # Path of a resource addressed by a URI.
    class Remote < Path::Base
      attr_accessor :uri, :base, :domain, :full_path, :path, :query, :file_name

      # path - URI String. It is split into:
      #        domain    - scheme, host and (when it is not the scheme's default)
      #                    port, e.g. "http://localhost:3000"
      #        path      - path component of the URI
      #        query     - query string including the leading "?", or nil
      #        full_path - domain + path + query
      #        base      - domain followed by "/"
      #        file_name - last segment of the path, nil when it ends with "/"
      def initialize(path)
        uri = URI.parse(path)

        authority = uri.host
        # Keep an explicit, non-default port; dropping it would make the
        # rebuilt full_path point at a different server.
        if authority && uri.port && uri.port != uri.default_port
          authority = "#{authority}:#{uri.port}"
        end

        @domain    = uri.scheme ? "#{uri.scheme}://#{authority}" : authority
        @path      = uri.path
        @query     = "?" + uri.query if uri.query
        @full_path = "#{@domain}#{@path}#{@query}"
        @base      = "#{@domain}/"
        @uri       = uri
        @file_name = get_file_name(@path)
      end

      # Resolves +href+ against this URI and returns the result as a String.
      def full_path_for(href)
        @uri.merge(URI.parse(href)).to_s
      end

      def to_s
        full_path
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
+ module Skyscraper
2
+ class Results
3
+ attr_accessor :limit, :delay, :after_each, :after_all, :records
4
+
5
+ def initialize base, options = {}
6
+ @delay = extract_delay_hash(options[:delay] || base.config.delay)
7
+ @limit = options[:limit] || base.config.limit
8
+
9
+ @base = base
10
+
11
+ @after_each = options[:after_each] || []
12
+ @after_all = options[:after_all] || []
13
+
14
+ @records = []
15
+ end
16
+
17
+ def add_after_each &block
18
+ @after_each << block
19
+ end
20
+
21
+ def add_after_all &block
22
+ @after_all << block
23
+ end
24
+
25
+ def fetch continue = false
26
+ results = []
27
+ documents = []
28
+
29
+ @base.pages_object.reset unless continue
30
+
31
+ i = 0
32
+
33
+ while i != @limit and page = @base.pages_object.next
34
+ result = {}
35
+
36
+ begin
37
+ document = Skyscraper::fetch(page)
38
+
39
+ @base.fields.each do |field|
40
+ result[field.name] = field.find_in_document document
41
+ end
42
+
43
+ call_callbacks @after_each, result, document
44
+ results << result
45
+ sleep @delay[:sleep] if (i+1) % @delay[:after] == 0
46
+
47
+ rescue SocketError, Errno::ENOENT
48
+ warning_msg = "WARNGIN: resource '#{page}' not found!"
49
+ puts warning_msg if @base.config.noise_errors
50
+ raise NoResourceException, warning_msg unless @base.config.skip_on_error
51
+ end
52
+
53
+ i += 1
54
+ end
55
+
56
+ call_callbacks @after_all, results
57
+
58
+ @records += results
59
+ results
60
+ end
61
+
62
+ def continue
63
+ fetch true
64
+ end
65
+
66
+ private
67
+
68
+ def extract_delay_hash delay_hash
69
+ delay = {}
70
+
71
+ if delay_hash and delay_hash.is_a? Hash
72
+ delay = delay_hash
73
+ elsif delay_hash
74
+ delay[:sleep] = delay_hash
75
+ delay[:after] = 1
76
+ else
77
+ delay[:sleep] = 0
78
+ delay[:after] = 1
79
+ end
80
+
81
+ delay
82
+ end
83
+
84
+ def call_callbacks callbacks, *args
85
+ callbacks.each do |callback|
86
+ callback.call(*args)
87
+ end
88
+ end
89
+ end
90
+
91
+ class NoResourceException < Exception
92
+ end
93
+ end
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
module Skyscraper
  # Version of the gem; the gemspec reads it as Skyscraper::VERSION.
  VERSION = '0.0.1'
end
@@ -0,0 +1,22 @@
1
# -*- encoding: utf-8 -*-
# Gem specification of skyscraper. The version comes from lib/version.rb.
require File.expand_path('../lib/./version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["Adam Dratwinski"]
  gem.email         = ["arboooz@gmail.com"]
  gem.description   = %q{Library that helps scraping data from websites in easy way}
  gem.summary       = %q{Library that helps scraping data from websites in easy way}
  gem.homepage      = "https://github.com/boooz/skyscraper"

  # One file per line of `git ls-files`. Splitting on "\n" (instead of `$\`,
  # the output record separator, which is nil by default and makes split fall
  # back to any whitespace) keeps file names that contain spaces intact.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(spec)/})
  gem.name          = "skyscraper"
  gem.require_paths = ["lib"]
  gem.version       = Skyscraper::VERSION

  gem.add_development_dependency "rspec"
  gem.add_development_dependency "rake"
  gem.add_dependency "nokogiri"
  gem.add_dependency "actionpack"
end
@@ -0,0 +1,83 @@
1
# Specs for Skyscraper::Base: page and field registration, per-instance
# configuration, callbacks and fetching.
describe Skyscraper::Base do

  it "should set pages" do
    base = Skyscraper::Base.new
    base.pages "http://onet.pl"
    # The bare `is_a?` call discarded its result, so nothing was asserted.
    base.pages_object.should be_a(Skyscraper::Pages)
  end

  it "should have defaults for config" do
    Skyscraper.config.encoding = "utf-8"
    base = Skyscraper::Base.new
    base.config.encoding.should == "utf-8"
  end

  it "should be able to have different config for each instance" do
    base_a = Skyscraper::Base.new
    base_a.config.bar = "foo"
    base_a.settings foo: "biz"

    base_b = Skyscraper::Base.new
    base_b.config.bar = "biz"
    base_b.settings foo: "bar"

    base_a.config.bar.should == "foo"
    base_a.config.foo.should == "biz"
    base_b.config.bar.should == "biz"
    base_b.config.foo.should == "bar"
  end

  it "should add fields" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :other, ".selector"
    base.fields.length.should == 2
  end

  it "should override fields with the same name" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :name, ".selector"
    base.fields.length.should == 1
  end

  it "should add after each callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_each { |result, page| result[:h1] += "2" }
    base.fetch[0][:h1].should == "Hello world2"
  end

  it "should add after all callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_all { |results| results << "2" }
    base.fetch.length.should == 2
  end

  it "should set settings" do
    base = Skyscraper::Base.new
    base.settings limit: 100, delay: { sleep: 2, after: 10 }
    base.config.limit.should == 100
    base.config.delay[:sleep].should == 2
    base.config.delay[:after].should == 10
  end

  it "should fetch data" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.fetch[0][:h1].should == "Hello world"
  end

  it "should be able to continue fetching" do
    # NOTE(review): this changes the global config and does not restore it, so
    # later examples may see limit == 10 - confirm that this is acceptable.
    Skyscraper.config.limit = 10
    base = Skyscraper::Base.new
    base.pages [path_to("skyscraper-base.html")] * 12
    base.field :h1, "h1"
    base.fetch.length.should == 10
    base.continue.length.should == 2
  end
end
@@ -0,0 +1,25 @@
1
# Specs for Skyscraper::Config: values passed to the constructor and values
# assigned afterwards are readable through methods named after their keys.
describe Skyscraper::Config do
  # A fresh config per example, created lazily on first use.
  let(:config) { Skyscraper::Config.new foo: "bar" }

  it "should set variable on initialize" do
    config.foo.should == "bar"
  end

  it "should set dynamic variable" do
    config.foo.should == "bar"
    config.bar = "foo"
    config.bar.should == "foo"
  end

  it "should override variable value" do
    config.foo = "bizz"
    config.foo.should == "bizz"
  end

  it "should override false value" do
    flagged = Skyscraper::Config.new foo: true
    flagged.foo = false
    flagged.foo.should == false
  end
end
@@ -0,0 +1,14 @@
1
#encoding: utf-8

# Specs for Skyscraper::Document.load: encoding of remote pages and the path
# object attached to a loaded document.
describe Skyscraper::Document do
  # NOTE(review): this example loads a page from www.sjp.pl, so it needs
  # network access and depends on that page's current markup.
  it "should support utf-8 encoding by default in remote pages" do
    document = Skyscraper::Document::load("http://www.sjp.pl/grzegrz%F3%B3ka")
    document.encoding.should == "utf-8"
    # The comparison used to be a bare `==` whose result was discarded.
    document.css(".lc").first.content.strip.should == "Grzegrzółka"
  end

  it "should have path" do
    document = Skyscraper::Document::load(path_to("skyscraper-document.html"))
    document.path.should be_an Skyscraper::Path::Base
  end
end