skyscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/.gitignore +17 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +22 -0
  5. data/README.md +180 -0
  6. data/Rakefile +5 -0
  7. data/lib/skyscraper.rb +56 -0
  8. data/lib/skyscraper/base.rb +44 -0
  9. data/lib/skyscraper/config.rb +15 -0
  10. data/lib/skyscraper/document.rb +11 -0
  11. data/lib/skyscraper/field.rb +24 -0
  12. data/lib/skyscraper/node.rb +8 -0
  13. data/lib/skyscraper/node/base.rb +103 -0
  14. data/lib/skyscraper/node/resource.rb +57 -0
  15. data/lib/skyscraper/pages.rb +27 -0
  16. data/lib/skyscraper/path.rb +29 -0
  17. data/lib/skyscraper/path/base.rb +15 -0
  18. data/lib/skyscraper/path/local.rb +29 -0
  19. data/lib/skyscraper/path/remote.rb +32 -0
  20. data/lib/skyscraper/results.rb +93 -0
  21. data/lib/version.rb +3 -0
  22. data/skyscraper.gemspec +22 -0
  23. data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
  24. data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
  25. data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
  26. data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
  27. data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
  28. data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
  29. data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
  30. data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
  31. data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
  32. data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
  33. data/spec/skyscraper/skyscraper_spec.rb +39 -0
  34. data/spec/spec_helper.rb +3 -0
  35. data/spec/support/skyscraper_helpers.rb +9 -0
  36. data/spec/test_files/encoding.html~ +12 -0
  37. data/spec/test_files/skyscraper-base.html +30 -0
  38. data/spec/test_files/skyscraper-document.html +30 -0
  39. data/spec/test_files/skyscraper-encoding.html +12 -0
  40. data/spec/test_files/skyscraper-fetch-2.html +11 -0
  41. data/spec/test_files/skyscraper-fetch.html +31 -0
  42. data/spec/test_files/skyscraper-field.html +30 -0
  43. data/spec/test_files/skyscraper-node-base-a.html +11 -0
  44. data/spec/test_files/skyscraper-node-base-b.html +10 -0
  45. data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
  46. data/spec/test_files/skyscraper-node-base.html +30 -0
  47. data/spec/test_files/skyscraper-node-resource-b.html +10 -0
  48. data/spec/test_files/skyscraper-node-resource-image.png +0 -0
  49. data/spec/test_files/skyscraper-node-resource.html +12 -0
  50. data/spec/test_files/skyscraper-pages.html +30 -0
  51. data/spec/test_files/skyscraper.html +30 -0
  52. metadata +169 -0
@@ -0,0 +1,57 @@
1
require "fileutils"

module Skyscraper
  module Node
    # Downloads the resource that a node points at through its href or src
    # attribute and copies it to a local destination.
    class Resource
      # node - object exposing #href, #src and #element (the element's
      #        document must expose #path).
      #
      # Raises ArgumentError when the node has neither an href nor a src.
      def initialize node
        @node = node
        @path = extract_path_from_node(@node)
      end

      # Opens the resource and copies it to the destination.
      #
      # options - Hash:
      #           :file_name - used instead of the name taken from the
      #                        resource path
      #           :path      - destination template; defaults to
      #                        Skyscraper.config.download_path. The
      #                        ":file_name" and ":sequence" placeholders are
      #                        substituted.
      #
      # Returns the destination path with the placeholders substituted.
      def download options = {}
        @name          = options[:file_name] || @path.file_name
        @new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path)

        # NOTE(review): Kernel#open is presumably extended by open-uri (loaded
        # elsewhere in the gem) so that remote URLs work; the returned handle
        # must respond to #path, which a StringIO does not -- confirm.
        @temp_file = open(@path.full_path)

        begin
          copy @temp_file.path, @new_file_path
        ensure
          # The handle is only needed while copying; do not leak it.
          @temp_file.close unless @temp_file.closed?
        end

        @new_file_path
      end

      private

      # Copies the file at +from+ to +to+. When +to+ does not exist yet it is
      # created as a directory, so the file lands inside it.
      # FileUtils is used instead of a shell so that paths containing spaces
      # or shell metacharacters are handled verbatim.
      def copy from, to
        create_path_if_not_exists to
        FileUtils.cp from, to
      end

      # Creates +path+ (and any missing parents) as a directory unless
      # something already exists at that location.
      def create_path_if_not_exists path
        FileUtils.mkdir_p path unless File.exist?(path)
      end

      # Returns a copy of +path+ with ":file_name" and ":sequence" replaced.
      # Block replacements are used so that backslashes in the file name are
      # inserted literally instead of being read as back-references.
      def replace_path_variables path
        with_name = path.gsub(/:file_name/) { @name }
        sequence  = get_sequence_number_for(with_name)
        with_name.gsub(/:sequence/) { sequence }
      end

      # Returns, as a String, the next free number for the directory that
      # precedes ":sequence" in +path+: one more than the highest all-digit
      # entry, or "1" when the directory is missing or has no such entries.
      def get_sequence_number_for path
        parent = path.split(":sequence").first.to_s
        return "1" unless File.directory?(parent)

        # Compare numerically; a string sort would rank "9" above "10".
        highest = Dir.entries(parent).grep(/\A\d+\z/).map(&:to_i).max || 0
        (highest + 1).to_s
      end

      # Resolves the node's href (preferred) or src against the path of the
      # document the node belongs to.
      #
      # Raises ArgumentError when neither attribute is available.
      def extract_path_from_node node
        href_or_src = node.href.present? ? node.href : node.src
        raise ArgumentError, "no href no src" unless href_or_src

        node.element.document.path.path_for(href_or_src)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
require "open-uri"

module Skyscraper
  # Ordered list of pages with an internal cursor that #next advances.
  class Pages
    attr_accessor :items

    # Accepts either a value (single item or array) or a block producing one;
    # see #set.
    def initialize value = nil, &block
      set value, &block
    end

    # Replaces the stored items and rewinds the cursor.
    #
    # When a block is given it is called with the Skyscraper module and its
    # result is used; otherwise +value+ is used. A non-array is wrapped in an
    # array, and an array is flattened in place.
    #
    # Returns self.
    def set value = nil, &block
      source = if block
        block.call(Skyscraper)
      else
        value
      end

      @items = source.is_a?(Array) ? source : [source]
      @items.flatten!

      reset
      self
    end

    # Advances the cursor and returns the item at the new position
    # (nil once the cursor is past the last item).
    def next
      @current += 1
      @items[@current]
    end

    # Puts the cursor just before the first item.
    def reset
      @current = -1
    end
  end
end
@@ -0,0 +1,29 @@
1
module Skyscraper
  # Namespace for path objects; also picks the right path class for a string.
  module Path
    extend ActiveSupport::Autoload

    autoload :Base
    autoload :Local
    autoload :Remote

    class << self
      # Returns a Path::Remote when +path+ looks remote (see .remote?),
      # otherwise a Path::Local.
      def factory path
        remote?(path) ? Path::Remote.new(path) : Path::Local.new(path)
      end

      # Returns true when +path+, prefixed with "http://" if it carries no
      # http/https scheme of its own, parses to a URI with a host.
      def remote? path
        candidate = path =~ %r{\A(http|https)://} ? path : "http://#{path}"
        host = URI.parse(candidate).host

        # An empty string is truthy in Ruby, so check for it explicitly: a
        # URI parser that reports "" for a host-less URI (for instance
        # "http:///home/file.html") must not make a local path look remote.
        !(host.nil? || host.empty?)
      end

      # Returns true when +path+ begins with "/".
      def absolute? path
        path.start_with? "/"
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Skyscraper
  module Path
    # Shared behaviour for path classes. The object must also respond to
    # #full_path_for, which this class does not define itself.
    class Base
      # Resolves +path+ through #full_path_for and wraps the result in a new
      # path object built by Path.factory.
      def path_for path
        resolved = full_path_for(path)
        Path.factory(resolved)
      end

      private

      # Returns the segment after the last "/" in +path+, or nil when +path+
      # ends with "/" or has no segments at all.
      def get_file_name path
        return nil if path.end_with?("/")

        path.split("/").last
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module Skyscraper
  module Path
    # Path of a file on the local file system.
    class Local < Path::Base
      attr_accessor :base, :folder, :file_name, :full_path

      # path - String location of the file. The folder part (everything up to
      #        and including the last "/") is stored as both folder and base.
      def initialize path
        @folder    = get_folder(path)
        @full_path = path
        @file_name = get_file_name(path)
        @base      = @folder
      end

      # Returns +href+ unchanged when it is absolute, otherwise +href+
      # appended to this path's folder.
      def full_path_for href
        Path.absolute?(href) ? href : "#{@folder}#{href}"
      end

      def to_s
        full_path
      end

      private

      # Returns everything up to and including the last "/" of +path+, or an
      # empty string when +path+ contains no "/".
      #
      # Cutting at the last slash (rather than matching "/.../") keeps the
      # leading segment of relative paths and copes with files that sit
      # directly under "/" or have no directory part at all.
      def get_folder path
        last_slash = path.rindex("/")
        last_slash ? path[0..last_slash] : ""
      end
    end
  end
end
29
+
@@ -0,0 +1,32 @@
1
module Skyscraper
  module Path
    # Path of a resource addressed by a URI.
    class Remote < Path::Base
      attr_accessor :uri, :base, :domain, :full_path, :path, :query, :file_name

      # path - String URI. It is split into:
      #        domain    - "scheme://host", plus ":port" when the URI names
      #                    a port other than the scheme's default; just the
      #                    host when the URI has no scheme
      #        path      - the URI path
      #        query     - "?query", or nil when there is none
      #        full_path - domain + path + query
      #        base      - domain followed by "/"
      def initialize path
        uri = URI.parse(path)

        @domain    = domain_for(uri)
        @path      = uri.path
        @query     = "?" + uri.query if uri.query
        @full_path = "#{@domain}#{@path}#{@query}"
        @base      = "#{@domain}/"
        @uri       = uri
        @file_name = get_file_name(@path)
      end

      # Resolves +href+ against this URI and returns the result as a String.
      def full_path_for href
        @uri.merge(URI.parse(href)).to_s
      end

      def to_s
        full_path
      end

      private

      # Builds the scheme/host prefix for +uri+. An explicit non-default port
      # is kept; without it the rebuilt full_path would point at the
      # scheme's default port instead of the one the caller asked for.
      def domain_for uri
        scheme = uri.scheme
        return uri.host if scheme.nil? || scheme.empty?

        origin = "#{scheme}://#{uri.host}"
        origin << ":#{uri.port}" if uri.port && uri.port != uri.default_port
        origin
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
module Skyscraper
  # Walks the pages of a base object, extracts every configured field from
  # each fetched document and collects one result hash per page.
  class Results
    # Values used for any delay key the caller leaves out.
    DEFAULT_DELAY = { sleep: 0, after: 1 }.freeze

    attr_accessor :limit, :delay, :after_each, :after_all, :records

    # base    - object exposing #config, #pages_object and #fields.
    # options - Hash; each key falls back as noted:
    #           :delay      - number of seconds, or a Hash with :sleep and
    #                         :after; defaults to base.config.delay
    #           :limit      - maximum pages per fetch; defaults to
    #                         base.config.limit
    #           :after_each - Array of callables run with (result, document)
    #           :after_all  - Array of callables run with (results)
    def initialize base, options = {}
      @delay = extract_delay_hash(options[:delay] || base.config.delay)
      @limit = options[:limit] || base.config.limit

      @base = base

      @after_each = options[:after_each] || []
      @after_all  = options[:after_all] || []

      @records = []
    end

    # Registers a block to run after each page with (result, document).
    def add_after_each &block
      @after_each << block
    end

    # Registers a block to run once per fetch with the results array.
    def add_after_all &block
      @after_all << block
    end

    # Fetches pages until the limit is reached or the pages run out.
    #
    # continue - when false (default) the page cursor is reset first; when
    #            true fetching resumes where the previous call stopped.
    #
    # Returns the Array of result hashes from this call; they are also
    # appended to #records.
    # Raises NoResourceException when a page cannot be found and the base
    # config does not set skip_on_error.
    def fetch continue = false
      results = []

      @base.pages_object.reset unless continue

      i = 0

      while i != @limit && (page = @base.pages_object.next)
        result = {}

        begin
          document = Skyscraper::fetch(page)

          @base.fields.each do |field|
            result[field.name] = field.find_in_document document
          end

          call_callbacks @after_each, result, document
          results << result
          pause_after i + 1

        rescue SocketError, Errno::ENOENT
          warning_msg = "WARNING: resource '#{page}' not found!"
          puts warning_msg if @base.config.noise_errors
          raise NoResourceException, warning_msg unless @base.config.skip_on_error
        end

        # Pages that failed still count towards the limit.
        i += 1
      end

      call_callbacks @after_all, results

      @records += results
      results
    end

    # Fetches the next batch without resetting the page cursor.
    def continue
      fetch true
    end

    private

    # Normalises the delay setting into a Hash with :sleep and :after.
    # A Hash is completed with the defaults for missing keys, any other
    # truthy value becomes the :sleep duration, and nil/false yields the
    # defaults.
    def extract_delay_hash delay_hash
      case delay_hash
      when Hash       then DEFAULT_DELAY.merge(delay_hash)
      when nil, false then DEFAULT_DELAY.dup
      else                 DEFAULT_DELAY.merge(sleep: delay_hash)
      end
    end

    # Sleeps for the configured duration after every :after-th page.
    # The interval is clamped to at least 1 so a missing or zero :after
    # cannot raise from the modulo.
    def pause_after fetched_count
      every = @delay[:after].to_i
      every = 1 if every < 1

      duration = @delay[:sleep]
      sleep duration if duration && fetched_count % every == 0
    end

    # Invokes every callback with the given arguments, in order.
    def call_callbacks callbacks, *args
      callbacks.each do |callback|
        callback.call(*args)
      end
    end
  end

  # Raised by Results#fetch when a page cannot be found. Inherits from
  # StandardError so that a plain `rescue` can handle it.
  class NoResourceException < StandardError
  end
end
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
module Skyscraper
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,22 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for skyscraper. The version is read from lib/version.rb.
require File.expand_path('../lib/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["Adam Dratwinski"]
  gem.email         = ["arboooz@gmail.com"]
  gem.description   = %q{Library that helps scraping data from websites in easy way}
  gem.summary       = %q{Library that helps scraping data from websites in easy way}
  gem.homepage      = "https://github.com/boooz/skyscraper"

  # Split on newlines only: `$\` is nil by default, which makes String#split
  # break on any whitespace and mangles tracked file names containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(spec)/})
  gem.name          = "skyscraper"
  gem.require_paths = ["lib"]
  gem.version       = Skyscraper::VERSION

  gem.add_development_dependency "rspec"
  gem.add_development_dependency "rake"
  gem.add_dependency "nokogiri"
  gem.add_dependency "actionpack"
end
@@ -0,0 +1,83 @@
1
# Specs for Skyscraper::Base: page setup, per-instance config, field
# registration, callbacks and fetching.
describe Skyscraper::Base do

  it "should set pages" do
    base = Skyscraper::Base.new
    base.pages "http://onet.pl"
    # Without `.should` the type check is evaluated and thrown away, so the
    # example could never fail.
    base.pages_object.should be_a(Skyscraper::Pages)
  end

  it "should have defaults for config" do
    Skyscraper.config.encoding = "utf-8"
    base = Skyscraper::Base.new
    base.config.encoding.should == "utf-8"
  end

  it "should be able to have different config for each instance" do
    base_a = Skyscraper::Base.new
    base_a.config.bar = "foo"
    base_a.settings foo: "biz"

    base_b = Skyscraper::Base.new
    base_b.config.bar = "biz"
    base_b.settings foo: "bar"

    base_a.config.bar.should == "foo"
    base_a.config.foo.should == "biz"
    base_b.config.bar.should == "biz"
    base_b.config.foo.should == "bar"
  end

  it "should add fields" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :other, ".selector"
    base.fields.length.should == 2
  end

  it "should override fields with the same name" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :name, ".selector"
    base.fields.length.should == 1
  end

  it "should add after each callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_each { |result, page| result[:h1] += "2" }
    base.fetch[0][:h1].should == "Hello world2"
  end

  it "should add after all callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_all { |results| results << "2" }
    base.fetch.length.should == 2
  end

  it "should set settings" do
    base = Skyscraper::Base.new
    base.settings limit: 100, delay: { sleep: 2, after: 10 }
    base.config.limit.should == 100
    base.config.delay[:sleep].should == 2
    base.config.delay[:after].should == 10
  end

  it "should fetch data" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.fetch[0][:h1].should == "Hello world"
  end

  it "should be able to continue fetching" do
    # NOTE(review): this changes the global config and never restores it, so
    # later examples inherit limit = 10.
    Skyscraper.config.limit = 10
    base = Skyscraper::Base.new
    base.pages [path_to("skyscraper-base.html")] * 12
    base.field :h1, "h1"
    base.fetch.length.should == 10
    base.continue.length.should == 2
  end
end
@@ -0,0 +1,25 @@
1
# Specs for Skyscraper::Config: values given at construction and values
# assigned afterwards are readable through accessor-style methods.
describe Skyscraper::Config do
  it "should set variable on initialize" do
    settings = Skyscraper::Config.new(foo: "bar")

    settings.foo.should eq("bar")
  end

  it "should set dynamic variable" do
    settings = Skyscraper::Config.new(foo: "bar")
    settings.foo.should eq("bar")

    # A key that was not passed to the constructor can still be assigned.
    settings.bar = "foo"
    settings.bar.should eq("foo")
  end

  it "should override variable value" do
    settings = Skyscraper::Config.new(foo: "bar")
    settings.foo = "bizz"

    settings.foo.should eq("bizz")
  end

  it "should override false value" do
    settings = Skyscraper::Config.new(foo: true)
    settings.foo = false

    # false must be stored as a real value rather than read as "unset".
    settings.foo.should eq(false)
  end
end
@@ -0,0 +1,14 @@
1
#encoding: utf-8

# Specs for Skyscraper::Document loading from remote and local sources.
describe Skyscraper::Document do
  # NOTE(review): this example depends on a live third-party page.
  it "should support utf-8 encoding by default in remote pages" do
    document = Skyscraper::Document::load("http://www.sjp.pl/grzegrz%F3%B3ka")
    document.encoding.should == "utf-8"
    # The comparison needs `.should`; a bare `==` is evaluated and discarded,
    # so the content was never actually checked.
    document.css(".lc").first.content.strip.should == "Grzegrzółka"
  end

  it "should have path" do
    document = Skyscraper::Document::load(path_to("skyscraper-document.html"))
    document.path.should be_an Skyscraper::Path::Base
  end
end