skyscraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +180 -0
- data/Rakefile +5 -0
- data/lib/skyscraper.rb +56 -0
- data/lib/skyscraper/base.rb +44 -0
- data/lib/skyscraper/config.rb +15 -0
- data/lib/skyscraper/document.rb +11 -0
- data/lib/skyscraper/field.rb +24 -0
- data/lib/skyscraper/node.rb +8 -0
- data/lib/skyscraper/node/base.rb +103 -0
- data/lib/skyscraper/node/resource.rb +57 -0
- data/lib/skyscraper/pages.rb +27 -0
- data/lib/skyscraper/path.rb +29 -0
- data/lib/skyscraper/path/base.rb +15 -0
- data/lib/skyscraper/path/local.rb +29 -0
- data/lib/skyscraper/path/remote.rb +32 -0
- data/lib/skyscraper/results.rb +93 -0
- data/lib/version.rb +3 -0
- data/skyscraper.gemspec +22 -0
- data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
- data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
- data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
- data/spec/skyscraper/skyscraper_spec.rb +39 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/support/skyscraper_helpers.rb +9 -0
- data/spec/test_files/encoding.html~ +12 -0
- data/spec/test_files/skyscraper-base.html +30 -0
- data/spec/test_files/skyscraper-document.html +30 -0
- data/spec/test_files/skyscraper-encoding.html +12 -0
- data/spec/test_files/skyscraper-fetch-2.html +11 -0
- data/spec/test_files/skyscraper-fetch.html +31 -0
- data/spec/test_files/skyscraper-field.html +30 -0
- data/spec/test_files/skyscraper-node-base-a.html +11 -0
- data/spec/test_files/skyscraper-node-base-b.html +10 -0
- data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
- data/spec/test_files/skyscraper-node-base.html +30 -0
- data/spec/test_files/skyscraper-node-resource-b.html +10 -0
- data/spec/test_files/skyscraper-node-resource-image.png +0 -0
- data/spec/test_files/skyscraper-node-resource.html +12 -0
- data/spec/test_files/skyscraper-pages.html +30 -0
- data/spec/test_files/skyscraper.html +30 -0
- metadata +169 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
require "fileutils"

module Skyscraper
  module Node
    # Downloads the file a node points at (through its +href+ or +src+
    # attribute) into a configurable destination path.
    class Resource
      # @param node [Object] a node exposing +href+, +src+ and
      #   +element.document.path+ (these are the only members used here).
      # @raise [ArgumentError] when the node has neither +href+ nor +src+.
      def initialize(node)
        @node = node
        @path = extract_path_from_node(@node)
      end

      # Fetches the resource and copies it to the destination.
      #
      # The destination template may contain +:file_name+ and +:sequence+
      # placeholders. The expanded destination is created as a directory and
      # the fetched file is copied into it.
      #
      # @param options [Hash] +:file_name+ overrides the name taken from the
      #   resource path; +:path+ overrides +Skyscraper.config.download_path+.
      # @return [String] the expanded destination path.
      def download(options = {})
        @name = options[:file_name] || @path.file_name
        @new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path)

        # NOTE(review): Kernel#open only fetches URLs while open-uri patches it
        # (removed in Ruby 3.0) - confirm the supported Ruby versions.
        @temp_file = open(@path.full_path)

        begin
          copy @temp_file.path, @new_file_path
        ensure
          # Release the handle instead of leaving it for the garbage collector.
          @temp_file.close if @temp_file.respond_to?(:close)
        end

        @new_file_path
      end

      private

      # Copies +from+ into +to+, creating +to+ first when it is missing.
      # FileUtils is used instead of a shell command so paths containing
      # spaces or shell metacharacters are handled verbatim.
      def copy(from, to)
        create_path_if_not_exists to
        FileUtils.cp from, to
      end

      # Creates +path+ (and any missing parents) as a directory.
      def create_path_if_not_exists(path)
        FileUtils.mkdir_p(path) unless File.directory?(path)
      end

      # Expands the +:file_name+ and +:sequence+ placeholders in a copy of
      # +path+; the argument itself is left untouched.
      def replace_path_variables(path)
        new_path = path.dup
        new_path.gsub!(/:file_name/, @name)
        new_path.gsub!(/:sequence/, get_sequence_number_for(new_path))
        new_path
      end

      # Returns the next free sequence number (as a String) for the directory
      # that precedes +:sequence+ in +path+, or "1" when that directory does
      # not exist yet.
      def get_sequence_number_for(path)
        directory = path.split(":sequence")[0]
        return "1" unless directory && File.directory?(directory)

        # Compare entries numerically: a string sort would rank "9" above "10"
        # and hand out a number that is already taken.
        highest = Dir.entries(directory).grep(/\A\d+\z/).map { |entry| entry.to_i }.max || 0
        (highest + 1).to_s
      end

      # Resolves the node's +href+ (preferred) or +src+ against the path of
      # the document the node belongs to.
      def extract_path_from_node(node)
        href_or_src = node.href.present? ? node.href : node.src

        # `raise` (not `throw`) is the construct for signalling errors;
        # ArgumentError stays compatible with the error the stray `throw`
        # used to surface.
        raise ArgumentError, "no href no src" unless href_or_src

        node.element.document.path.path_for(href_or_src)
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "open-uri"
|
2
|
+
|
3
|
+
module Skyscraper
  # A flat list of page locations with a cursor for walking through them.
  class Pages
    attr_accessor :items

    # @param value [Object, Array] a single page or a (possibly nested) list.
    # @yield [Skyscraper] optional block whose return value is used instead
    #   of +value+.
    def initialize(value = nil, &block)
      set(value, &block)
    end

    # Replaces the page list and rewinds the cursor.
    #
    # A non-Array value is wrapped in a one-element list; nested arrays are
    # flattened into a new array, so an array passed in by the caller is
    # never modified.
    #
    # @return [Pages] self
    def set(value = nil, &block)
      source = block ? block.call(Skyscraper) : value
      @items = (source.is_a?(Array) ? source : [source]).flatten
      reset
      self
    end

    # Advances the cursor and returns the page at the new position, or nil
    # once the cursor has moved past the last item.
    def next
      @items[@current += 1]
    end

    # Rewinds the cursor so the following #next returns the first page.
    def reset
      @current = -1
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Skyscraper
  # Namespace for path objects plus helpers that classify a location string.
  module Path
    extend ActiveSupport::Autoload

    autoload :Base
    autoload :Local
    autoload :Remote

    class << self
      # Builds a Path::Remote or Path::Local for +path+, depending on what
      # #remote? reports.
      def factory(path)
        remote?(path) ? Path::Remote.new(path) : Path::Local.new(path)
      end

      # Tells whether +path+ names a remote location, i.e. whether it parses
      # as a URI with a non-empty host once "http://" is assumed for
      # scheme-less input.
      #
      # @return [Boolean]
      # @raise [URI::InvalidURIError] only for input that carries an explicit
      #   http(s) scheme but is malformed.
      def remote?(path)
        has_scheme = !(path =~ /\A(http|https):\/\//).nil?

        begin
          host = URI.parse(has_scheme ? path : "http://#{path}").host
        rescue URI::InvalidURIError
          # An explicit URL that does not parse is a genuine error; a
          # scheme-less string that does not parse (e.g. a local file name
          # with spaces) simply is not a URL.
          raise if has_scheme
          return false
        end

        # Some URI versions report "" rather than nil for a missing host
        # ("http:///tmp/file"), so an empty host counts as "no host" too.
        !(host.nil? || host.empty?)
      end

      # Tells whether +path+ starts at the filesystem/URL root.
      def absolute?(path)
        path.start_with?("/")
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Skyscraper
  module Path
    # Path object for a file on the local filesystem.
    class Local < Path::Base
      attr_accessor :base, :folder, :file_name, :full_path

      # @param path [String] location of the file.
      def initialize(path)
        @folder    = get_folder(path)
        @full_path = path
        # get_file_name is not defined in this class - presumably inherited
        # from Path::Base.
        @file_name = get_file_name(path)
        @base      = @folder
      end

      # Resolves +href+ against this file's folder; an absolute +href+ is
      # returned unchanged.
      def full_path_for(href)
        Path.absolute?(href) ? href : "#{@folder}#{href}"
      end

      # @return [String] the full path this object was created with.
      def to_s
        full_path
      end

      private

      # Returns everything up to and including the last "/" of +path+, or an
      # empty string when +path+ has no directory part. Unlike a regexp
      # anchored on the first slash, this keeps the leading segment of
      # relative paths and does not fail on names such as "file.html".
      def get_folder(path)
        last_slash = path.rindex("/")
        last_slash ? path[0..last_slash] : ""
      end
    end
  end
end
|
29
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Skyscraper
  module Path
    # Path object for a resource addressed by URL.
    class Remote < Path::Base
      attr_accessor :uri, :base, :domain, :full_path, :path, :query, :file_name

      # Splits +path+ into its URL components.
      #
      # @param path [String] the URL to parse.
      # @raise [URI::InvalidURIError] when +path+ is not a valid URI.
      def initialize(path)
        uri = URI.parse(path)

        @domain    = domain_for(uri)
        @path      = uri.path
        @query     = "?" + uri.query if uri.query
        @full_path = "#{@domain}#{@path}#{@query}"
        @base      = "#{@domain}/"
        @uri       = uri
        # get_file_name is not defined in this class - presumably inherited
        # from Path::Base.
        @file_name = get_file_name(@path)
      end

      # Resolves +href+ (absolute or relative) against this URL.
      #
      # @return [String]
      def full_path_for(href)
        @uri.merge(URI.parse(href)).to_s
      end

      # @return [String] the normalized full URL.
      def to_s
        full_path
      end

      private

      # Builds "scheme://host[:port]" for +uri+, or just the host when no
      # scheme is present. The port is kept whenever it differs from the
      # scheme's default, so URLs such as http://localhost:3000/ keep
      # pointing at the right server.
      def domain_for(uri)
        scheme = uri.scheme
        return uri.host if scheme.nil? || scheme.empty?

        explicit_port = uri.port && uri.port != uri.default_port
        explicit_port ? "#{scheme}://#{uri.host}:#{uri.port}" : "#{scheme}://#{uri.host}"
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module Skyscraper
|
2
|
+
class Results
|
3
|
+
attr_accessor :limit, :delay, :after_each, :after_all, :records
|
4
|
+
|
5
|
+
def initialize base, options = {}
|
6
|
+
@delay = extract_delay_hash(options[:delay] || base.config.delay)
|
7
|
+
@limit = options[:limit] || base.config.limit
|
8
|
+
|
9
|
+
@base = base
|
10
|
+
|
11
|
+
@after_each = options[:after_each] || []
|
12
|
+
@after_all = options[:after_all] || []
|
13
|
+
|
14
|
+
@records = []
|
15
|
+
end
|
16
|
+
|
17
|
+
def add_after_each &block
|
18
|
+
@after_each << block
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_after_all &block
|
22
|
+
@after_all << block
|
23
|
+
end
|
24
|
+
|
25
|
+
def fetch continue = false
|
26
|
+
results = []
|
27
|
+
documents = []
|
28
|
+
|
29
|
+
@base.pages_object.reset unless continue
|
30
|
+
|
31
|
+
i = 0
|
32
|
+
|
33
|
+
while i != @limit and page = @base.pages_object.next
|
34
|
+
result = {}
|
35
|
+
|
36
|
+
begin
|
37
|
+
document = Skyscraper::fetch(page)
|
38
|
+
|
39
|
+
@base.fields.each do |field|
|
40
|
+
result[field.name] = field.find_in_document document
|
41
|
+
end
|
42
|
+
|
43
|
+
call_callbacks @after_each, result, document
|
44
|
+
results << result
|
45
|
+
sleep @delay[:sleep] if (i+1) % @delay[:after] == 0
|
46
|
+
|
47
|
+
rescue SocketError, Errno::ENOENT
|
48
|
+
warning_msg = "WARNGIN: resource '#{page}' not found!"
|
49
|
+
puts warning_msg if @base.config.noise_errors
|
50
|
+
raise NoResourceException, warning_msg unless @base.config.skip_on_error
|
51
|
+
end
|
52
|
+
|
53
|
+
i += 1
|
54
|
+
end
|
55
|
+
|
56
|
+
call_callbacks @after_all, results
|
57
|
+
|
58
|
+
@records += results
|
59
|
+
results
|
60
|
+
end
|
61
|
+
|
62
|
+
def continue
|
63
|
+
fetch true
|
64
|
+
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
def extract_delay_hash delay_hash
|
69
|
+
delay = {}
|
70
|
+
|
71
|
+
if delay_hash and delay_hash.is_a? Hash
|
72
|
+
delay = delay_hash
|
73
|
+
elsif delay_hash
|
74
|
+
delay[:sleep] = delay_hash
|
75
|
+
delay[:after] = 1
|
76
|
+
else
|
77
|
+
delay[:sleep] = 0
|
78
|
+
delay[:after] = 1
|
79
|
+
end
|
80
|
+
|
81
|
+
delay
|
82
|
+
end
|
83
|
+
|
84
|
+
def call_callbacks callbacks, *args
|
85
|
+
callbacks.each do |callback|
|
86
|
+
callback.call(*args)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
class NoResourceException < Exception
|
92
|
+
end
|
93
|
+
end
|
data/lib/version.rb
ADDED
data/skyscraper.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for skyscraper. The version constant is loaded from
# lib/version.rb next to this file.
require File.expand_path('../lib/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["Adam Dratwinski"]
  gem.email         = ["arboooz@gmail.com"]
  gem.description   = %q{Library that helps scraping data from websites in easy way}
  gem.summary       = %q{Library that helps scraping data from websites in easy way}
  gem.homepage      = "https://github.com/boooz/skyscraper"

  # Split on newlines explicitly: `$\` is nil by default, which makes
  # String#split break on any whitespace and mangles file names with spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(spec)/})
  gem.name          = "skyscraper"
  gem.require_paths = ["lib"]
  gem.version       = Skyscraper::VERSION

  gem.add_development_dependency "rspec"
  gem.add_development_dependency "rake"
  gem.add_dependency "nokogiri"
  gem.add_dependency "actionpack"
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# Specs for Skyscraper::Base: page/field/callback registration, per-instance
# configuration and fetching.
describe Skyscraper::Base do

  it "should set pages" do
    base = Skyscraper::Base.new
    base.pages "http://onet.pl"
    # A bare `is_a?` call asserts nothing; wrap it in an expectation so the
    # example can actually fail.
    base.pages_object.should be_a(Skyscraper::Pages)
  end

  it "should have defaults for config" do
    Skyscraper.config.encoding = "utf-8"
    base = Skyscraper::Base.new
    base.config.encoding.should == "utf-8"
  end

  it "should be able to have different config for each instance" do
    base_a = Skyscraper::Base.new
    base_a.config.bar = "foo"
    base_a.settings foo: "biz"

    base_b = Skyscraper::Base.new
    base_b.config.bar = "biz"
    base_b.settings foo: "bar"

    base_a.config.bar.should == "foo"
    base_a.config.foo.should == "biz"
    base_b.config.bar.should == "biz"
    base_b.config.foo.should == "bar"
  end

  it "should add fields" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :other, ".selector"
    base.fields.length.should == 2
  end

  it "should override fields with the same name" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :name, ".selector"
    base.fields.length.should == 1
  end

  it "should add after each callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_each { |result, page| result[:h1] += "2" }
    base.fetch[0][:h1].should == "Hello world2"
  end

  it "should add after all callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_all { |results| results << "2" }
    base.fetch.length.should == 2
  end

  it "should set settings" do
    base = Skyscraper::Base.new
    base.settings limit: 100, delay: { sleep: 2, after: 10 }
    base.config.limit.should == 100
    base.config.delay[:sleep].should == 2
    base.config.delay[:after].should == 10
  end

  it "should fetch data" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.fetch[0][:h1].should == "Hello world"
  end

  it "should be able to continue fetching" do
    # NOTE(review): this changes the global configuration and is not restored
    # afterwards - later examples may observe limit == 10.
    Skyscraper.config.limit = 10
    base = Skyscraper::Base.new
    base.pages [path_to("skyscraper-base.html")] * 12
    base.field :h1, "h1"
    base.fetch.length.should == 10
    base.continue.length.should == 2
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Specs for Skyscraper::Config: values given at construction time and values
# assigned later must both be readable through dynamic accessors.
describe Skyscraper::Config do
  it "should set variable on initialize" do
    settings = Skyscraper::Config.new(foo: "bar")

    settings.foo.should eq("bar")
  end

  it "should set dynamic variable" do
    settings = Skyscraper::Config.new(foo: "bar")
    settings.foo.should eq("bar")

    # An accessor that was never declared can be assigned and read back.
    settings.bar = "foo"
    settings.bar.should eq("foo")
  end

  it "should override variable value" do
    settings = Skyscraper::Config.new(foo: "bar")

    settings.foo = "bizz"

    settings.foo.should eq("bizz")
  end

  it "should override false value" do
    settings = Skyscraper::Config.new(foo: true)

    # false must replace the previous value rather than be ignored as "unset".
    settings.foo = false

    settings.foo.should eq(false)
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#encoding: utf-8

# Specs for Skyscraper::Document: encoding handling and the path object
# attached to a loaded document.
describe Skyscraper::Document do
  # NOTE(review): this example needs network access to www.sjp.pl.
  it "should support utf-8 encoding by default in remote pages" do
    document = Skyscraper::Document::load("http://www.sjp.pl/grzegrz%F3%B3ka")
    document.encoding.should == "utf-8"
    # The comparison needs `.should`; a bare `==` is evaluated and discarded,
    # so the example would pass whatever the page contained.
    document.css(".lc").first.content.strip.should == "Grzegrzółka"
  end

  it "should have path" do
    document = Skyscraper::Document::load(path_to("skyscraper-document.html"))
    document.path.should be_an Skyscraper::Path::Base
  end
end
|