skyscraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +180 -0
- data/Rakefile +5 -0
- data/lib/skyscraper.rb +56 -0
- data/lib/skyscraper/base.rb +44 -0
- data/lib/skyscraper/config.rb +15 -0
- data/lib/skyscraper/document.rb +11 -0
- data/lib/skyscraper/field.rb +24 -0
- data/lib/skyscraper/node.rb +8 -0
- data/lib/skyscraper/node/base.rb +103 -0
- data/lib/skyscraper/node/resource.rb +57 -0
- data/lib/skyscraper/pages.rb +27 -0
- data/lib/skyscraper/path.rb +29 -0
- data/lib/skyscraper/path/base.rb +15 -0
- data/lib/skyscraper/path/local.rb +29 -0
- data/lib/skyscraper/path/remote.rb +32 -0
- data/lib/skyscraper/results.rb +93 -0
- data/lib/version.rb +3 -0
- data/skyscraper.gemspec +22 -0
- data/spec/skyscraper/skyscraper/base_spec.rb +83 -0
- data/spec/skyscraper/skyscraper/config_spec.rb +25 -0
- data/spec/skyscraper/skyscraper/document_spec.rb +14 -0
- data/spec/skyscraper/skyscraper/field_spec.rb +36 -0
- data/spec/skyscraper/skyscraper/node/base_spec.rb +87 -0
- data/spec/skyscraper/skyscraper/node/resource_spec.rb +58 -0
- data/spec/skyscraper/skyscraper/node_spec.rb +2 -0
- data/spec/skyscraper/skyscraper/pages_spec.rb +46 -0
- data/spec/skyscraper/skyscraper/path_spec.rb +110 -0
- data/spec/skyscraper/skyscraper/results_spec.rb +151 -0
- data/spec/skyscraper/skyscraper_spec.rb +39 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/support/skyscraper_helpers.rb +9 -0
- data/spec/test_files/encoding.html~ +12 -0
- data/spec/test_files/skyscraper-base.html +30 -0
- data/spec/test_files/skyscraper-document.html +30 -0
- data/spec/test_files/skyscraper-encoding.html +12 -0
- data/spec/test_files/skyscraper-fetch-2.html +11 -0
- data/spec/test_files/skyscraper-fetch.html +31 -0
- data/spec/test_files/skyscraper-field.html +30 -0
- data/spec/test_files/skyscraper-node-base-a.html +11 -0
- data/spec/test_files/skyscraper-node-base-b.html +10 -0
- data/spec/test_files/skyscraper-node-base-traversing.html +34 -0
- data/spec/test_files/skyscraper-node-base.html +30 -0
- data/spec/test_files/skyscraper-node-resource-b.html +10 -0
- data/spec/test_files/skyscraper-node-resource-image.png +0 -0
- data/spec/test_files/skyscraper-node-resource.html +12 -0
- data/spec/test_files/skyscraper-pages.html +30 -0
- data/spec/test_files/skyscraper.html +30 -0
- metadata +169 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
require "fileutils"

module Skyscraper
  module Node
    # Downloads the file referenced by a node's +href+ or +src+ attribute.
    class Resource
      # @param node [Object] node responding to +href+, +src+ and +element+
      # @raise [ArgumentError] when neither +href+ nor +src+ yields a value
      def initialize(node)
        @node = node
        @path = extract_path_from_node(@node)
      end

      # Opens the resource and copies it to the destination path.
      #
      # @param options [Hash] +:file_name+ overrides the name taken from the
      #   resource path; +:path+ overrides +Skyscraper.config.download_path+.
      #   The ":file_name" and ":sequence" placeholders in the path are expanded.
      # @return [String] the expanded destination path
      def download(options = {})
        @name = options[:file_name] || @path.file_name
        @new_file_path = replace_path_variables(options[:path] || Skyscraper.config.download_path)
        # NOTE(review): this presumably relies on open-uri extending Kernel#open
        # for remote paths and on the result responding to #path -- confirm for
        # the Ruby versions this gem supports.
        @temp_file = open(@path.full_path)

        begin
          copy @temp_file.path, @new_file_path
        ensure
          # Release the handle once the copy is done (or has failed).
          @temp_file.close unless @temp_file.closed?
        end

        @new_file_path
      end

      private

      # Copies +from+ to +to+, creating +to+ as a directory first when missing.
      # FileUtils is used instead of a shell command so that paths containing
      # spaces or shell metacharacters are handled literally.
      def copy(from, to)
        create_path_if_not_exists to
        FileUtils.cp(from, to)
      end

      # Creates +path+ (and any missing parents) as a directory unless it
      # already is one.
      def create_path_if_not_exists(path)
        FileUtils.mkdir_p(path) unless File.directory?(path)
      end

      # Expands the ":file_name" and ":sequence" placeholders in +path+.
      # Block replacements keep backslashes in the file name literal.
      #
      # @return [String] a new string; +path+ itself is not modified
      def replace_path_variables(path)
        expanded = path.gsub(":file_name") { @name }
        expanded.gsub(":sequence") { get_sequence_number_for(expanded) }
      end

      # Returns the next free sequence number (as a String) for the directory
      # that precedes ":sequence" in +path+. Existing entries are compared
      # numerically, so "10" correctly ranks above "9".
      def get_sequence_number_for(path)
        parent = path.split(":sequence")[0]
        return "1" unless File.directory?(parent)

        taken = Dir.entries(parent).grep(/\A\d+\z/).map(&:to_i)
        ((taken.max || 0) + 1).to_s
      end

      # Resolves the node's +href+ (preferred) or +src+ against the path of
      # the document the node belongs to.
      #
      # @raise [ArgumentError] when neither attribute yields a value
      def extract_path_from_node(node)
        href_or_src = node.href.present? ? node.href : node.src
        raise ArgumentError, "no href no src" unless href_or_src

        node.element.document.path.path_for(href_or_src)
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "open-uri"
|
2
|
+
|
3
|
+
module Skyscraper
  # Flat list of pages with a cursor that hands them out one at a time.
  class Pages
    attr_accessor :items

    # @param value [Object, Array] a single page or a (possibly nested) array
    # @yield [Skyscraper] optional block whose return value replaces +value+
    def initialize(value = nil, &block)
      set(value, &block)
    end

    # Replaces the page list and rewinds the cursor.
    #
    # A non-array value is wrapped in a one-element array; nested arrays are
    # flattened into a new array, so the caller's array is left untouched.
    #
    # @return [Pages] self
    def set(value = nil, &block)
      pages = block ? block.call(Skyscraper) : value
      pages = [pages] unless pages.is_a?(Array)
      @items = pages.flatten
      reset
      self
    end

    # Advances the cursor and returns the page at the new position, or nil
    # once the list is exhausted.
    def next
      @items[@current += 1]
    end

    # Moves the cursor back so that the following #next returns the first page.
    def reset
      @current = -1
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Skyscraper
  # Namespace for path objects plus helpers that classify raw path strings.
  module Path
    extend ActiveSupport::Autoload

    autoload :Base
    autoload :Local
    autoload :Remote

    class << self
      # Builds a Path::Remote or Path::Local depending on #remote?.
      def factory(path)
        if remote?(path)
          Path::Remote.new(path)
        else
          Path::Local.new(path)
        end
      end

      # Tells whether +path+ parses as a URL with a host. Strings without an
      # http/https scheme are checked as if prefixed with "http://".
      #
      # @return [Boolean]
      def remote?(path)
        # \A anchors at the start of the string (^ would match after any newline).
        candidate = path.match(%r{\Ahttps?://}) ? path : "http://#{path}"
        host = URI.parse(candidate).host
        # An empty host carries no more information than a missing one.
        !(host.nil? || host.empty?)
      end

      # Tells whether +path+ begins with "/".
      #
      # @return [Boolean]
      def absolute?(path)
        path.start_with?("/")
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Skyscraper
  module Path
    # Path that is kept as given and split into a folder and a file name.
    class Local < Path::Base
      attr_accessor :base, :folder, :file_name, :full_path

      # @param path [String] the path; stored unchanged as +full_path+
      def initialize(path)
        @folder = get_folder(path)
        @full_path = path
        # get_file_name is not defined in this class (presumably Path::Base).
        @file_name = get_file_name(path)
        @base = @folder
      end

      # Returns +href+ unchanged when it is absolute, otherwise +href+
      # appended to this path's folder.
      def full_path_for(href)
        Path.absolute?(href) ? href : "#{@folder}#{href}"
      end

      def to_s
        full_path
      end

      private

      # Returns everything up to and including the last "/" of +path+, or an
      # empty string when +path+ has no "/". Unlike a regexp match followed
      # by [0], this does not fail for paths such as "/file.html".
      def get_folder(path)
        last_slash = path.rindex("/")
        last_slash ? path[0..last_slash] : ""
      end
    end
  end
end
|
29
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Skyscraper
  module Path
    # Path parsed from a URL string and split into its components.
    class Remote < Path::Base
      attr_accessor :uri, :base, :domain, :full_path, :path, :query, :file_name

      # @param path [String] URL to parse with URI.parse
      def initialize(path)
        uri = URI.parse(path)

        @domain = build_domain(uri)
        @path = uri.path
        @query = "?" + uri.query if uri.query
        @full_path = "#{@domain}#{@path}#{@query}"
        @base = "#{@domain}/"
        @uri = uri
        # get_file_name is not defined in this class (presumably Path::Base).
        @file_name = get_file_name(@path)
      end

      # Resolves +href+ against this path's URI and returns it as a String.
      def full_path_for(href)
        @uri.merge(URI.parse(href)).to_s
      end

      def to_s
        full_path
      end

      private

      # Returns "scheme://host", with ":port" appended when the port differs
      # from the scheme's default so that e.g. "http://localhost:3000" is not
      # collapsed to "http://localhost". Without a scheme only the host is
      # returned.
      # NOTE(review): Path.remote? accepts scheme-less strings, which reach
      # this method with no scheme -- confirm the intended handling.
      def build_domain(uri)
        return uri.host unless uri.scheme

        authority = uri.host.to_s
        authority += ":#{uri.port}" if uri.port && uri.port != uri.default_port
        "#{uri.scheme}://#{authority}"
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module Skyscraper
|
2
|
+
class Results
|
3
|
+
attr_accessor :limit, :delay, :after_each, :after_all, :records
|
4
|
+
|
5
|
+
def initialize base, options = {}
|
6
|
+
@delay = extract_delay_hash(options[:delay] || base.config.delay)
|
7
|
+
@limit = options[:limit] || base.config.limit
|
8
|
+
|
9
|
+
@base = base
|
10
|
+
|
11
|
+
@after_each = options[:after_each] || []
|
12
|
+
@after_all = options[:after_all] || []
|
13
|
+
|
14
|
+
@records = []
|
15
|
+
end
|
16
|
+
|
17
|
+
def add_after_each &block
|
18
|
+
@after_each << block
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_after_all &block
|
22
|
+
@after_all << block
|
23
|
+
end
|
24
|
+
|
25
|
+
def fetch continue = false
|
26
|
+
results = []
|
27
|
+
documents = []
|
28
|
+
|
29
|
+
@base.pages_object.reset unless continue
|
30
|
+
|
31
|
+
i = 0
|
32
|
+
|
33
|
+
while i != @limit and page = @base.pages_object.next
|
34
|
+
result = {}
|
35
|
+
|
36
|
+
begin
|
37
|
+
document = Skyscraper::fetch(page)
|
38
|
+
|
39
|
+
@base.fields.each do |field|
|
40
|
+
result[field.name] = field.find_in_document document
|
41
|
+
end
|
42
|
+
|
43
|
+
call_callbacks @after_each, result, document
|
44
|
+
results << result
|
45
|
+
sleep @delay[:sleep] if (i+1) % @delay[:after] == 0
|
46
|
+
|
47
|
+
rescue SocketError, Errno::ENOENT
|
48
|
+
warning_msg = "WARNGIN: resource '#{page}' not found!"
|
49
|
+
puts warning_msg if @base.config.noise_errors
|
50
|
+
raise NoResourceException, warning_msg unless @base.config.skip_on_error
|
51
|
+
end
|
52
|
+
|
53
|
+
i += 1
|
54
|
+
end
|
55
|
+
|
56
|
+
call_callbacks @after_all, results
|
57
|
+
|
58
|
+
@records += results
|
59
|
+
results
|
60
|
+
end
|
61
|
+
|
62
|
+
def continue
|
63
|
+
fetch true
|
64
|
+
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
def extract_delay_hash delay_hash
|
69
|
+
delay = {}
|
70
|
+
|
71
|
+
if delay_hash and delay_hash.is_a? Hash
|
72
|
+
delay = delay_hash
|
73
|
+
elsif delay_hash
|
74
|
+
delay[:sleep] = delay_hash
|
75
|
+
delay[:after] = 1
|
76
|
+
else
|
77
|
+
delay[:sleep] = 0
|
78
|
+
delay[:after] = 1
|
79
|
+
end
|
80
|
+
|
81
|
+
delay
|
82
|
+
end
|
83
|
+
|
84
|
+
def call_callbacks callbacks, *args
|
85
|
+
callbacks.each do |callback|
|
86
|
+
callback.call(*args)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
class NoResourceException < Exception
|
92
|
+
end
|
93
|
+
end
|
data/lib/version.rb
ADDED
data/skyscraper.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/./version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for skyscraper.
Gem::Specification.new do |gem|
  gem.name          = "skyscraper"
  gem.version       = Skyscraper::VERSION
  gem.authors       = ["Adam Dratwinski"]
  gem.email         = ["arboooz@gmail.com"]
  gem.description   = %q{Library that helps scraping data from websites in easy way}
  gem.summary       = %q{Library that helps scraping data from websites in easy way}
  gem.homepage      = "https://github.com/boooz/skyscraper"

  # A NUL-separated listing keeps file names that contain whitespace intact;
  # splitting on $\ only worked while that global happened to be nil.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(spec)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency "rspec"
  gem.add_development_dependency "rake"
  gem.add_dependency "nokogiri"
  gem.add_dependency "actionpack"
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# Specs for Skyscraper::Base: pages, config, fields, callbacks and fetching.
describe Skyscraper::Base do

  it "should set pages" do
    base = Skyscraper::Base.new
    base.pages "http://onet.pl"
    # The bare `is_a?` call discarded its result, so nothing was asserted.
    base.pages_object.should be_a(Skyscraper::Pages)
  end

  it "should have defaults for config" do
    # NOTE(review): this writes to the global Skyscraper.config and does not
    # restore it afterwards.
    Skyscraper.config.encoding = "utf-8"
    base = Skyscraper::Base.new
    base.config.encoding.should == "utf-8"
  end

  it "should be able to have different config for each instance" do
    base_a = Skyscraper::Base.new
    base_a.config.bar = "foo"
    base_a.settings foo: "biz"

    base_b = Skyscraper::Base.new
    base_b.config.bar = "biz"
    base_b.settings foo: "bar"

    base_a.config.bar.should == "foo"
    base_a.config.foo.should == "biz"
    base_b.config.bar.should == "biz"
    base_b.config.foo.should == "bar"
  end

  it "should add fields" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :other, ".selector"
    base.fields.length.should == 2
  end

  it "should override fields with the same name" do
    base = Skyscraper::Base.new
    base.field :name, ".selector"
    base.field :name, ".selector"
    base.fields.length.should == 1
  end

  it "should add after each callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_each { |result, page| result[:h1] += "2" }
    base.fetch[0][:h1].should == "Hello world2"
  end

  it "should add after all callback" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.after_all { |results| results << "2" }
    base.fetch.length.should == 2
  end

  it "should set settings" do
    base = Skyscraper::Base.new
    base.settings limit: 100, delay: { sleep: 2, after: 10 }
    base.config.limit.should == 100
    base.config.delay[:sleep].should == 2
    base.config.delay[:after].should == 10
  end

  it "should fetch data" do
    base = Skyscraper::Base.new
    base.pages path_to("skyscraper-base.html")
    base.field :h1, "h1"
    base.fetch[0][:h1].should == "Hello world"
  end

  it "should be able to continue fetching" do
    # NOTE(review): this writes to the global Skyscraper.config and does not
    # restore it afterwards.
    Skyscraper.config.limit = 10
    base = Skyscraper::Base.new
    base.pages [path_to("skyscraper-base.html")] * 12
    base.field :h1, "h1"
    base.fetch.length.should == 10
    base.continue.length.should == 2
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Specs for Skyscraper::Config: values given at construction time and values
# assigned afterwards are both readable through same-named methods.
describe Skyscraper::Config do
  # Fresh config seeded with foo: "bar" for every example that uses it.
  let(:config) { Skyscraper::Config.new foo: "bar" }

  it "should set variable on initialize" do
    config.foo.should == "bar"
  end

  it "should set dynamic variable" do
    config.foo.should == "bar"

    # A key that was not passed to the constructor can still be assigned.
    config.bar = "foo"
    config.bar.should == "foo"
  end

  it "should override variable value" do
    config.foo = "bizz"
    config.foo.should == "bizz"
  end

  it "should override false value" do
    # Starts from a truthy value to show that assigning false sticks.
    flag_config = Skyscraper::Config.new foo: true
    flag_config.foo = false
    flag_config.foo.should == false
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
|
3
|
+
# Specs for Skyscraper::Document loading.
describe Skyscraper::Document do
  # NOTE(review): this example loads a live remote page, so it needs network
  # access and depends on that page's current markup.
  it "should support utf-8 encoding by default in remote pages" do
    document = Skyscraper::Document::load("http://www.sjp.pl/grzegrz%F3%B3ka")
    document.encoding.should == "utf-8"
    # The comparison previously lacked `.should`, so its result was discarded.
    document.css(".lc").first.content.strip.should == "Grzegrzółka"
  end

  it "should have path" do
    document = Skyscraper::Document::load(path_to("skyscraper-document.html"))
    document.path.should be_an Skyscraper::Path::Base
  end
end
|