scruber 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/scruber/version.rb +1 -1
- data/scruber.gemspec +1 -1
- data/spec/core/extensions/csv_output_spec.rb +44 -0
- data/spec/core/extensions/dict.csv +4 -0
- data/spec/core/extensions/log_spec.rb +25 -0
- data/spec/core/extensions/loop_spec.rb +26 -0
- data/spec/core/extensions/parser_aliases_spec.rb +89 -0
- data/spec/core/extensions/queue_aliases_spec.rb +72 -0
- data/spec/core/extensions/seed_spec.rb +44 -0
- data/spec/fetcher.rb +27 -0
- data/spec/helpers/dictionary_reader/dict.csv +4 -0
- data/spec/helpers/dictionary_reader/dict.xml +5 -0
- data/spec/helpers/dictionary_reader/dict_records.xml +5 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb +36 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/memory_spec.rb +45 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb +21 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_spec.rb +118 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb +145 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb +40 -0
- data/spec/helpers/user_agent_rotator/user_agents.xml +6 -0
- data/spec/queue_adapter/memory_spec.rb +15 -0
- data/spec/queue_spec.rb +27 -0
- data/spec/scruber_spec.rb +198 -0
- data/spec/spec_helper.rb +36 -0
- data/spec/support/queue/queue_adapter.rb +171 -0
- metadata +26 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b845332207b108efa91983b4721cf7631120ea36
|
4
|
+
data.tar.gz: 4c9d931ccdbf777c9d469d7cf2697f00acfe94dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b7c9bc638e7f168401bfd15de02746b691c5477563404db66810f894e7a1925821064b096d47c065209223856782e787bdc3f6272a179068aa0fdfcb6c14994d
|
7
|
+
data.tar.gz: f4c73d7e94e795b32c647285f320cf9e25b9f6efa0c528aaa3138ab99c263627f9ad47088851126d2df07421e633037fa3e8f434d7303a91ad350804d9a64903
|
data/lib/scruber/version.rb
CHANGED
data/scruber.gemspec
CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
|
|
24
24
|
end
|
25
25
|
|
26
26
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
27
|
-
f.match(%r{^(test|spec|features)/})
|
27
|
+
f.match(%r{^(test|features)/})
|
28
28
|
end
|
29
29
|
spec.bindir = "exe"
|
30
30
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
data/spec/core/extensions/csv_output_spec.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::CsvOutput do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::CsvOutput with csv_file and csv_out method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:csv_file)).to be_truthy
|
10
|
+
expect(Scruber::Core::Crawler.method_defined?(:csv_out)).to be_truthy
|
11
|
+
expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Acsv_(\w+)_file\Z/)).to be_truthy
|
12
|
+
expect(Scruber::Core::Crawler.new(:sample).respond_to?(:csv_products_file)).to be_truthy
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "csv_file" do
|
17
|
+
it "should create csv_file and write output" do
|
18
|
+
described_class.register
|
19
|
+
csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'test.csv')
|
20
|
+
|
21
|
+
Scruber.run :sample do
|
22
|
+
csv_file csv_file_name, col_sep: '|'
|
23
|
+
csv_out [1,2,3]
|
24
|
+
end
|
25
|
+
expect(File.exists?(csv_file_name)).to be_truthy
|
26
|
+
expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
|
27
|
+
File.delete(csv_file_name) if File.exists?(csv_file_name)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe "csv_{pattern}_file" do
|
32
|
+
it "should register file and write output" do
|
33
|
+
described_class.register
|
34
|
+
csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'products.csv')
|
35
|
+
Scruber.run :sample do
|
36
|
+
csv_products_file csv_file_name, col_sep: '|'
|
37
|
+
csv_products_out [1,2,3]
|
38
|
+
end
|
39
|
+
expect(File.exists?(csv_file_name)).to be_truthy
|
40
|
+
expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
|
41
|
+
File.delete(csv_file_name) if File.exists?(csv_file_name)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
data/spec/core/extensions/log_spec.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::Log do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::Core with log method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:log)).to be_truthy
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#log" do
|
14
|
+
let(:log_file) { Pathname.new(File.expand_path('../log.txt', __FILE__)) }
|
15
|
+
before { Scruber.logger = Logger.new(log_file) }
|
16
|
+
after{ (File.delete(log_file) rescue nil) }
|
17
|
+
|
18
|
+
it "should write log to file" do
|
19
|
+
Scruber.run :sample, silent: true do
|
20
|
+
log "Seeding"
|
21
|
+
end
|
22
|
+
expect(File.open(log_file){|f| f.read}).to match(/Seeding/)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/spec/core/extensions/loop_spec.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::Loop do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::Core with loop method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:loop)).to be_truthy
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should add dictionary and read info" do
|
13
|
+
Scruber::Core::Extensions::Loop.register
|
14
|
+
$zip_codes = []
|
15
|
+
Scruber.run :sample do
|
16
|
+
add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
|
17
|
+
seed do
|
18
|
+
loop :zip_codes_usa, state: 'NY' do |row|
|
19
|
+
$zip_codes.push row['zip']
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
expect($zip_codes).to eq(['10001', '10002'])
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
data/spec/core/extensions/parser_aliases_spec.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::ParserAliases do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Crawler with parse and parse_* methods" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:parse)).to be_truthy
|
10
|
+
expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Aparse_(\w+)\Z/)).to be_truthy
|
11
|
+
expect(Scruber::Core::Crawler.new(:sample).respond_to?(:parse_product)).to be_truthy
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "#parse" do
|
16
|
+
context "without format" do
|
17
|
+
it "should register parser" do
|
18
|
+
described_class.register
|
19
|
+
|
20
|
+
stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
|
21
|
+
|
22
|
+
Scruber.run :sample do
|
23
|
+
get "http://example.com"
|
24
|
+
|
25
|
+
parse do |page|
|
26
|
+
$page = page
|
27
|
+
end
|
28
|
+
end
|
29
|
+
expect($page.url).to eq("http://example.com")
|
30
|
+
expect($page.page_type.to_s).to eq("seed")
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should register parser with custom page_type" do
|
34
|
+
described_class.register
|
35
|
+
|
36
|
+
stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
|
37
|
+
|
38
|
+
Scruber.run :sample do
|
39
|
+
post_product "http://example.com"
|
40
|
+
|
41
|
+
parse_product do |page|
|
42
|
+
$page = page
|
43
|
+
end
|
44
|
+
end
|
45
|
+
expect($page.url).to eq("http://example.com")
|
46
|
+
expect($page.method.to_s).to eq("post")
|
47
|
+
expect($page.page_type.to_s).to eq("product")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
context "with format" do
|
52
|
+
it "should register parser" do
|
53
|
+
described_class.register
|
54
|
+
|
55
|
+
stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
|
56
|
+
|
57
|
+
Scruber.run :sample do
|
58
|
+
get "http://example.com"
|
59
|
+
|
60
|
+
parse :html do |page,doc|
|
61
|
+
$page = page
|
62
|
+
$doc = doc
|
63
|
+
end
|
64
|
+
end
|
65
|
+
expect($doc.at('span').text).to eq("Example Domain")
|
66
|
+
expect($page.page_type.to_s).to eq("seed")
|
67
|
+
expect($page.method.to_s).to eq("get")
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should register parser with custom page_type" do
|
71
|
+
described_class.register
|
72
|
+
|
73
|
+
stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
|
74
|
+
|
75
|
+
Scruber.run :sample do
|
76
|
+
post_product "http://example.com"
|
77
|
+
|
78
|
+
parse_product :html do |page,doc|
|
79
|
+
$page = page
|
80
|
+
$doc = doc
|
81
|
+
end
|
82
|
+
end
|
83
|
+
expect($doc.at('span').text).to eq("Example Post")
|
84
|
+
expect($page.method.to_s).to eq("post")
|
85
|
+
expect($page.page_type.to_s).to eq("product")
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
data/spec/core/extensions/queue_aliases_spec.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::QueueAliases do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Crawler with get,post,head and (get|post|head)_* methods" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:get)).to be_truthy
|
10
|
+
expect(Scruber::Core::Crawler.method_defined?(:head)).to be_truthy
|
11
|
+
expect(Scruber::Core::Crawler.method_defined?(:post)).to be_truthy
|
12
|
+
expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\A(get|post|head)_(\w+)\Z/)).to be_truthy
|
13
|
+
expect(Scruber::Core::Crawler.new(:sample).respond_to?(:get_product)).to be_truthy
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "#get,#post" do
|
18
|
+
context "without options" do
|
19
|
+
it "should add page to queue" do
|
20
|
+
described_class.register
|
21
|
+
|
22
|
+
Scruber.run :sample do
|
23
|
+
get "http://example.com"
|
24
|
+
$page = queue.fetch_pending
|
25
|
+
end
|
26
|
+
expect($page.url).to eq("http://example.com")
|
27
|
+
expect($page.method.to_s).to eq("get")
|
28
|
+
expect($page.page_type.to_s).to eq("seed")
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should add page to queue" do
|
32
|
+
described_class.register
|
33
|
+
|
34
|
+
Scruber.run :sample do
|
35
|
+
post_product "http://example.com"
|
36
|
+
$page = queue.fetch_pending
|
37
|
+
end
|
38
|
+
expect($page.url).to eq("http://example.com")
|
39
|
+
expect($page.method.to_s).to eq("post")
|
40
|
+
expect($page.page_type).to eq("product")
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
context "with options" do
|
45
|
+
it "should add page to queue" do
|
46
|
+
described_class.register
|
47
|
+
|
48
|
+
Scruber.run :sample do
|
49
|
+
get "http://example.com", user_agent: 'Agent 1'
|
50
|
+
$page = queue.fetch_pending
|
51
|
+
end
|
52
|
+
expect($page.url).to eq("http://example.com")
|
53
|
+
expect($page.method.to_s).to eq("get")
|
54
|
+
expect($page.page_type.to_s).to eq("seed")
|
55
|
+
expect($page.user_agent).to eq('Agent 1')
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should add page to queue" do
|
59
|
+
described_class.register
|
60
|
+
|
61
|
+
Scruber.run :sample do
|
62
|
+
post_product "http://example.com", user_agent: 'Agent 1'
|
63
|
+
$page = queue.fetch_pending
|
64
|
+
end
|
65
|
+
expect($page.url).to eq("http://example.com")
|
66
|
+
expect($page.method.to_s).to eq("post")
|
67
|
+
expect($page.page_type).to eq("product")
|
68
|
+
expect($page.user_agent).to eq('Agent 1')
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
data/spec/core/extensions/seed_spec.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::Seed do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::Core with seed method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:seed)).to be_truthy
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
before do
|
14
|
+
stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
|
15
|
+
stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should execute seed block" do
|
19
|
+
$queue_size = 0
|
20
|
+
Scruber.run :sample do
|
21
|
+
seed do
|
22
|
+
get 'http://example.com'
|
23
|
+
end
|
24
|
+
$queue_size = queue.size
|
25
|
+
end
|
26
|
+
expect($queue_size).to eq(1)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should not execute seed block" do
|
30
|
+
$queue_size = 0
|
31
|
+
Scruber.run :sample do
|
32
|
+
seed do
|
33
|
+
get 'http://example.com'
|
34
|
+
end
|
35
|
+
seed do
|
36
|
+
get 'http://example.com/contacts'
|
37
|
+
end
|
38
|
+
$queue_size = queue.size
|
39
|
+
$page = queue.fetch_pending
|
40
|
+
end
|
41
|
+
expect($queue_size).to eq(1)
|
42
|
+
expect($page.url).to eq("http://example.com")
|
43
|
+
end
|
44
|
+
end
|
data/spec/fetcher.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Fetcher do
|
4
|
+
|
5
|
+
describe "add_adapter" do
|
6
|
+
it "should raise error" do
|
7
|
+
expect{ described_class.add_adapter(:obj, Object) }.to raise_error(NoMethodError)
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should add new adapter and return added class" do
|
11
|
+
expect(described_class.add_adapter(:typhoeus_fetcher, Scruber::FetcherAdapters::TyphoeusFetcher)).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
|
12
|
+
expect(described_class._adapters.keys).to include(:typhoeus_fetcher)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "adapter" do
|
17
|
+
it "should return default adapter" do
|
18
|
+
expect(described_class.adapter).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "new" do
|
23
|
+
it "should return instance of default adapter" do
|
24
|
+
expect(described_class.new).to be_a(Scruber::FetcherAdapters::TyphoeusFetcher)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Helpers::DictionaryReader::Csv do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should correctly read first element" do
|
7
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
|
8
|
+
|
9
|
+
result = nil
|
10
|
+
cl.read do |obj|
|
11
|
+
result = obj
|
12
|
+
end
|
13
|
+
expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should read 3 elements total" do
|
17
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
|
18
|
+
|
19
|
+
count = 0
|
20
|
+
cl.read do |obj|
|
21
|
+
count += 1
|
22
|
+
end
|
23
|
+
expect(count).to eq(3)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should read 1 elements with state=WI" do
|
27
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
|
28
|
+
|
29
|
+
results = []
|
30
|
+
cl.read({state: 'WI'}) do |obj|
|
31
|
+
results.push obj.sort
|
32
|
+
end
|
33
|
+
expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
data/spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Helpers::DictionaryReader::Xml do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should correctly read first element" do
|
7
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
|
8
|
+
|
9
|
+
result = nil
|
10
|
+
cl.read do |obj|
|
11
|
+
result = obj
|
12
|
+
end
|
13
|
+
expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should correctly read first element with different selector" do
|
17
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict_records.xml')
|
18
|
+
|
19
|
+
result = nil
|
20
|
+
cl.read(selector: 'record') do |obj|
|
21
|
+
result = obj
|
22
|
+
end
|
23
|
+
expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should read 3 elements total" do
|
27
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
|
28
|
+
|
29
|
+
count = 0
|
30
|
+
cl.read do |obj|
|
31
|
+
count += 1
|
32
|
+
end
|
33
|
+
expect(count).to eq(3)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should read 1 elements with state=WI" do
|
37
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
|
38
|
+
|
39
|
+
results = []
|
40
|
+
cl.read({state: 'WI'}) do |obj|
|
41
|
+
results.push obj.sort
|
42
|
+
end
|
43
|
+
expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|