scruber 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/scruber/version.rb +1 -1
- data/scruber.gemspec +1 -1
- data/spec/core/extensions/csv_output_spec.rb +44 -0
- data/spec/core/extensions/dict.csv +4 -0
- data/spec/core/extensions/log_spec.rb +25 -0
- data/spec/core/extensions/loop_spec.rb +26 -0
- data/spec/core/extensions/parser_aliases_spec.rb +89 -0
- data/spec/core/extensions/queue_aliases_spec.rb +72 -0
- data/spec/core/extensions/seed_spec.rb +44 -0
- data/spec/fetcher.rb +27 -0
- data/spec/helpers/dictionary_reader/dict.csv +4 -0
- data/spec/helpers/dictionary_reader/dict.xml +5 -0
- data/spec/helpers/dictionary_reader/dict_records.xml +5 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb +36 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/memory_spec.rb +45 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb +21 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_spec.rb +118 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb +145 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb +40 -0
- data/spec/helpers/user_agent_rotator/user_agents.xml +6 -0
- data/spec/queue_adapter/memory_spec.rb +15 -0
- data/spec/queue_spec.rb +27 -0
- data/spec/scruber_spec.rb +198 -0
- data/spec/spec_helper.rb +36 -0
- data/spec/support/queue/queue_adapter.rb +171 -0
- metadata +26 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b845332207b108efa91983b4721cf7631120ea36
|
4
|
+
data.tar.gz: 4c9d931ccdbf777c9d469d7cf2697f00acfe94dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b7c9bc638e7f168401bfd15de02746b691c5477563404db66810f894e7a1925821064b096d47c065209223856782e787bdc3f6272a179068aa0fdfcb6c14994d
|
7
|
+
data.tar.gz: f4c73d7e94e795b32c647285f320cf9e25b9f6efa0c528aaa3138ab99c263627f9ad47088851126d2df07421e633037fa3e8f434d7303a91ad350804d9a64903
|
data/lib/scruber/version.rb
CHANGED
data/scruber.gemspec
CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
|
|
24
24
|
end
|
25
25
|
|
26
26
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
27
|
-
f.match(%r{^(test|spec|features)/})
|
27
|
+
f.match(%r{^(test|features)/})
|
28
28
|
end
|
29
29
|
spec.bindir = "exe"
|
30
30
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::CsvOutput do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::CsvOutput with csv_file and csv_out method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:csv_file)).to be_truthy
|
10
|
+
expect(Scruber::Core::Crawler.method_defined?(:csv_out)).to be_truthy
|
11
|
+
expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Acsv_(\w+)_file\Z/)).to be_truthy
|
12
|
+
expect(Scruber::Core::Crawler.new(:sample).respond_to?(:csv_products_file)).to be_truthy
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "csv_file" do
|
17
|
+
it "should create csv_file and write output" do
|
18
|
+
described_class.register
|
19
|
+
csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'test.csv')
|
20
|
+
|
21
|
+
Scruber.run :sample do
|
22
|
+
csv_file csv_file_name, col_sep: '|'
|
23
|
+
csv_out [1,2,3]
|
24
|
+
end
|
25
|
+
expect(File.exists?(csv_file_name)).to be_truthy
|
26
|
+
expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
|
27
|
+
File.delete(csv_file_name) if File.exists?(csv_file_name)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe "csv_{pattern}_file" do
|
32
|
+
it "should register file and write output" do
|
33
|
+
described_class.register
|
34
|
+
csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'products.csv')
|
35
|
+
Scruber.run :sample do
|
36
|
+
csv_products_file csv_file_name, col_sep: '|'
|
37
|
+
csv_products_out [1,2,3]
|
38
|
+
end
|
39
|
+
expect(File.exists?(csv_file_name)).to be_truthy
|
40
|
+
expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
|
41
|
+
File.delete(csv_file_name) if File.exists?(csv_file_name)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::Log do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::Core with log method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:log)).to be_truthy
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#log" do
|
14
|
+
let(:log_file) { Pathname.new(File.expand_path('../log.txt', __FILE__)) }
|
15
|
+
before { Scruber.logger = Logger.new(log_file) }
|
16
|
+
after{ (File.delete(log_file) rescue nil) }
|
17
|
+
|
18
|
+
it "should write log to file" do
|
19
|
+
Scruber.run :sample, silent: true do
|
20
|
+
log "Seeding"
|
21
|
+
end
|
22
|
+
expect(File.open(log_file){|f| f.read}).to match(/Seeding/)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::Loop do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::Core with loop method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:loop)).to be_truthy
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should add dictionary and read info" do
|
13
|
+
Scruber::Core::Extensions::Loop.register
|
14
|
+
$zip_codes = []
|
15
|
+
Scruber.run :sample do
|
16
|
+
add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
|
17
|
+
seed do
|
18
|
+
loop :zip_codes_usa, state: 'NY' do |row|
|
19
|
+
$zip_codes.push row['zip']
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
expect($zip_codes).to eq(['10001', '10002'])
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::ParserAliases do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Crawler with parse and parse_* methods" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:parse)).to be_truthy
|
10
|
+
expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Aparse_(\w+)\Z/)).to be_truthy
|
11
|
+
expect(Scruber::Core::Crawler.new(:sample).respond_to?(:parse_product)).to be_truthy
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "#parse" do
|
16
|
+
context "without format" do
|
17
|
+
it "should register parser" do
|
18
|
+
described_class.register
|
19
|
+
|
20
|
+
stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
|
21
|
+
|
22
|
+
Scruber.run :sample do
|
23
|
+
get "http://example.com"
|
24
|
+
|
25
|
+
parse do |page|
|
26
|
+
$page = page
|
27
|
+
end
|
28
|
+
end
|
29
|
+
expect($page.url).to eq("http://example.com")
|
30
|
+
expect($page.page_type.to_s).to eq("seed")
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should register parser with custom page_type" do
|
34
|
+
described_class.register
|
35
|
+
|
36
|
+
stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
|
37
|
+
|
38
|
+
Scruber.run :sample do
|
39
|
+
post_product "http://example.com"
|
40
|
+
|
41
|
+
parse_product do |page|
|
42
|
+
$page = page
|
43
|
+
end
|
44
|
+
end
|
45
|
+
expect($page.url).to eq("http://example.com")
|
46
|
+
expect($page.method.to_s).to eq("post")
|
47
|
+
expect($page.page_type.to_s).to eq("product")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
context "with format" do
|
52
|
+
it "should register parser" do
|
53
|
+
described_class.register
|
54
|
+
|
55
|
+
stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
|
56
|
+
|
57
|
+
Scruber.run :sample do
|
58
|
+
get "http://example.com"
|
59
|
+
|
60
|
+
parse :html do |page,doc|
|
61
|
+
$page = page
|
62
|
+
$doc = doc
|
63
|
+
end
|
64
|
+
end
|
65
|
+
expect($doc.at('span').text).to eq("Example Domain")
|
66
|
+
expect($page.page_type.to_s).to eq("seed")
|
67
|
+
expect($page.method.to_s).to eq("get")
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should register parser with custom page_type" do
|
71
|
+
described_class.register
|
72
|
+
|
73
|
+
stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
|
74
|
+
|
75
|
+
Scruber.run :sample do
|
76
|
+
post_product "http://example.com"
|
77
|
+
|
78
|
+
parse_product :html do |page,doc|
|
79
|
+
$page = page
|
80
|
+
$doc = doc
|
81
|
+
end
|
82
|
+
end
|
83
|
+
expect($doc.at('span').text).to eq("Example Post")
|
84
|
+
expect($page.method.to_s).to eq("post")
|
85
|
+
expect($page.page_type.to_s).to eq("product")
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::QueueAliases do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Crawler with get,post,head and (get|post|head)_* methods" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:get)).to be_truthy
|
10
|
+
expect(Scruber::Core::Crawler.method_defined?(:head)).to be_truthy
|
11
|
+
expect(Scruber::Core::Crawler.method_defined?(:post)).to be_truthy
|
12
|
+
expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\A(get|post|head)_(\w+)\Z/)).to be_truthy
|
13
|
+
expect(Scruber::Core::Crawler.new(:sample).respond_to?(:get_product)).to be_truthy
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "#get,#post" do
|
18
|
+
context "without options" do
|
19
|
+
it "should add page to queue" do
|
20
|
+
described_class.register
|
21
|
+
|
22
|
+
Scruber.run :sample do
|
23
|
+
get "http://example.com"
|
24
|
+
$page = queue.fetch_pending
|
25
|
+
end
|
26
|
+
expect($page.url).to eq("http://example.com")
|
27
|
+
expect($page.method.to_s).to eq("get")
|
28
|
+
expect($page.page_type.to_s).to eq("seed")
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should add page to queue" do
|
32
|
+
described_class.register
|
33
|
+
|
34
|
+
Scruber.run :sample do
|
35
|
+
post_product "http://example.com"
|
36
|
+
$page = queue.fetch_pending
|
37
|
+
end
|
38
|
+
expect($page.url).to eq("http://example.com")
|
39
|
+
expect($page.method.to_s).to eq("post")
|
40
|
+
expect($page.page_type).to eq("product")
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
context "with options" do
|
45
|
+
it "should add page to queue" do
|
46
|
+
described_class.register
|
47
|
+
|
48
|
+
Scruber.run :sample do
|
49
|
+
get "http://example.com", user_agent: 'Agent 1'
|
50
|
+
$page = queue.fetch_pending
|
51
|
+
end
|
52
|
+
expect($page.url).to eq("http://example.com")
|
53
|
+
expect($page.method.to_s).to eq("get")
|
54
|
+
expect($page.page_type.to_s).to eq("seed")
|
55
|
+
expect($page.user_agent).to eq('Agent 1')
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should add page to queue" do
|
59
|
+
described_class.register
|
60
|
+
|
61
|
+
Scruber.run :sample do
|
62
|
+
post_product "http://example.com", user_agent: 'Agent 1'
|
63
|
+
$page = queue.fetch_pending
|
64
|
+
end
|
65
|
+
expect($page.url).to eq("http://example.com")
|
66
|
+
expect($page.method.to_s).to eq("post")
|
67
|
+
expect($page.page_type).to eq("product")
|
68
|
+
expect($page.user_agent).to eq('Agent 1')
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Core::Extensions::Seed do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should extend Scruber::Core with seed method" do
|
7
|
+
described_class.register
|
8
|
+
|
9
|
+
expect(Scruber::Core::Crawler.method_defined?(:seed)).to be_truthy
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
before do
|
14
|
+
stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
|
15
|
+
stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should execute seed block" do
|
19
|
+
$queue_size = 0
|
20
|
+
Scruber.run :sample do
|
21
|
+
seed do
|
22
|
+
get 'http://example.com'
|
23
|
+
end
|
24
|
+
$queue_size = queue.size
|
25
|
+
end
|
26
|
+
expect($queue_size).to eq(1)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should not execute seed block" do
|
30
|
+
$queue_size = 0
|
31
|
+
Scruber.run :sample do
|
32
|
+
seed do
|
33
|
+
get 'http://example.com'
|
34
|
+
end
|
35
|
+
seed do
|
36
|
+
get 'http://example.com/contacts'
|
37
|
+
end
|
38
|
+
$queue_size = queue.size
|
39
|
+
$page = queue.fetch_pending
|
40
|
+
end
|
41
|
+
expect($queue_size).to eq(1)
|
42
|
+
expect($page.url).to eq("http://example.com")
|
43
|
+
end
|
44
|
+
end
|
data/spec/fetcher.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Fetcher do
|
4
|
+
|
5
|
+
describe "add_adapter" do
|
6
|
+
it "should raise error" do
|
7
|
+
expect{ described_class.add_adapter(:obj, Object) }.to raise_error(NoMethodError)
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should add new adapter and return added class" do
|
11
|
+
expect(described_class.add_adapter(:typhoeus_fetcher, Scruber::FetcherAdapters::TyphoeusFetcher)).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
|
12
|
+
expect(described_class._adapters.keys).to include(:typhoeus_fetcher)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "adapter" do
|
17
|
+
it "should return default adapter" do
|
18
|
+
expect(described_class.adapter).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "new" do
|
23
|
+
it "should return instance of default adapter" do
|
24
|
+
expect(described_class.new).to be_a(Scruber::FetcherAdapters::TyphoeusFetcher)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Helpers::DictionaryReader::Csv do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should correctly read first element" do
|
7
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
|
8
|
+
|
9
|
+
result = nil
|
10
|
+
cl.read do |obj|
|
11
|
+
result = obj
|
12
|
+
end
|
13
|
+
expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should read 3 elements total" do
|
17
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
|
18
|
+
|
19
|
+
count = 0
|
20
|
+
cl.read do |obj|
|
21
|
+
count += 1
|
22
|
+
end
|
23
|
+
expect(count).to eq(3)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should read 1 elements with state=WI" do
|
27
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
|
28
|
+
|
29
|
+
results = []
|
30
|
+
cl.read({state: 'WI'}) do |obj|
|
31
|
+
results.push obj.sort
|
32
|
+
end
|
33
|
+
expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber::Helpers::DictionaryReader::Xml do
|
4
|
+
|
5
|
+
describe "register" do
|
6
|
+
it "should correctly read first element" do
|
7
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
|
8
|
+
|
9
|
+
result = nil
|
10
|
+
cl.read do |obj|
|
11
|
+
result = obj
|
12
|
+
end
|
13
|
+
expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should correctly read first element with different selector" do
|
17
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict_records.xml')
|
18
|
+
|
19
|
+
result = nil
|
20
|
+
cl.read(selector: 'record') do |obj|
|
21
|
+
result = obj
|
22
|
+
end
|
23
|
+
expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should read 3 elements total" do
|
27
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
|
28
|
+
|
29
|
+
count = 0
|
30
|
+
cl.read do |obj|
|
31
|
+
count += 1
|
32
|
+
end
|
33
|
+
expect(count).to eq(3)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should read 1 elements with state=WI" do
|
37
|
+
cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
|
38
|
+
|
39
|
+
results = []
|
40
|
+
cl.read({state: 'WI'}) do |obj|
|
41
|
+
results.push obj.sort
|
42
|
+
end
|
43
|
+
expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|