scruber 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4661c429c6b33a12841569c62835cc587e7f9464
4
- data.tar.gz: 9cb87a48248746b30d2ece9db71136560629bdcb
3
+ metadata.gz: b845332207b108efa91983b4721cf7631120ea36
4
+ data.tar.gz: 4c9d931ccdbf777c9d469d7cf2697f00acfe94dc
5
5
  SHA512:
6
- metadata.gz: c86674726ae45109383e8d0712612895136384eba1414e061c7d1de3dd9b699af6ab4022de037a537b1377a95b46be2aba10b4e9cea1664449ec880ae1b8189d
7
- data.tar.gz: 738d34f37fdc629eb3f4755fffeb70e640e53bd7e33333357af0f9af286d648dd92846220a7ef0f9d06918e412fdf8c801a0b25e15cf74b912b0e0459e04009d
6
+ metadata.gz: b7c9bc638e7f168401bfd15de02746b691c5477563404db66810f894e7a1925821064b096d47c065209223856782e787bdc3f6272a179068aa0fdfcb6c14994d
7
+ data.tar.gz: f4c73d7e94e795b32c647285f320cf9e25b9f6efa0c528aaa3138ab99c263627f9ad47088851126d2df07421e633037fa3e8f434d7303a91ad350804d9a64903
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
24
24
  end
25
25
 
26
26
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
27
- f.match(%r{^(test|spec|features)/})
27
+ f.match(%r{^(test|features)/})
28
28
  end
29
29
  spec.bindir = "exe"
30
30
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
@@ -0,0 +1,44 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::CsvOutput do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::CsvOutput with csv_file and csv_out method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:csv_file)).to be_truthy
10
+ expect(Scruber::Core::Crawler.method_defined?(:csv_out)).to be_truthy
11
+ expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Acsv_(\w+)_file\Z/)).to be_truthy
12
+ expect(Scruber::Core::Crawler.new(:sample).respond_to?(:csv_products_file)).to be_truthy
13
+ end
14
+ end
15
+
16
+ describe "csv_file" do
17
+ it "should create csv_file and write output" do
18
+ described_class.register
19
+ csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'test.csv')
20
+
21
+ Scruber.run :sample do
22
+ csv_file csv_file_name, col_sep: '|'
23
+ csv_out [1,2,3]
24
+ end
25
+ expect(File.exists?(csv_file_name)).to be_truthy
26
+ expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
27
+ File.delete(csv_file_name) if File.exists?(csv_file_name)
28
+ end
29
+ end
30
+
31
+ describe "csv_{pattern}_file" do
32
+ it "should register file and write output" do
33
+ described_class.register
34
+ csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'products.csv')
35
+ Scruber.run :sample do
36
+ csv_products_file csv_file_name, col_sep: '|'
37
+ csv_products_out [1,2,3]
38
+ end
39
+ expect(File.exists?(csv_file_name)).to be_truthy
40
+ expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
41
+ File.delete(csv_file_name) if File.exists?(csv_file_name)
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,4 @@
1
+ zip,r10,country,state
2
+ 10001,true,US,NY
3
+ 54914,false,US,WI
4
+ 10002,true,US,NY
@@ -0,0 +1,25 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::Log do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::Core with log method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:log)).to be_truthy
10
+ end
11
+ end
12
+
13
+ describe "#log" do
14
+ let(:log_file) { Pathname.new(File.expand_path('../log.txt', __FILE__)) }
15
+ before { Scruber.logger = Logger.new(log_file) }
16
+ after{ (File.delete(log_file) rescue nil) }
17
+
18
+ it "should write log to file" do
19
+ Scruber.run :sample, silent: true do
20
+ log "Seeding"
21
+ end
22
+ expect(File.open(log_file){|f| f.read}).to match(/Seeding/)
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,26 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::Loop do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::Core with loop method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:loop)).to be_truthy
10
+ end
11
+
12
+ it "should add dictionary and read info" do
13
+ Scruber::Core::Extensions::Loop.register
14
+ $zip_codes = []
15
+ Scruber.run :sample do
16
+ add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
17
+ seed do
18
+ loop :zip_codes_usa, state: 'NY' do |row|
19
+ $zip_codes.push row['zip']
20
+ end
21
+ end
22
+ end
23
+ expect($zip_codes).to eq(['10001', '10002'])
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,89 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::ParserAliases do
4
+
5
+ describe "register" do
6
+ it "should extend Crawler with parse and parse_* methods" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:parse)).to be_truthy
10
+ expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Aparse_(\w+)\Z/)).to be_truthy
11
+ expect(Scruber::Core::Crawler.new(:sample).respond_to?(:parse_product)).to be_truthy
12
+ end
13
+ end
14
+
15
+ describe "#parse" do
16
+ context "without format" do
17
+ it "should register parser" do
18
+ described_class.register
19
+
20
+ stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
21
+
22
+ Scruber.run :sample do
23
+ get "http://example.com"
24
+
25
+ parse do |page|
26
+ $page = page
27
+ end
28
+ end
29
+ expect($page.url).to eq("http://example.com")
30
+ expect($page.page_type.to_s).to eq("seed")
31
+ end
32
+
33
+ it "should register parser with custom page_type" do
34
+ described_class.register
35
+
36
+ stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
37
+
38
+ Scruber.run :sample do
39
+ post_product "http://example.com"
40
+
41
+ parse_product do |page|
42
+ $page = page
43
+ end
44
+ end
45
+ expect($page.url).to eq("http://example.com")
46
+ expect($page.method.to_s).to eq("post")
47
+ expect($page.page_type.to_s).to eq("product")
48
+ end
49
+ end
50
+
51
+ context "with format" do
52
+ it "should register parser" do
53
+ described_class.register
54
+
55
+ stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
56
+
57
+ Scruber.run :sample do
58
+ get "http://example.com"
59
+
60
+ parse :html do |page,doc|
61
+ $page = page
62
+ $doc = doc
63
+ end
64
+ end
65
+ expect($doc.at('span').text).to eq("Example Domain")
66
+ expect($page.page_type.to_s).to eq("seed")
67
+ expect($page.method.to_s).to eq("get")
68
+ end
69
+
70
+ it "should register parser with custom page_type" do
71
+ described_class.register
72
+
73
+ stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
74
+
75
+ Scruber.run :sample do
76
+ post_product "http://example.com"
77
+
78
+ parse_product :html do |page,doc|
79
+ $page = page
80
+ $doc = doc
81
+ end
82
+ end
83
+ expect($doc.at('span').text).to eq("Example Post")
84
+ expect($page.method.to_s).to eq("post")
85
+ expect($page.page_type.to_s).to eq("product")
86
+ end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,72 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::QueueAliases do
4
+
5
+ describe "register" do
6
+ it "should extend Crawler with get,post,head and (get|post|head)_* methods" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:get)).to be_truthy
10
+ expect(Scruber::Core::Crawler.method_defined?(:head)).to be_truthy
11
+ expect(Scruber::Core::Crawler.method_defined?(:post)).to be_truthy
12
+ expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\A(get|post|head)_(\w+)\Z/)).to be_truthy
13
+ expect(Scruber::Core::Crawler.new(:sample).respond_to?(:get_product)).to be_truthy
14
+ end
15
+ end
16
+
17
+ describe "#get,#post" do
18
+ context "without options" do
19
+ it "should add page to queue" do
20
+ described_class.register
21
+
22
+ Scruber.run :sample do
23
+ get "http://example.com"
24
+ $page = queue.fetch_pending
25
+ end
26
+ expect($page.url).to eq("http://example.com")
27
+ expect($page.method.to_s).to eq("get")
28
+ expect($page.page_type.to_s).to eq("seed")
29
+ end
30
+
31
+ it "should add page to queue" do
32
+ described_class.register
33
+
34
+ Scruber.run :sample do
35
+ post_product "http://example.com"
36
+ $page = queue.fetch_pending
37
+ end
38
+ expect($page.url).to eq("http://example.com")
39
+ expect($page.method.to_s).to eq("post")
40
+ expect($page.page_type).to eq("product")
41
+ end
42
+ end
43
+
44
+ context "with options" do
45
+ it "should add page to queue" do
46
+ described_class.register
47
+
48
+ Scruber.run :sample do
49
+ get "http://example.com", user_agent: 'Agent 1'
50
+ $page = queue.fetch_pending
51
+ end
52
+ expect($page.url).to eq("http://example.com")
53
+ expect($page.method.to_s).to eq("get")
54
+ expect($page.page_type.to_s).to eq("seed")
55
+ expect($page.user_agent).to eq('Agent 1')
56
+ end
57
+
58
+ it "should add page to queue" do
59
+ described_class.register
60
+
61
+ Scruber.run :sample do
62
+ post_product "http://example.com", user_agent: 'Agent 1'
63
+ $page = queue.fetch_pending
64
+ end
65
+ expect($page.url).to eq("http://example.com")
66
+ expect($page.method.to_s).to eq("post")
67
+ expect($page.page_type).to eq("product")
68
+ expect($page.user_agent).to eq('Agent 1')
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,44 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::Seed do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::Core with seed method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:seed)).to be_truthy
10
+ end
11
+ end
12
+
13
+ before do
14
+ stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
15
+ stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
16
+ end
17
+
18
+ it "should execute seed block" do
19
+ $queue_size = 0
20
+ Scruber.run :sample do
21
+ seed do
22
+ get 'http://example.com'
23
+ end
24
+ $queue_size = queue.size
25
+ end
26
+ expect($queue_size).to eq(1)
27
+ end
28
+
29
+ it "should not execute seed block" do
30
+ $queue_size = 0
31
+ Scruber.run :sample do
32
+ seed do
33
+ get 'http://example.com'
34
+ end
35
+ seed do
36
+ get 'http://example.com/contacts'
37
+ end
38
+ $queue_size = queue.size
39
+ $page = queue.fetch_pending
40
+ end
41
+ expect($queue_size).to eq(1)
42
+ expect($page.url).to eq("http://example.com")
43
+ end
44
+ end
@@ -0,0 +1,27 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Fetcher do
4
+
5
+ describe "add_adapter" do
6
+ it "should raise error" do
7
+ expect{ described_class.add_adapter(:obj, Object) }.to raise_error(NoMethodError)
8
+ end
9
+
10
+ it "should add new adapter and return added class" do
11
+ expect(described_class.add_adapter(:typhoeus_fetcher, Scruber::FetcherAdapters::TyphoeusFetcher)).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
12
+ expect(described_class._adapters.keys).to include(:typhoeus_fetcher)
13
+ end
14
+ end
15
+
16
+ describe "adapter" do
17
+ it "should return default adapter" do
18
+ expect(described_class.adapter).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
19
+ end
20
+ end
21
+
22
+ describe "new" do
23
+ it "should return instance of default adapter" do
24
+ expect(described_class.new).to be_a(Scruber::FetcherAdapters::TyphoeusFetcher)
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,4 @@
1
+ postal_code,r10,country,state
2
+ 10001,true,US,NY
3
+ 54914,false,US,WI
4
+ 10002,true,US,NY
@@ -0,0 +1,5 @@
1
+ <items>
2
+ <item r10="true" country="US" state="NY" postal_code="10001" />
3
+ <item r10="false" country="US" state="WI" postal_code="54914" />
4
+ <item r10="true" country="US" state="NY" postal_code="10002" />
5
+ </items>
@@ -0,0 +1,5 @@
1
+ <records>
2
+ <record r10="true" country="US" state="NY" postal_code="10001" />
3
+ <record r10="false" country="US" state="WI" postal_code="54914" />
4
+ <record r10="true" country="US" state="NY" postal_code="10002" />
5
+ </records>
@@ -0,0 +1,36 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::DictionaryReader::Csv do
4
+
5
+ describe "register" do
6
+ it "should correctly read first element" do
7
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
8
+
9
+ result = nil
10
+ cl.read do |obj|
11
+ result = obj
12
+ end
13
+ expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
14
+ end
15
+
16
+ it "should read 3 elements total" do
17
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
18
+
19
+ count = 0
20
+ cl.read do |obj|
21
+ count += 1
22
+ end
23
+ expect(count).to eq(3)
24
+ end
25
+
26
+ it "should read 1 elements with state=WI" do
27
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
28
+
29
+ results = []
30
+ cl.read({state: 'WI'}) do |obj|
31
+ results.push obj.sort
32
+ end
33
+ expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,46 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::DictionaryReader::Xml do
4
+
5
+ describe "register" do
6
+ it "should correctly read first element" do
7
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
8
+
9
+ result = nil
10
+ cl.read do |obj|
11
+ result = obj
12
+ end
13
+ expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
14
+ end
15
+
16
+ it "should correctly read first element with different selector" do
17
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict_records.xml')
18
+
19
+ result = nil
20
+ cl.read(selector: 'record') do |obj|
21
+ result = obj
22
+ end
23
+ expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
24
+ end
25
+
26
+ it "should read 3 elements total" do
27
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
28
+
29
+ count = 0
30
+ cl.read do |obj|
31
+ count += 1
32
+ end
33
+ expect(count).to eq(3)
34
+ end
35
+
36
+ it "should read 1 elements with state=WI" do
37
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
38
+
39
+ results = []
40
+ cl.read({state: 'WI'}) do |obj|
41
+ results.push obj.sort
42
+ end
43
+ expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
44
+ end
45
+ end
46
+ end