scruber 0.1.5 → 0.1.6

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4661c429c6b33a12841569c62835cc587e7f9464
4
- data.tar.gz: 9cb87a48248746b30d2ece9db71136560629bdcb
3
+ metadata.gz: b845332207b108efa91983b4721cf7631120ea36
4
+ data.tar.gz: 4c9d931ccdbf777c9d469d7cf2697f00acfe94dc
5
5
  SHA512:
6
- metadata.gz: c86674726ae45109383e8d0712612895136384eba1414e061c7d1de3dd9b699af6ab4022de037a537b1377a95b46be2aba10b4e9cea1664449ec880ae1b8189d
7
- data.tar.gz: 738d34f37fdc629eb3f4755fffeb70e640e53bd7e33333357af0f9af286d648dd92846220a7ef0f9d06918e412fdf8c801a0b25e15cf74b912b0e0459e04009d
6
+ metadata.gz: b7c9bc638e7f168401bfd15de02746b691c5477563404db66810f894e7a1925821064b096d47c065209223856782e787bdc3f6272a179068aa0fdfcb6c14994d
7
+ data.tar.gz: f4c73d7e94e795b32c647285f320cf9e25b9f6efa0c528aaa3138ab99c263627f9ad47088851126d2df07421e633037fa3e8f434d7303a91ad350804d9a64903
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
24
24
  end
25
25
 
26
26
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
27
- f.match(%r{^(test|spec|features)/})
27
+ f.match(%r{^(test|features)/})
28
28
  end
29
29
  spec.bindir = "exe"
30
30
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
@@ -0,0 +1,44 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::CsvOutput do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::CsvOutput with csv_file and csv_out method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:csv_file)).to be_truthy
10
+ expect(Scruber::Core::Crawler.method_defined?(:csv_out)).to be_truthy
11
+ expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Acsv_(\w+)_file\Z/)).to be_truthy
12
+ expect(Scruber::Core::Crawler.new(:sample).respond_to?(:csv_products_file)).to be_truthy
13
+ end
14
+ end
15
+
16
+ describe "csv_file" do
17
+ it "should create csv_file and write output" do
18
+ described_class.register
19
+ csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'test.csv')
20
+
21
+ Scruber.run :sample do
22
+ csv_file csv_file_name, col_sep: '|'
23
+ csv_out [1,2,3]
24
+ end
25
+ expect(File.exists?(csv_file_name)).to be_truthy
26
+ expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
27
+ File.delete(csv_file_name) if File.exists?(csv_file_name)
28
+ end
29
+ end
30
+
31
+ describe "csv_{pattern}_file" do
32
+ it "should register file and write output" do
33
+ described_class.register
34
+ csv_file_name = File.join(File.expand_path(File.dirname(__FILE__)), 'products.csv')
35
+ Scruber.run :sample do
36
+ csv_products_file csv_file_name, col_sep: '|'
37
+ csv_products_out [1,2,3]
38
+ end
39
+ expect(File.exists?(csv_file_name)).to be_truthy
40
+ expect(File.open(csv_file_name, 'r'){|f| f.read }.strip).to eq('1|2|3')
41
+ File.delete(csv_file_name) if File.exists?(csv_file_name)
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,4 @@
1
+ zip,r10,country,state
2
+ 10001,true,US,NY
3
+ 54914,false,US,WI
4
+ 10002,true,US,NY
@@ -0,0 +1,25 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::Log do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::Core with log method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:log)).to be_truthy
10
+ end
11
+ end
12
+
13
+ describe "#log" do
14
+ let(:log_file) { Pathname.new(File.expand_path('../log.txt', __FILE__)) }
15
+ before { Scruber.logger = Logger.new(log_file) }
16
+ after{ (File.delete(log_file) rescue nil) }
17
+
18
+ it "should write log to file" do
19
+ Scruber.run :sample, silent: true do
20
+ log "Seeding"
21
+ end
22
+ expect(File.open(log_file){|f| f.read}).to match(/Seeding/)
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,26 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::Loop do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::Core with loop method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:loop)).to be_truthy
10
+ end
11
+
12
+ it "should add dictionary and read info" do
13
+ Scruber::Core::Extensions::Loop.register
14
+ $zip_codes = []
15
+ Scruber.run :sample do
16
+ add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
17
+ seed do
18
+ loop :zip_codes_usa, state: 'NY' do |row|
19
+ $zip_codes.push row['zip']
20
+ end
21
+ end
22
+ end
23
+ expect($zip_codes).to eq(['10001', '10002'])
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,89 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::ParserAliases do
4
+
5
+ describe "register" do
6
+ it "should extend Crawler with parse and parse_* methods" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:parse)).to be_truthy
10
+ expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\Aparse_(\w+)\Z/)).to be_truthy
11
+ expect(Scruber::Core::Crawler.new(:sample).respond_to?(:parse_product)).to be_truthy
12
+ end
13
+ end
14
+
15
+ describe "#parse" do
16
+ context "without format" do
17
+ it "should register parser" do
18
+ described_class.register
19
+
20
+ stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
21
+
22
+ Scruber.run :sample do
23
+ get "http://example.com"
24
+
25
+ parse do |page|
26
+ $page = page
27
+ end
28
+ end
29
+ expect($page.url).to eq("http://example.com")
30
+ expect($page.page_type.to_s).to eq("seed")
31
+ end
32
+
33
+ it "should register parser with custom page_type" do
34
+ described_class.register
35
+
36
+ stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
37
+
38
+ Scruber.run :sample do
39
+ post_product "http://example.com"
40
+
41
+ parse_product do |page|
42
+ $page = page
43
+ end
44
+ end
45
+ expect($page.url).to eq("http://example.com")
46
+ expect($page.method.to_s).to eq("post")
47
+ expect($page.page_type.to_s).to eq("product")
48
+ end
49
+ end
50
+
51
+ context "with format" do
52
+ it "should register parser" do
53
+ described_class.register
54
+
55
+ stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
56
+
57
+ Scruber.run :sample do
58
+ get "http://example.com"
59
+
60
+ parse :html do |page,doc|
61
+ $page = page
62
+ $doc = doc
63
+ end
64
+ end
65
+ expect($doc.at('span').text).to eq("Example Domain")
66
+ expect($page.page_type.to_s).to eq("seed")
67
+ expect($page.method.to_s).to eq("get")
68
+ end
69
+
70
+ it "should register parser with custom page_type" do
71
+ described_class.register
72
+
73
+ stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
74
+
75
+ Scruber.run :sample do
76
+ post_product "http://example.com"
77
+
78
+ parse_product :html do |page,doc|
79
+ $page = page
80
+ $doc = doc
81
+ end
82
+ end
83
+ expect($doc.at('span').text).to eq("Example Post")
84
+ expect($page.method.to_s).to eq("post")
85
+ expect($page.page_type.to_s).to eq("product")
86
+ end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,72 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::QueueAliases do
4
+
5
+ describe "register" do
6
+ it "should extend Crawler with get,post,head and (get|post|head)_* methods" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:get)).to be_truthy
10
+ expect(Scruber::Core::Crawler.method_defined?(:head)).to be_truthy
11
+ expect(Scruber::Core::Crawler.method_defined?(:post)).to be_truthy
12
+ expect(Scruber::Core::Crawler._registered_method_missings.keys.include?(/\A(get|post|head)_(\w+)\Z/)).to be_truthy
13
+ expect(Scruber::Core::Crawler.new(:sample).respond_to?(:get_product)).to be_truthy
14
+ end
15
+ end
16
+
17
+ describe "#get,#post" do
18
+ context "without options" do
19
+ it "should add page to queue" do
20
+ described_class.register
21
+
22
+ Scruber.run :sample do
23
+ get "http://example.com"
24
+ $page = queue.fetch_pending
25
+ end
26
+ expect($page.url).to eq("http://example.com")
27
+ expect($page.method.to_s).to eq("get")
28
+ expect($page.page_type.to_s).to eq("seed")
29
+ end
30
+
31
+ it "should add page to queue" do
32
+ described_class.register
33
+
34
+ Scruber.run :sample do
35
+ post_product "http://example.com"
36
+ $page = queue.fetch_pending
37
+ end
38
+ expect($page.url).to eq("http://example.com")
39
+ expect($page.method.to_s).to eq("post")
40
+ expect($page.page_type).to eq("product")
41
+ end
42
+ end
43
+
44
+ context "with options" do
45
+ it "should add page to queue" do
46
+ described_class.register
47
+
48
+ Scruber.run :sample do
49
+ get "http://example.com", user_agent: 'Agent 1'
50
+ $page = queue.fetch_pending
51
+ end
52
+ expect($page.url).to eq("http://example.com")
53
+ expect($page.method.to_s).to eq("get")
54
+ expect($page.page_type.to_s).to eq("seed")
55
+ expect($page.user_agent).to eq('Agent 1')
56
+ end
57
+
58
+ it "should add page to queue" do
59
+ described_class.register
60
+
61
+ Scruber.run :sample do
62
+ post_product "http://example.com", user_agent: 'Agent 1'
63
+ $page = queue.fetch_pending
64
+ end
65
+ expect($page.url).to eq("http://example.com")
66
+ expect($page.method.to_s).to eq("post")
67
+ expect($page.page_type).to eq("product")
68
+ expect($page.user_agent).to eq('Agent 1')
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,44 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Core::Extensions::Seed do
4
+
5
+ describe "register" do
6
+ it "should extend Scruber::Core with seed method" do
7
+ described_class.register
8
+
9
+ expect(Scruber::Core::Crawler.method_defined?(:seed)).to be_truthy
10
+ end
11
+ end
12
+
13
+ before do
14
+ stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
15
+ stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
16
+ end
17
+
18
+ it "should execute seed block" do
19
+ $queue_size = 0
20
+ Scruber.run :sample do
21
+ seed do
22
+ get 'http://example.com'
23
+ end
24
+ $queue_size = queue.size
25
+ end
26
+ expect($queue_size).to eq(1)
27
+ end
28
+
29
+ it "should not execute seed block" do
30
+ $queue_size = 0
31
+ Scruber.run :sample do
32
+ seed do
33
+ get 'http://example.com'
34
+ end
35
+ seed do
36
+ get 'http://example.com/contacts'
37
+ end
38
+ $queue_size = queue.size
39
+ $page = queue.fetch_pending
40
+ end
41
+ expect($queue_size).to eq(1)
42
+ expect($page.url).to eq("http://example.com")
43
+ end
44
+ end
@@ -0,0 +1,27 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Fetcher do
4
+
5
+ describe "add_adapter" do
6
+ it "should raise error" do
7
+ expect{ described_class.add_adapter(:obj, Object) }.to raise_error(NoMethodError)
8
+ end
9
+
10
+ it "should add new adapter and return added class" do
11
+ expect(described_class.add_adapter(:typhoeus_fetcher, Scruber::FetcherAdapters::TyphoeusFetcher)).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
12
+ expect(described_class._adapters.keys).to include(:typhoeus_fetcher)
13
+ end
14
+ end
15
+
16
+ describe "adapter" do
17
+ it "should return default adapter" do
18
+ expect(described_class.adapter).to eq(Scruber::FetcherAdapters::TyphoeusFetcher)
19
+ end
20
+ end
21
+
22
+ describe "new" do
23
+ it "should return instance of default adapter" do
24
+ expect(described_class.new).to be_a(Scruber::FetcherAdapters::TyphoeusFetcher)
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,4 @@
1
+ postal_code,r10,country,state
2
+ 10001,true,US,NY
3
+ 54914,false,US,WI
4
+ 10002,true,US,NY
@@ -0,0 +1,5 @@
1
+ <items>
2
+ <item r10="true" country="US" state="NY" postal_code="10001" />
3
+ <item r10="false" country="US" state="WI" postal_code="54914" />
4
+ <item r10="true" country="US" state="NY" postal_code="10002" />
5
+ </items>
@@ -0,0 +1,5 @@
1
+ <records>
2
+ <record r10="true" country="US" state="NY" postal_code="10001" />
3
+ <record r10="false" country="US" state="WI" postal_code="54914" />
4
+ <record r10="true" country="US" state="NY" postal_code="10002" />
5
+ </records>
@@ -0,0 +1,36 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::DictionaryReader::Csv do
4
+
5
+ describe "register" do
6
+ it "should correctly read first element" do
7
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
8
+
9
+ result = nil
10
+ cl.read do |obj|
11
+ result = obj
12
+ end
13
+ expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
14
+ end
15
+
16
+ it "should read 3 elements total" do
17
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
18
+
19
+ count = 0
20
+ cl.read do |obj|
21
+ count += 1
22
+ end
23
+ expect(count).to eq(3)
24
+ end
25
+
26
+ it "should read 1 elements with state=WI" do
27
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.csv')
28
+
29
+ results = []
30
+ cl.read({state: 'WI'}) do |obj|
31
+ results.push obj.sort
32
+ end
33
+ expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,46 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Scruber::Helpers::DictionaryReader::Xml do
4
+
5
+ describe "register" do
6
+ it "should correctly read first element" do
7
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
8
+
9
+ result = nil
10
+ cl.read do |obj|
11
+ result = obj
12
+ end
13
+ expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
14
+ end
15
+
16
+ it "should correctly read first element with different selector" do
17
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict_records.xml')
18
+
19
+ result = nil
20
+ cl.read(selector: 'record') do |obj|
21
+ result = obj
22
+ end
23
+ expect(result.sort).to eq({"r10"=>"true", "country"=>"US", "state"=>"NY", "postal_code"=>"10002"}.sort)
24
+ end
25
+
26
+ it "should read 3 elements total" do
27
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
28
+
29
+ count = 0
30
+ cl.read do |obj|
31
+ count += 1
32
+ end
33
+ expect(count).to eq(3)
34
+ end
35
+
36
+ it "should read 1 elements with state=WI" do
37
+ cl = described_class.new(File.expand_path(File.dirname(__FILE__))+'/dict.xml')
38
+
39
+ results = []
40
+ cl.read({state: 'WI'}) do |obj|
41
+ results.push obj.sort
42
+ end
43
+ expect(results).to eq([{"r10"=>"false", "country"=>"US", "state"=>"WI", "postal_code"=>"54914"}.sort])
44
+ end
45
+ end
46
+ end