scruber 0.1.5 → 0.1.6

spec/scruber_spec.rb ADDED
@@ -0,0 +1,198 @@
+require "spec_helper"
+
+RSpec.describe Scruber do
+  before do
+    Scruber::Helpers::UserAgentRotator.configure do
+      clean
+      set_filter :all
+      add "Scruber 1.0", tags: [:robot, :scruber]
+      add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
+    end
+  end
+
+  it "has a version number" do
+    expect(Scruber::VERSION).not_to be nil
+  end
+
+  describe "configurable" do
+    before do
+      Scruber.configure do |config|
+        config.fetcher_adapter = :typhoeus_fetcher
+      end
+    end
+
+    it "returns :typhoeus_fetcher as fetcher" do
+      expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
+    end
+  end
+
+  describe "#run" do
+    context "without args" do
+      it "should raise error" do
+        expect { Scruber.run { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
+      end
+
+      it "should set scraper name from ENV" do
+        ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
+        Scruber.run do
+          $scraper_name = scraper_name
+        end
+        expect($scraper_name).to eq(:sample)
+      end
+    end
+
+    context "with args" do
+      it "should set scraper name from first arg" do
+        Scruber.run :sample1 do
+          $scraper_name = scraper_name
+        end
+        expect($scraper_name).to eq(:sample1)
+      end
+
+      it "should set scraper name from first arg, and options from second" do
+        Scruber.run :sample2, queue_adapter: :test do
+          $scraper_name = scraper_name
+          $opt = Scruber.configuration.queue_adapter
+        end
+        expect($scraper_name).to eq(:sample2)
+        expect($opt).to eq(:test)
+      end
+
+      it "options from first arg and scraper_name from ENV" do
+        ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
+        Scruber.run queue_adapter: :test2 do
+          $scraper_name = scraper_name
+          $opt = Scruber.configuration.queue_adapter
+        end
+        expect($scraper_name).to eq(:sample)
+        expect($opt).to eq(:test2)
+      end
+
+      it "should raise error if passed only options without ENV" do
+        ENV['SCRUBER_SCRAPER_NAME'] = nil
+        expect { Scruber.run(queue_adapter: :test2) { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
+      end
+    end
+
+    it "simple example" do
+      stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
+
+      Scruber.run :sample do
+        queue.add "http://example.com"
+
+        parser :seed do |page|
+          $title = page.response_body
+        end
+      end
+      expect($title).to eq('Example Domain')
+    end
+
+    it "should return Nokogiri object" do
+      stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
+
+      Scruber.run :sample do
+        queue.add "http://example.com/contacts.html"
+
+        parser :seed, format: :html do |page, html|
+          $title = html.at('a').text
+        end
+      end
+      expect($title).to eq('Contacts')
+    end
+
+    context "complex example" do
+      it "should parse pages in 2 steps" do
+        stub_request(:get, "http://example.com/catalog").to_return(body: '<div><a href="/product1">Product 1</a><a href="/product2">Product 2</a><a href="/product3">Product 3</a></div>')
+        stub_request(:get, "http://example.com/product1").to_return(body: '<div><h1>Product 1</h1></div>')
+        stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
+        stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
+
+        $products = []
+        Scruber.run :sample do
+          get "http://example.com/catalog"
+
+          parse :html do |page, doc|
+            doc.search('a').each do |a|
+              get_product URI.join(page.url, a.attr('href')).to_s
+            end
+          end
+
+          parse_product :html do |page, doc|
+            $products.push doc.at('h1').text
+          end
+        end
+        expect($products.sort).to eq((1..3).map{ |i| "Product #{i}" }.sort)
+      end
+
+      it "should redownload page and increase retry" do
+        stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
+
+        Scruber.run :sample do
+          get "http://example.com/"
+
+          parse :html do |page, doc|
+            if page.response_body =~ /blocked/
+              page.redownload!
+            else
+              $title = doc.at('h1').text
+              $retry_count = page.retry_count
+            end
+          end
+        end
+        expect($title).to eq('Product')
+        expect($retry_count).to eq(2)
+      end
+    end
+
+    context "processing error examples" do
+      it "should process 500 error page" do
+        stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
+
+        $error_title = nil
+        Scruber.run :sample do
+          get "http://example.com", max_retry_times: 1
+
+          parse :html do |page, doc|
+            $error_title = doc.at('h1').text
+          end
+
+          on_page_error do |page|
+            $error_title = page.response_body
+            page.processed!
+          end
+        end
+        expect($error_title).to eq('<div><h1>500</h1></div>')
+      end
+
+      it "should process 404 error page" do
+        stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
+
+        $error_title = nil
+        Scruber.run :sample do
+          get "http://example.com", max_retry_times: 1
+
+          parse :html do |page, doc|
+            $error_title = doc.at('h1').text
+          end
+
+          on_page_error do |page|
+            $error_title = page.response_body
+            page.processed!
+          end
+        end
+        expect($error_title).to eq('<div><h1>404</h1></div>')
+      end
+    end
+  end
+
+  describe "#root" do
+    it "should return nil without APP_PATH defined" do
+      expect(Scruber.root).to eq(nil)
+    end
+
+    it "should return path object" do
+      APP_PATH = '/tmp/a/b/'
+      expect(Scruber.root.to_s).to eq('/tmp')
+    end
+  end
+end
spec/spec_helper.rb ADDED
@@ -0,0 +1,36 @@
+require "bundler/setup"
+require "scruber"
+require 'webmock/rspec'
+
+Encoding.default_external = Encoding::UTF_8
+Encoding.default_internal = Encoding::UTF_8
+
+Dir[File.expand_path(File.dirname(__FILE__))+"/support/**/*.rb"].each { |f| require f }
+
+Scruber::Helpers::UserAgentRotator.configure do
+  set_filter :all
+  add "Scruber 1.0", tags: [:robot, :scruber]
+  add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
+end
+
+RSpec.configure do |config|
+  # Enable flags like --only-failures and --next-failure
+  config.example_status_persistence_file_path = ".rspec_status"
+
+  # Disable RSpec exposing methods globally on `Module` and `main`
+  config.disable_monkey_patching!
+
+  config.expect_with :rspec do |c|
+    c.syntax = :expect
+  end
+
+  # Use color in STDOUT
+  config.color = true
+
+  # Use color not only in STDOUT but also in pagers and files
+  config.tty = true
+
+  # Use the specified formatter
+  config.formatter = :progress # :documentation, :html, :textmate
+
+end
spec/support/queue/queue_adapter.rb ADDED
@@ -0,0 +1,171 @@
+require "spec_helper"
+
+RSpec.shared_examples "queue_adapter" do
+
+  it "should update page" do
+    queue.add "http://example.com"
+    page = queue.fetch_pending
+    page.url = "http://example.net"
+    page.save
+    page = queue.fetch_pending
+    expect(page.url).to eq("http://example.net")
+  end
+
+  it "should update page and fetch downloaded page" do
+    queue.add "http://example.com"
+    page = queue.fetch_pending
+    page.fetched_at = Time.now.to_i
+    page.save
+    pending_page = queue.fetch_pending
+    downloaded_page = queue.fetch_downloaded
+    expect(pending_page).to eq(nil)
+    expect(downloaded_page.url).to eq("http://example.com")
+  end
+
+  describe "processing errors page" do
+    it "should fetch error page" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      expect(error_page).not_to eq(nil)
+      expect(error_page.id).to eq(page.id)
+    end
+
+    it "should return page to downloading" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      error_page.redownload!(0)
+      pending_page = queue.fetch_pending
+      err_page = queue.fetch_error
+      d_page = queue.fetch_downloaded
+      expect(error_page.id).to eq(pending_page.id)
+      expect(err_page).to be_nil
+      expect(d_page).to be_nil
+    end
+
+    it "should delete page from queue" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      error_page.delete
+      pending_page = queue.fetch_pending
+      err_page = queue.fetch_error
+      d_page = queue.fetch_downloaded
+      expect(pending_page).to be_nil
+      expect(err_page).to be_nil
+      expect(d_page).to be_nil
+    end
+
+    it "should process page" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      error_page.processed!
+      pending_page = queue.fetch_pending
+      err_page = queue.fetch_error
+      d_page = queue.fetch_downloaded
+      expect(pending_page).to be_nil
+      expect(err_page).to be_nil
+      expect(d_page).to be_nil
+      queue.add "http://example.com"
+      pending_page = queue.fetch_pending
+      expect(pending_page).to be_nil
+    end
+  end
+
+  context "#add" do
+    it "queue page for downloading" do
+      queue.add "http://example.com"
+      expect(queue.size).to eq(1)
+    end
+
+    it "should not add the same page twice" do
+      queue.add "http://example.com"
+      expect(queue.size).to eq(1)
+      queue.add "http://example.com"
+      expect(queue.size).to eq(1)
+    end
+
+    it "should not add the same page twice even if page was processed" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.fetched_at = Time.now.to_i
+      page.save
+      downloaded_page = queue.fetch_downloaded
+      downloaded_page.processed!
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      expect(page).to eq(nil)
+    end
+  end
+
+  context "#save" do
+    it "should delete page" do
+      queue.add "http://example.abc"
+      page = queue.fetch_pending
+      page.fetched_at = Time.now.to_i
+      page.save
+      page.delete
+      page = queue.fetch_downloaded
+
+      expect(page).to eq(nil)
+    end
+
+    it "should save additional arguments" do
+      queue.add "http://example.abc", id: 'abc', test_id: '1'
+      page = queue.find 'abc'
+
+      expect(page.options[:test_id]).to eq('1')
+    end
+
+    it "should not override page" do
+      queue.add "http://example.abc", id: 'abc'
+      page = queue.find 'abc'
+      page.fetched_at = 1
+      page.save
+      page = queue.find 'abc'
+      expect(page.fetched_at).to eq(1)
+      queue.add "http://example.abc", id: 'abc'
+      page = queue.find 'abc'
+      expect(page.fetched_at).to eq(1)
+    end
+  end
+
+  context "#processed!" do
+    it "should update page and set processed_at" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.fetched_at = Time.now.to_i
+      page.save
+      downloaded_page = queue.fetch_downloaded
+      downloaded_page.processed!
+      downloaded_page2 = queue.fetch_downloaded
+      expect(downloaded_page2).to eq(nil)
+      expect(downloaded_page.processed_at).to be >= 0
+    end
+  end
+
+  describe "Page" do
+    let(:page_class){ described_class.const_get(:Page) }
+
+    it "should generate different ids for different urls" do
+      page1 = page_class.new queue, url: "http://example.com/product1"
+      page2 = page_class.new queue, url: "http://example.com/product2"
+      expect(page1.id).not_to be_blank
+      expect(page1.id).not_to eq(page2.id)
+    end
+  end
+end
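The shared example group above is consumed by individual adapter specs via RSpec's `it_behaves_like`. A minimal sketch of such a spec, loosely modeled on the `spec/queue_adapter/memory_spec.rb` listed in the metadata below (the zero-argument constructor is an assumption, not confirmed by this diff):

```ruby
require "spec_helper"

# Hypothetical adapter spec; the real file ships with the gem.
RSpec.describe Scruber::QueueAdapters::Memory do
  # The shared examples call `queue`, so the including spec must define it.
  let(:queue) { described_class.new }

  # Runs every example defined in RSpec.shared_examples "queue_adapter"
  # against this adapter instance.
  it_behaves_like "queue_adapter"
end
```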
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scruber
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Ivan Goncharov
@@ -269,6 +269,31 @@ files:
 - lib/scruber/queue_adapters/memory.rb
 - lib/scruber/version.rb
 - scruber.gemspec
+- spec/core/extensions/csv_output_spec.rb
+- spec/core/extensions/dict.csv
+- spec/core/extensions/log_spec.rb
+- spec/core/extensions/loop_spec.rb
+- spec/core/extensions/parser_aliases_spec.rb
+- spec/core/extensions/queue_aliases_spec.rb
+- spec/core/extensions/seed_spec.rb
+- spec/fetcher.rb
+- spec/helpers/dictionary_reader/dict.csv
+- spec/helpers/dictionary_reader/dict.xml
+- spec/helpers/dictionary_reader/dict_records.xml
+- spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb
+- spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb
+- spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb
+- spec/helpers/fetcher_agent_adapters/memory_spec.rb
+- spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb
+- spec/helpers/proxy_rotator/proxy_rotator_spec.rb
+- spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb
+- spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb
+- spec/helpers/user_agent_rotator/user_agents.xml
+- spec/queue_adapter/memory_spec.rb
+- spec/queue_spec.rb
+- spec/scruber_spec.rb
+- spec/spec_helper.rb
+- spec/support/queue/queue_adapter.rb
 homepage: https://github.com/scruber/scruber
 licenses:
 - MIT