scruber 0.1.5 → 0.1.6

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the registry.
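spec/scruber_spec.rb (new file; filename inferred from the `files:` entries added in the metadata diff below)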
@@ -0,0 +1,198 @@
+ require "spec_helper"
+
+ RSpec.describe Scruber do
+   before do
+     Scruber::Helpers::UserAgentRotator.configure do
+       clean
+       set_filter :all
+       add "Scruber 1.0", tags: [:robot, :scruber]
+       add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
+     end
+   end
+
+   it "has a version number" do
+     expect(Scruber::VERSION).not_to be nil
+   end
+
+   describe "configurable" do
+     before do
+       Scruber.configure do |config|
+         config.fetcher_adapter = :typhoeus_fetcher
+       end
+     end
+
+     it "returns :typhoeus_fetcher as fetcher" do
+       expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
+     end
+   end
+
+   describe "#run" do
+     context "without args" do
+       it "should raise error" do
+         expect { Scruber.run { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
+       end
+
+       it "should set scraper name from ENV" do
+         ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
+         Scruber.run do
+           $scraper_name = scraper_name
+         end
+         expect($scraper_name).to eq(:sample)
+       end
+     end
+
+     context "with args" do
+       it "should set scraper name from first arg" do
+         Scruber.run :sample1 do
+           $scraper_name = scraper_name
+         end
+         expect($scraper_name).to eq(:sample1)
+       end
+
+       it "should set scraper name from first arg, and options from second" do
+         Scruber.run :sample2, queue_adapter: :test do
+           $scraper_name = scraper_name
+           $opt = Scruber.configuration.queue_adapter
+         end
+         expect($scraper_name).to eq(:sample2)
+         expect($opt).to eq(:test)
+       end
+
+       it "options from first arg and scraper_name from ENV" do
+         ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
+         Scruber.run queue_adapter: :test2 do
+           $scraper_name = scraper_name
+           $opt = Scruber.configuration.queue_adapter
+         end
+         expect($scraper_name).to eq(:sample)
+         expect($opt).to eq(:test2)
+       end
+
+       it "should raise error if passed only options without ENV" do
+         ENV['SCRUBER_SCRAPER_NAME'] = nil
+         expect { Scruber.run(queue_adapter: :test2) { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
+       end
+     end
+
+     it "simple example" do
+       stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
+
+       Scruber.run :sample do
+         queue.add "http://example.com"
+
+         parser :seed do |page|
+           $title = page.response_body
+         end
+       end
+       expect($title).to eq('Example Domain')
+     end
+
+     it "should return Nokogiri object" do
+       stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
+
+       Scruber.run :sample do
+         queue.add "http://example.com/contacts.html"
+
+         parser :seed, format: :html do |page, html|
+           $title = html.at('a').text
+         end
+       end
+       expect($title).to eq('Contacts')
+     end
+
+     context "complex example" do
+       it "should parse pages in 2 steps" do
+         stub_request(:get, "http://example.com/catalog").to_return(body: '<div><a href="/product1">Product 1</a><a href="/product2">Product 2</a><a href="/product3">Product 3</a></div>')
+         stub_request(:get, "http://example.com/product1").to_return(body: '<div><h1>Product 1</h1></div>')
+         stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
+         stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
+
+         $products = []
+         Scruber.run :sample do
+           get "http://example.com/catalog"
+
+           parse :html do |page, doc|
+             doc.search('a').each do |a|
+               get_product URI.join(page.url, a.attr('href')).to_s
+             end
+           end
+
+           parse_product :html do |page, doc|
+             $products.push doc.at('h1').text
+           end
+         end
+         expect($products.sort).to eq((1..3).map { |i| "Product #{i}" }.sort)
+       end
+
+       it "should redownload page and increase retry" do
+         stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
+
+         Scruber.run :sample do
+           get "http://example.com/"
+
+           parse :html do |page, doc|
+             if page.response_body =~ /blocked/
+               page.redownload!
+             else
+               $title = doc.at('h1').text
+               $retry_count = page.retry_count
+             end
+           end
+         end
+         expect($title).to eq('Product')
+         expect($retry_count).to eq(2)
+       end
+     end
+
+     context "processing error examples" do
+       it "should process 500 error page" do
+         stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
+
+         $error_title = nil
+         Scruber.run :sample do
+           get "http://example.com", max_retry_times: 1
+
+           parse :html do |page, doc|
+             $error_title = doc.at('h1').text
+           end
+
+           on_page_error do |page|
+             $error_title = page.response_body
+             page.processed!
+           end
+         end
+         expect($error_title).to eq('<div><h1>500</h1></div>')
+       end
+
+       it "should process 404 error page" do
+         stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
+
+         $error_title = nil
+         Scruber.run :sample do
+           get "http://example.com", max_retry_times: 1
+
+           parse :html do |page, doc|
+             $error_title = doc.at('h1').text
+           end
+
+           on_page_error do |page|
+             $error_title = page.response_body
+             page.processed!
+           end
+         end
+         expect($error_title).to eq('<div><h1>404</h1></div>')
+       end
+     end
+   end
+
+   describe "#root" do
+     it "should return nil without APP_PATH defined" do
+       expect(Scruber.root).to eq(nil)
+     end
+
+     it "should return path object" do
+       APP_PATH = '/tmp/a/b/'
+       expect(Scruber.root.to_s).to eq('/tmp')
+     end
+   end
+ end
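spec/spec_helper.rb (new file; filename inferred from its content and the metadata file list below)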
@@ -0,0 +1,36 @@
+ require "bundler/setup"
+ require "scruber"
+ require 'webmock/rspec'
+
+ Encoding.default_external = Encoding::UTF_8
+ Encoding.default_internal = Encoding::UTF_8
+
+ Dir[File.expand_path(File.dirname(__FILE__))+"/support/**/*.rb"].each { |f| require f }
+
+ Scruber::Helpers::UserAgentRotator.configure do
+   set_filter :all
+   add "Scruber 1.0", tags: [:robot, :scruber]
+   add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
+ end
+
+ RSpec.configure do |config|
+   # Enable flags like --only-failures and --next-failure
+   config.example_status_persistence_file_path = ".rspec_status"
+
+   # Disable RSpec exposing methods globally on `Module` and `main`
+   config.disable_monkey_patching!
+
+   config.expect_with :rspec do |c|
+     c.syntax = :expect
+   end
+
+   # Use color in STDOUT
+   config.color = true
+
+   # Use color not only in STDOUT but also in pagers and files
+   config.tty = true
+
+   # Use the specified formatter
+   config.formatter = :progress # :documentation, :html, :textmate
+
+ end
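spec/support/queue/queue_adapter.rb (new file; filename inferred from the metadata file list below)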
@@ -0,0 +1,171 @@
+ require "spec_helper"
+
+ RSpec.shared_examples "queue_adapter" do
+
+   it "should update page" do
+     queue.add "http://example.com"
+     page = queue.fetch_pending
+     page.url = "http://example.net"
+     page.save
+     page = queue.fetch_pending
+     expect(page.url).to eq("http://example.net")
+   end
+
+   it "should update page and fetch downloaded page" do
+     queue.add "http://example.com"
+     page = queue.fetch_pending
+     page.fetched_at = Time.now.to_i
+     page.save
+     pending_page = queue.fetch_pending
+     downloaded_page = queue.fetch_downloaded
+     expect(pending_page).to eq(nil)
+     expect(downloaded_page.url).to eq("http://example.com")
+   end
+
+   describe "processing errors page" do
+     it "should fetch error page" do
+       queue.add "http://example.com"
+       page = queue.fetch_pending
+       page.retry_count = 5
+       page.max_retry_times = 5
+       page.save
+       error_page = queue.fetch_error
+       expect(error_page).not_to eq(nil)
+       expect(error_page.id).to eq(page.id)
+     end
+
+     it "should return page to downloading" do
+       queue.add "http://example.com"
+       page = queue.fetch_pending
+       page.retry_count = 5
+       page.max_retry_times = 5
+       page.save
+       error_page = queue.fetch_error
+       error_page.redownload!(0)
+       pending_page = queue.fetch_pending
+       err_page = queue.fetch_error
+       d_page = queue.fetch_downloaded
+       expect(error_page.id).to eq(pending_page.id)
+       expect(err_page).to be_nil
+       expect(d_page).to be_nil
+     end
+
+     it "should delete page from queue" do
+       queue.add "http://example.com"
+       page = queue.fetch_pending
+       page.retry_count = 5
+       page.max_retry_times = 5
+       page.save
+       error_page = queue.fetch_error
+       error_page.delete
+       pending_page = queue.fetch_pending
+       err_page = queue.fetch_error
+       d_page = queue.fetch_downloaded
+       expect(pending_page).to be_nil
+       expect(err_page).to be_nil
+       expect(d_page).to be_nil
+     end
+
+     it "should process page" do
+       queue.add "http://example.com"
+       page = queue.fetch_pending
+       page.retry_count = 5
+       page.max_retry_times = 5
+       page.save
+       error_page = queue.fetch_error
+       error_page.processed!
+       pending_page = queue.fetch_pending
+       err_page = queue.fetch_error
+       d_page = queue.fetch_downloaded
+       expect(pending_page).to be_nil
+       expect(err_page).to be_nil
+       expect(d_page).to be_nil
+       queue.add "http://example.com"
+       pending_page = queue.fetch_pending
+       expect(pending_page).to be_nil
+     end
+   end
+
+   context "#add" do
+     it "queue page for downloading" do
+       queue.add "http://example.com"
+       expect(queue.size).to eq(1)
+     end
+
+     it "should not add the same page twice" do
+       queue.add "http://example.com"
+       expect(queue.size).to eq(1)
+       queue.add "http://example.com"
+       expect(queue.size).to eq(1)
+     end
+
+     it "should not add the same page twice even if page was processed" do
+       queue.add "http://example.com"
+       page = queue.fetch_pending
+       page.fetched_at = Time.now.to_i
+       page.save
+       downloaded_page = queue.fetch_downloaded
+       downloaded_page.processed!
+       queue.add "http://example.com"
+       page = queue.fetch_pending
+       expect(page).to eq(nil)
+     end
+   end
+
+   context "#save" do
+     it "should delete page" do
+       queue.add "http://example.abc"
+       page = queue.fetch_pending
+       page.fetched_at = Time.now.to_i
+       page.save
+       page.delete
+       page = queue.fetch_downloaded
+
+       expect(page).to eq(nil)
+     end
+
+     it "should save additional arguments" do
+       queue.add "http://example.abc", id: 'abc', test_id: '1'
+       page = queue.find 'abc'
+
+       expect(page.options[:test_id]).to eq('1')
+     end
+
+     it "should not override page" do
+       queue.add "http://example.abc", id: 'abc'
+       page = queue.find 'abc'
+       page.fetched_at = 1
+       page.save
+       page = queue.find 'abc'
+       expect(page.fetched_at).to eq(1)
+       queue.add "http://example.abc", id: 'abc'
+       page = queue.find 'abc'
+       expect(page.fetched_at).to eq(1)
+     end
+   end
+
+   context "#processed!" do
+     it "should update page and set processed_at" do
+       queue.add "http://example.com"
+       page = queue.fetch_pending
+       page.fetched_at = Time.now.to_i
+       page.save
+       downloaded_page = queue.fetch_downloaded
+       downloaded_page.processed!
+       downloaded_page2 = queue.fetch_downloaded
+       expect(downloaded_page2).to eq(nil)
+       expect(downloaded_page.processed_at).to be >= 0
+     end
+   end
+
+   describe "Page" do
+     let(:page_class){ described_class.const_get(:Page) }
+
+     it "should generate different ids for different urls" do
+       page1 = page_class.new queue, url: "http://example.com/product1"
+       page2 = page_class.new queue, url: "http://example.com/product2"
+       expect(page1.id).not_to be_blank
+       expect(page1.id).not_to eq(page2.id)
+     end
+   end
+ end
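For context: the shared example group above references a `queue` helper and calls `described_class`, so it is presumably included from the adapter specs listed in the metadata (e.g. spec/queue_adapter/memory_spec.rb) roughly as follows. This is a hypothetical sketch, not part of the diff; the class name is inferred from lib/scruber/queue_adapters/memory.rb, and the no-argument constructor is an assumption.

  require "spec_helper"

  RSpec.describe Scruber::QueueAdapters::Memory do
    # The host spec must define `queue` for the shared examples;
    # assuming the in-memory adapter needs no constructor arguments.
    let(:queue){ described_class.new }

    it_behaves_like "queue_adapter"
  end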
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: scruber
  version: !ruby/object:Gem::Version
- version: 0.1.5
+ version: 0.1.6
  platform: ruby
  authors:
  - Ivan Goncharov
@@ -269,6 +269,31 @@ files:
  - lib/scruber/queue_adapters/memory.rb
  - lib/scruber/version.rb
  - scruber.gemspec
+ - spec/core/extensions/csv_output_spec.rb
+ - spec/core/extensions/dict.csv
+ - spec/core/extensions/log_spec.rb
+ - spec/core/extensions/loop_spec.rb
+ - spec/core/extensions/parser_aliases_spec.rb
+ - spec/core/extensions/queue_aliases_spec.rb
+ - spec/core/extensions/seed_spec.rb
+ - spec/fetcher.rb
+ - spec/helpers/dictionary_reader/dict.csv
+ - spec/helpers/dictionary_reader/dict.xml
+ - spec/helpers/dictionary_reader/dict_records.xml
+ - spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb
+ - spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb
+ - spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb
+ - spec/helpers/fetcher_agent_adapters/memory_spec.rb
+ - spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb
+ - spec/helpers/proxy_rotator/proxy_rotator_spec.rb
+ - spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb
+ - spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb
+ - spec/helpers/user_agent_rotator/user_agents.xml
+ - spec/queue_adapter/memory_spec.rb
+ - spec/queue_spec.rb
+ - spec/scruber_spec.rb
+ - spec/spec_helper.rb
+ - spec/support/queue/queue_adapter.rb
  homepage: https://github.com/scruber/scruber
  licenses:
  - MIT