scruber 0.1.6 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b845332207b108efa91983b4721cf7631120ea36
4
- data.tar.gz: 4c9d931ccdbf777c9d469d7cf2697f00acfe94dc
3
+ metadata.gz: b74171eb49e730929f3b303cc1b65f7d55eb54f5
4
+ data.tar.gz: 67debe50e980a21e66fc9966e509b9c4fb65fc2b
5
5
  SHA512:
6
- metadata.gz: b7c9bc638e7f168401bfd15de02746b691c5477563404db66810f894e7a1925821064b096d47c065209223856782e787bdc3f6272a179068aa0fdfcb6c14994d
7
- data.tar.gz: f4c73d7e94e795b32c647285f320cf9e25b9f6efa0c528aaa3138ab99c263627f9ad47088851126d2df07421e633037fa3e8f434d7303a91ad350804d9a64903
6
+ metadata.gz: 2f997b9b072270c8a28dc00f6dcde03a9041adb8f1fc6245d862f4423e3d155bf9e6fe478c11fe0d6081a878407fdc35792fe3a17dbf93b3f566daeb3fd22a60
7
+ data.tar.gz: a5d4f2bafe347e9a775bf01cbfd2170678596fd133cb788a444099a03a95bcc7f72116bd725f32506766f9dbb0756c06f1e3dc909e526d684cfa14e3535f6ae2
@@ -23,6 +23,7 @@ require "scruber/core/page_format"
23
23
  require "scruber/core/page_format/base"
24
24
  require "scruber/core/page_format/xml"
25
25
  require "scruber/core/page_format/html"
26
+ require "scruber/core/page_format/json"
26
27
 
27
28
  require "scruber/core/extensions/base"
28
29
  require "scruber/core/extensions/loop"
@@ -58,7 +59,7 @@ module Scruber
58
59
 
59
60
  def run(*args, &block)
60
61
  raise "You need a block to build!" unless block_given?
61
-
62
+
62
63
  Core::Crawler.new(*args).run(&block)
63
64
  end
64
65
 
@@ -1,10 +1,10 @@
1
1
  module Scruber
2
2
  module Core
3
- #
3
+ #
4
4
  # Crawler class
5
- #
5
+ #
6
6
  # Main class-runner for scrapers.
7
- #
7
+ #
8
8
  # @example Simple scraper
9
9
  # Scruber::Core::Crawler.new(:simple) do
10
10
  # get 'http://example.com'
@@ -12,29 +12,29 @@ module Scruber
12
12
  # puts html.at('title').text
13
13
  # end
14
14
  # end
15
- #
15
+ #
16
16
  # @author Ivan Goncharov
17
- #
17
+ #
18
18
  class Crawler
19
19
  attr_reader :queue, :fetcher, :scraper_name
20
20
 
21
- #
21
+ #
22
22
  # Initialize crawler with scraper name and/or with options
23
- #
23
+ #
24
24
  # Crawler.new(:sample, fetcher_adapter: :custom)
25
25
  # Crawler.new(:sample)
26
26
  # Crawler.new(fetcher_adapter: :custom)
27
- #
27
+ #
28
28
  # @param args [Array] if first arg is a Symbol, it will be used as scraper_name, hash will be used as configuration options (see {Scruber::Core::Configuration})
29
- #
29
+ #
30
30
  # @return [Scruber::Core::Crawler] [description]
31
31
  def initialize(*args)
32
32
  if args.first.is_a?(Hash)
33
33
  scraper_name = nil
34
- options = args.first
34
+ @options = args.first
35
35
  else
36
- scraper_name, options = args
37
- options ||= {}
36
+ scraper_name, @options = args
37
+ @options ||= {}
38
38
  end
39
39
  @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
40
40
  raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
@@ -44,7 +44,7 @@ module Scruber
44
44
  @on_page_error_callback = nil
45
45
  @on_complete_callbacks = []
46
46
 
47
- Scruber.configuration.merge_options(options)
47
+ Scruber.configuration.merge_options(@options)
48
48
  ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
49
49
 
50
50
  @queue = Scruber::Queue.new(scraper_name: @scraper_name)
@@ -53,9 +53,9 @@ module Scruber
53
53
  load_extenstions
54
54
  end
55
55
 
56
- #
56
+ #
57
57
  # Crawling engine
58
- #
58
+ #
59
59
  # @param block [Proc] crawler body
60
60
  def run(&block)
61
61
  instance_eval &block
@@ -75,33 +75,33 @@ module Scruber
75
75
  end
76
76
  end
77
77
  end
78
- @on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
78
+ @on_complete_callbacks.sort_by{|c| -c[0] }.map do |(_,callback)|
79
79
  instance_exec &(callback)
80
- end
80
+ end.first
81
81
  end
82
82
 
83
- #
83
+ #
84
84
  # Register parser
85
- #
85
+ #
86
86
  # @param page_type [Symbol] type of page
87
87
  # @param options [Hash] options for parser
88
- # @option options [Symbol] :format format of page. Scruber automatically process
88
+ # @option options [Symbol] :format format of page. Scruber automatically process
89
89
  # page body depends on this format. For example :json or :html
90
90
  # @param block [Proc] body of parser
91
- #
91
+ #
92
92
  # @return [void]
93
93
  def parser(page_type, options={}, &block)
94
94
  register_callback(page_type, options, &block)
95
95
  end
96
96
 
97
- #
97
+ #
98
98
  # Method missing callback. Scruber allows to register
99
99
  # regexp and proc body to process calls
100
- #
100
+ #
101
101
  # @param method_sym [Symbol] missing method name
102
102
  # @param arguments [Array] arguments
103
103
  # @param block [Proc] block (if passed)
104
- #
104
+ #
105
105
  # @return [type] [description]
106
106
  def method_missing(method_sym, *arguments, &block)
107
107
  Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
@@ -124,27 +124,27 @@ module Scruber
124
124
 
125
125
  class << self
126
126
 
127
- #
127
+ #
128
128
  # Register method missing callback
129
- #
129
+ #
130
130
  # @param pattern [Regexp] Regexp to match missing name
131
131
  # @param block [Proc] Body to process missing method
132
- #
132
+ #
133
133
  # @return [void]
134
134
  def register_method_missing(pattern, &block)
135
135
  _registered_method_missings[pattern] = block
136
136
  end
137
137
 
138
- #
138
+ #
139
139
  # Registered method missing callbacks dictionary
140
- #
140
+ #
141
141
  # @return [Hash] callbacks
142
142
  def _registered_method_missings
143
143
  @registered_method_missings ||= {}
144
144
  end
145
145
  end
146
146
 
147
- #
147
+ #
148
148
  # Register callback which will be executed when
149
149
  # downloading and parsing will be completed.
150
150
  # For example when you need to write results to file,
@@ -153,16 +153,16 @@ module Scruber
153
153
  # on_complete -1 do
154
154
  # Scruber::Core::Extensions::CsvOutput.close_all
155
155
  # end
156
- #
156
+ #
157
157
  # @param priority [Integer] priority of this callback
158
158
  # @param block [Proc] body of callback
159
- #
159
+ #
160
160
  # @return [void]
161
161
  def on_complete(priority=1, &block)
162
162
  @on_complete_callbacks.push [priority,block]
163
163
  end
164
164
 
165
- #
165
+ #
166
166
  # Register callback which will be executed for
167
167
  # error pages, like 404 or 500
168
168
  # Attention! You should call one of these methods for page
@@ -178,9 +178,9 @@ module Scruber
178
178
  # page.delete
179
179
  # end
180
180
  # end
181
- #
181
+ #
182
182
  # @param block [Proc] body of callback
183
- #
183
+ #
184
184
  # @return [void]
185
185
  def on_page_error(&block)
186
186
  @on_page_error_callback = block
@@ -188,46 +188,46 @@ module Scruber
188
188
 
189
189
  private
190
190
 
191
- #
191
+ #
192
192
  # Register parser
193
- #
193
+ #
194
194
  # @param page_type [Symbol] type of page
195
195
  # @param options [Hash] options for parser
196
- # @option options [Symbol] :format format of page. Scruber automatically process
196
+ # @option options [Symbol] :format format of page. Scruber automatically process
197
197
  # page body depends on this format. For example :json or :html
198
198
  # @param block [Proc] body of parser
199
- #
199
+ #
200
200
  # @return [void]
201
201
  def register_callback(page_type, options, &block)
202
202
  @callbacks_options[page_type.to_sym] = options || {}
203
203
  @callbacks[page_type.to_sym] = block
204
204
  end
205
205
 
206
- #
206
+ #
207
207
  # Process page body depending on the format of this page
208
208
  # For example, if page_format = :html, then
209
209
  # it will return Nokogiri::HTML(page.response_body)
210
- #
210
+ #
211
211
  # @param page [Page] page from queue
212
212
  # @param page_type [Symbol] name of parser
213
- #
213
+ #
214
214
  # @return [Object] depends on page_type it will return different objects
215
215
  def process_page(page, page_type)
216
216
  page_format = @callbacks_options[page_type].fetch(:format){ nil }
217
217
  Scruber::Core::PageFormat.process(page, page_format)
218
218
  end
219
219
 
220
- #
220
+ #
221
221
  # Loads all extensions
222
- #
222
+ #
223
223
  # @return [void]
224
224
  def load_extenstions
225
225
  Scruber::Core::Extensions::Base.descendants.each(&:register)
226
226
  end
227
227
 
228
- #
228
+ #
229
229
  # Initialize progressbar, that shows progress in console
230
- #
230
+ #
231
231
  # @return [void]
232
232
  def initialize_progressbar
233
233
  unless Scruber.configuration.silent
@@ -243,9 +243,9 @@ module Scruber
243
243
  end
244
244
  end
245
245
 
246
- #
246
+ #
247
247
  # Output progress to console
248
- #
248
+ #
249
249
  # @return [void]
250
250
  def show_progress
251
251
  if @progressbar
@@ -0,0 +1,13 @@
1
+ module Scruber
2
+ module Core
3
+ module PageFormat
4
+ class Json < Base
5
+ def self.process(page)
6
+ JSON.parse(page.response_body) rescue nil
7
+ end
8
+ end
9
+ end
10
+ end
11
+ end
12
+
13
+ Scruber::Core::PageFormat.add(:json, Scruber::Core::PageFormat::Json)
@@ -1,3 +1,5 @@
1
+ require 'charlock_holmes'
2
+
1
3
  module Scruber
2
4
  module FetcherAdapters
3
5
  class AbstractAdapter
@@ -45,9 +47,24 @@ module Scruber
45
47
  page.fetched_at = Time.now.to_i
46
48
  end
47
49
  end
50
+ if page.response_headers
51
+ page.response_headers = page.response_headers.inject({}) {|acc, (k,v)| acc[k.gsub('.', '_')] = v.is_a?(Array) ? v.map{|v1| convert_to_utf8(v1) } : convert_to_utf8(v); acc }
52
+ end
53
+ page.response_body = convert_to_utf8(page.response_body)
48
54
  page
49
55
  end
50
56
 
57
+ def convert_to_utf8(text)
58
+ unless text.to_s.empty?
59
+ detection = CharlockHolmes::EncodingDetector.detect(text)
60
+ if detection && detection[:encoding].present?
61
+ text = CharlockHolmes::Converter.convert(text, detection[:encoding], 'UTF-8') rescue text
62
+ end
63
+ end
64
+
65
+ text
66
+ end
67
+
51
68
  def headers_for(page)
52
69
  if page.fetcher_agent
53
70
  headers = page.fetcher_agent.headers
@@ -60,7 +60,7 @@ module Scruber
60
60
  def on_complete_callback(page, response)
61
61
  page.response_code = response.code
62
62
  page.response_body = response.body
63
- page.response_headers = response.response_headers
63
+ page.response_headers = response.headers
64
64
  page.response_total_time = response.total_time
65
65
 
66
66
  if response.timed_out?
@@ -134,6 +134,16 @@ module Scruber
134
134
  raise NotImplementedError
135
135
  end
136
136
 
137
+
138
+ #
139
+ # Join url of current page with another path or url
140
+ # @param link_url [String] link
141
+ #
142
+ # @return [String] joined url
143
+ def url_join(link_url)
144
+ URI.join(url, link_url).to_s
145
+ end
146
+
137
147
  def [](k)
138
148
  instance_variable_get("@#{k.to_s}")
139
149
  end
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.9"
3
3
  end
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
37
37
  spec.add_dependency "activesupport", '~> 5.1', '>= 5.1.5'
38
38
  spec.add_dependency "powerbar", '~> 2.0', '>= 2.0.1'
39
39
  spec.add_dependency "paint", '~> 2.0', '>= 2.0.1'
40
+ spec.add_dependency "charlock_holmes", '~> 0.7', '>= 0.7.6'
40
41
  spec.add_runtime_dependency "thor", "0.20.0"
41
42
  spec.add_development_dependency "bundler", "~> 1.15"
42
43
  spec.add_development_dependency "rake", "~> 10.0"
@@ -11,16 +11,19 @@ RSpec.describe Scruber::Core::Extensions::Loop do
11
11
 
12
12
  it "should add dictionary and read info" do
13
13
  Scruber::Core::Extensions::Loop.register
14
- $zip_codes = []
15
- Scruber.run :sample do
14
+ zip_codes = Scruber.run :sample do
16
15
  add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
16
+ @zip_codes = []
17
+
17
18
  seed do
18
19
  loop :zip_codes_usa, state: 'NY' do |row|
19
- $zip_codes.push row['zip']
20
+ @zip_codes.push row['zip']
20
21
  end
21
22
  end
23
+
24
+ on_complete { @zip_codes }
22
25
  end
23
- expect($zip_codes).to eq(['10001', '10002'])
26
+ expect(zip_codes).to eq(['10001', '10002'])
24
27
  end
25
28
  end
26
29
  end
@@ -19,15 +19,17 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
19
19
 
20
20
  stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
21
21
 
22
- Scruber.run :sample do
22
+ page = Scruber.run :sample do
23
23
  get "http://example.com"
24
-
24
+
25
25
  parse do |page|
26
- $page = page
26
+ @queue_page = page
27
27
  end
28
+
29
+ on_complete { @queue_page }
28
30
  end
29
- expect($page.url).to eq("http://example.com")
30
- expect($page.page_type.to_s).to eq("seed")
31
+ expect(page.url).to eq("http://example.com")
32
+ expect(page.page_type.to_s).to eq("seed")
31
33
  end
32
34
 
33
35
  it "should register parser with custom page_type" do
@@ -35,16 +37,18 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
35
37
 
36
38
  stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
37
39
 
38
- Scruber.run :sample do
40
+ page = Scruber.run :sample do
39
41
  post_product "http://example.com"
40
-
42
+
41
43
  parse_product do |page|
42
- $page = page
44
+ @queue_page = page
43
45
  end
46
+
47
+ on_complete { @queue_page }
44
48
  end
45
- expect($page.url).to eq("http://example.com")
46
- expect($page.method.to_s).to eq("post")
47
- expect($page.page_type.to_s).to eq("product")
49
+ expect(page.url).to eq("http://example.com")
50
+ expect(page.method.to_s).to eq("post")
51
+ expect(page.page_type.to_s).to eq("product")
48
52
  end
49
53
  end
50
54
 
@@ -54,17 +58,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
54
58
 
55
59
  stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
56
60
 
57
- Scruber.run :sample do
61
+ page, doc = Scruber.run :sample do
58
62
  get "http://example.com"
59
-
63
+
60
64
  parse :html do |page,doc|
61
- $page = page
62
- $doc = doc
65
+ @queue_page = page
66
+ @doc = doc
63
67
  end
68
+
69
+ on_complete { [@queue_page, @doc] }
64
70
  end
65
- expect($doc.at('span').text).to eq("Example Domain")
66
- expect($page.page_type.to_s).to eq("seed")
67
- expect($page.method.to_s).to eq("get")
71
+ expect(doc.at('span').text).to eq("Example Domain")
72
+ expect(page.page_type.to_s).to eq("seed")
73
+ expect(page.method.to_s).to eq("get")
68
74
  end
69
75
 
70
76
  it "should register parser with custom page_type" do
@@ -72,17 +78,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
72
78
 
73
79
  stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
74
80
 
75
- Scruber.run :sample do
81
+ page, doc = Scruber.run :sample do
76
82
  post_product "http://example.com"
77
-
83
+
78
84
  parse_product :html do |page,doc|
79
- $page = page
80
- $doc = doc
85
+ @queue_page = page
86
+ @doc = doc
81
87
  end
88
+
89
+ on_complete { [@queue_page, @doc] }
82
90
  end
83
- expect($doc.at('span').text).to eq("Example Post")
84
- expect($page.method.to_s).to eq("post")
85
- expect($page.page_type.to_s).to eq("product")
91
+ expect(doc.at('span').text).to eq("Example Post")
92
+ expect(page.method.to_s).to eq("post")
93
+ expect(page.page_type.to_s).to eq("product")
86
94
  end
87
95
  end
88
96
  end
@@ -19,25 +19,29 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
19
19
  it "should add page to queue" do
20
20
  described_class.register
21
21
 
22
- Scruber.run :sample do
22
+ page = Scruber.run :sample do
23
23
  get "http://example.com"
24
- $page = queue.fetch_pending
24
+
25
+ @queue_page = queue.fetch_pending
26
+ on_complete { @queue_page }
25
27
  end
26
- expect($page.url).to eq("http://example.com")
27
- expect($page.method.to_s).to eq("get")
28
- expect($page.page_type.to_s).to eq("seed")
28
+ expect(page.url).to eq("http://example.com")
29
+ expect(page.method.to_s).to eq("get")
30
+ expect(page.page_type.to_s).to eq("seed")
29
31
  end
30
32
 
31
33
  it "should add page to queue" do
32
34
  described_class.register
33
35
 
34
- Scruber.run :sample do
36
+ page = Scruber.run :sample do
35
37
  post_product "http://example.com"
36
- $page = queue.fetch_pending
38
+
39
+ @queue_page = queue.fetch_pending
40
+ on_complete { @queue_page }
37
41
  end
38
- expect($page.url).to eq("http://example.com")
39
- expect($page.method.to_s).to eq("post")
40
- expect($page.page_type).to eq("product")
42
+ expect(page.url).to eq("http://example.com")
43
+ expect(page.method.to_s).to eq("post")
44
+ expect(page.page_type).to eq("product")
41
45
  end
42
46
  end
43
47
 
@@ -45,27 +49,31 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
45
49
  it "should add page to queue" do
46
50
  described_class.register
47
51
 
48
- Scruber.run :sample do
52
+ page = Scruber.run :sample do
49
53
  get "http://example.com", user_agent: 'Agent 1'
50
- $page = queue.fetch_pending
54
+
55
+ @queue_page = queue.fetch_pending
56
+ on_complete { @queue_page }
51
57
  end
52
- expect($page.url).to eq("http://example.com")
53
- expect($page.method.to_s).to eq("get")
54
- expect($page.page_type.to_s).to eq("seed")
55
- expect($page.user_agent).to eq('Agent 1')
58
+ expect(page.url).to eq("http://example.com")
59
+ expect(page.method.to_s).to eq("get")
60
+ expect(page.page_type.to_s).to eq("seed")
61
+ expect(page.user_agent).to eq('Agent 1')
56
62
  end
57
63
 
58
64
  it "should add page to queue" do
59
65
  described_class.register
60
66
 
61
- Scruber.run :sample do
67
+ page = Scruber.run :sample do
62
68
  post_product "http://example.com", user_agent: 'Agent 1'
63
- $page = queue.fetch_pending
69
+
70
+ @queue_page = queue.fetch_pending
71
+ on_complete { @queue_page }
64
72
  end
65
- expect($page.url).to eq("http://example.com")
66
- expect($page.method.to_s).to eq("post")
67
- expect($page.page_type).to eq("product")
68
- expect($page.user_agent).to eq('Agent 1')
73
+ expect(page.url).to eq("http://example.com")
74
+ expect(page.method.to_s).to eq("post")
75
+ expect(page.page_type).to eq("product")
76
+ expect(page.user_agent).to eq('Agent 1')
69
77
  end
70
78
  end
71
79
  end
@@ -14,31 +14,32 @@ RSpec.describe Scruber::Core::Extensions::Seed do
14
14
  stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
15
15
  stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
16
16
  end
17
-
17
+
18
18
  it "should execute seed block" do
19
- $queue_size = 0
20
- Scruber.run :sample do
19
+ queue_size = Scruber.run :sample do
21
20
  seed do
22
21
  get 'http://example.com'
23
22
  end
24
- $queue_size = queue.size
23
+ @queue_size = queue.size
24
+ on_complete { @queue_size }
25
25
  end
26
- expect($queue_size).to eq(1)
26
+ expect(queue_size).to eq(1)
27
27
  end
28
28
 
29
29
  it "should not execute seed block" do
30
- $queue_size = 0
31
- Scruber.run :sample do
30
+ queue_size, page = Scruber.run :sample do
32
31
  seed do
33
32
  get 'http://example.com'
34
33
  end
35
34
  seed do
36
35
  get 'http://example.com/contacts'
37
36
  end
38
- $queue_size = queue.size
39
- $page = queue.fetch_pending
37
+ @queue_size = queue.size
38
+ @queue_page = queue.fetch_pending
39
+
40
+ on_complete { [@queue_size, @queue_page] }
40
41
  end
41
- expect($queue_size).to eq(1)
42
- expect($page.url).to eq("http://example.com")
42
+ expect(queue_size).to eq(1)
43
+ expect(page.url).to eq("http://example.com")
43
44
  end
44
45
  end
@@ -13,7 +13,7 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
13
13
  cookie_jar: cookie_jar_string,
14
14
  disable_proxy: true
15
15
  end
16
-
16
+
17
17
  it "set values" do
18
18
  expect(agent.id).to eq(1)
19
19
  expect(agent.user_agent).to eq('Scruber')
@@ -33,7 +33,8 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
33
33
  end
34
34
 
35
35
  it "parse cookies from page" do
36
- page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-18 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
36
+ page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-#{Date.today.year+1} 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
37
+ puts page.response_cookies.inspect
37
38
  agent.parse_cookies_from_page!(page)
38
39
  expect(agent.cookie_for('http://example.com')).to eq('__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; feed_flow=top')
39
40
  end
@@ -20,7 +20,7 @@ RSpec.describe Scruber do
20
20
  config.fetcher_adapter = :typhoeus_fetcher
21
21
  end
22
22
  end
23
-
23
+
24
24
  it "returns :typhoeus_fetcher as fetcher" do
25
25
  expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
26
26
  end
@@ -34,70 +34,70 @@ RSpec.describe Scruber do
34
34
 
35
35
  it "should set scraper name from ENV" do
36
36
  ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
37
- Scruber.run do
38
- $scraper_name = scraper_name
37
+ name = Scruber.run do
38
+ on_complete { scraper_name }
39
39
  end
40
- expect($scraper_name).to eq(:sample)
40
+ expect(name).to eq(:sample)
41
41
  end
42
42
  end
43
43
 
44
44
  context "with args" do
45
45
  it "should set scraper name from first arg" do
46
- Scruber.run :sample1 do
47
- $scraper_name = scraper_name
46
+ name = Scruber.run :sample1 do
47
+ on_complete { scraper_name }
48
48
  end
49
- expect($scraper_name).to eq(:sample1)
49
+ expect(name).to eq(:sample1)
50
50
  end
51
51
 
52
52
  it "should set scraper name from first arg, and options from second" do
53
- Scruber.run :sample2, queue_adapter: :test do
54
- $scraper_name = scraper_name
55
- $opt = Scruber.configuration.queue_adapter
53
+ name, opt = Scruber.run :sample2, queue_adapter: :test do
54
+ on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
56
55
  end
57
- expect($scraper_name).to eq(:sample2)
58
- expect($opt).to eq(:test)
56
+ expect(name).to eq(:sample2)
57
+ expect(opt).to eq(:test)
59
58
  end
60
59
 
61
60
  it "options from first arg and scraper_name from ENV" do
62
61
  ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
63
- Scruber.run queue_adapter: :test2 do
64
- $scraper_name = scraper_name
65
- $opt = Scruber.configuration.queue_adapter
62
+ name, opt = Scruber.run queue_adapter: :test2 do
63
+ on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
66
64
  end
67
- expect($scraper_name).to eq(:sample)
68
- expect($opt).to eq(:test2)
65
+ expect(name).to eq(:sample)
66
+ expect(opt).to eq(:test2)
69
67
  end
70
68
 
71
69
  it "should raise error if passed only options without ENV" do
72
70
  ENV['SCRUBER_SCRAPER_NAME'] = nil
73
- expect { Scruber.run(queue_adapter: :test2) { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
71
+ expect { Scruber.run(queue_adapter: :test2) { scraper_name } }.to raise_error(Scruber::ArgumentError)
74
72
  end
75
73
  end
76
74
 
77
75
  it "simple example" do
78
76
  stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
79
77
 
80
- Scruber.run :sample do
78
+ body = Scruber.run :sample do
81
79
  queue.add "http://example.com"
82
-
80
+
83
81
  parser :seed do |page|
84
- $title = page.response_body
82
+ @page_response_body = page.response_body
85
83
  end
84
+ on_complete { @page_response_body }
86
85
  end
87
- expect($title).to eq('Example Domain')
86
+ expect(body).to eq('Example Domain')
88
87
  end
89
88
 
90
89
  it "should return Nokogiri object" do
91
90
  stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
92
91
 
93
- Scruber.run :sample do
92
+ title = Scruber.run :sample do
94
93
  queue.add "http://example.com/contacts.html"
95
-
94
+
96
95
  parser :seed, format: :html do |page, html|
97
- $title = html.at('a').text
96
+ @title = html.at('a').text
98
97
  end
98
+ on_complete { @title }
99
99
  end
100
- expect($title).to eq('Contacts')
100
+ expect(title).to eq('Contacts')
101
101
  end
102
102
 
103
103
  context "complex example" do
@@ -107,10 +107,9 @@ RSpec.describe Scruber do
107
107
  stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
108
108
  stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
109
109
 
110
- $products = []
111
- Scruber.run :sample do
110
+ products = Scruber.run :sample do
112
111
  get "http://example.com/catalog"
113
-
112
+
114
113
  parse :html do |page, doc|
115
114
  doc.search('a').each do |a|
116
115
  get_product URI.join(page.url, a.attr('href')).to_s
@@ -118,29 +117,34 @@ RSpec.describe Scruber do
118
117
  end
119
118
 
120
119
  parse_product :html do |page,doc|
121
- $products.push doc.at('h1').text
120
+ @products ||= []
121
+ @products.push doc.at('h1').text
122
122
  end
123
+
124
+ on_complete { @products }
123
125
  end
124
- expect($products.sort).to eq((1..3).map{|i| "Product #{i}"}.sort)
126
+ expect(products.sort).to eq((1..3).map{|i| "Product #{i}"}.sort)
125
127
  end
126
128
 
127
129
  it "should redownload page and increase retry" do
128
130
  stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
129
131
 
130
- Scruber.run :sample do
132
+ title, retry_count = Scruber.run :sample do
131
133
  get "http://example.com/"
132
-
134
+
133
135
  parse :html do |page, doc|
134
136
  if page.response_body =~ /blocked/
135
137
  page.redownload!
136
138
  else
137
- $title = doc.at('h1').text
138
- $retry_count = page.retry_count
139
+ @title = doc.at('h1').text
140
+ @retry_count = page.retry_count
139
141
  end
140
142
  end
143
+
144
+ on_complete { [@title, @retry_count] }
141
145
  end
142
- expect($title).to eq('Product')
143
- expect($retry_count).to eq(2)
146
+ expect(title).to eq('Product')
147
+ expect(retry_count).to eq(2)
144
148
  end
145
149
  end
146
150
 
@@ -148,39 +152,41 @@ RSpec.describe Scruber do
148
152
  it "should process 500 error page" do
149
153
  stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
150
154
 
151
- $error_title = nil
152
- Scruber.run :sample do
155
+ error_title = Scruber.run :sample do
153
156
  get "http://example.com", max_retry_times: 1
154
157
 
155
158
  parse :html do |page,doc|
156
- $error_title = doc.at('h1').text
159
+ @error_title = doc.at('h1').text
157
160
  end
158
161
 
159
162
  on_page_error do |page|
160
- $error_title = page.response_body
163
+ @error_title = page.response_body
161
164
  page.processed!
162
165
  end
166
+
167
+ on_complete { @error_title }
163
168
  end
164
- expect($error_title).to eq('<div><h1>500</h1></div>')
169
+ expect(error_title).to eq('<div><h1>500</h1></div>')
165
170
  end
166
171
 
167
172
  it "should process 404 error page" do
168
173
  stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
169
174
 
170
- $error_title = nil
171
- Scruber.run :sample do
175
+ error_title = Scruber.run :sample do
172
176
  get "http://example.com", max_retry_times: 1
173
177
 
174
178
  parse :html do |page,doc|
175
- $error_title = doc.at('h1').text
179
+ @error_title = doc.at('h1').text
176
180
  end
177
181
 
178
182
  on_page_error do |page|
179
- $error_title = page.response_body
183
+ @error_title = page.response_body
180
184
  page.processed!
181
185
  end
186
+
187
+ on_complete { @error_title }
182
188
  end
183
- expect($error_title).to eq('<div><h1>404</h1></div>')
189
+ expect(error_title).to eq('<div><h1>404</h1></div>')
184
190
  end
185
191
  end
186
192
  end
@@ -167,5 +167,10 @@ RSpec.shared_examples "queue_adapter" do
167
167
  expect(page1.id).not_to be_blank
168
168
  expect(page1.id).not_to eq(page2.id)
169
169
  end
170
+
171
+ it "should join url" do
172
+ page1 = page_class.new queue, url: "http://example.com/product1"
173
+ expect(page1.url_join('/abc')).to eq("http://example.com/abc")
174
+ end
170
175
  end
171
176
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scruber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ivan Goncharov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-04-23 00:00:00.000000000 Z
11
+ date: 2018-09-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus
@@ -138,6 +138,26 @@ dependencies:
138
138
  - - ">="
139
139
  - !ruby/object:Gem::Version
140
140
  version: 2.0.1
141
+ - !ruby/object:Gem::Dependency
142
+ name: charlock_holmes
143
+ requirement: !ruby/object:Gem::Requirement
144
+ requirements:
145
+ - - "~>"
146
+ - !ruby/object:Gem::Version
147
+ version: '0.7'
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: 0.7.6
151
+ type: :runtime
152
+ prerelease: false
153
+ version_requirements: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - "~>"
156
+ - !ruby/object:Gem::Version
157
+ version: '0.7'
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: 0.7.6
141
161
  - !ruby/object:Gem::Dependency
142
162
  name: thor
143
163
  requirement: !ruby/object:Gem::Requirement
@@ -252,6 +272,7 @@ files:
252
272
  - lib/scruber/core/page_format.rb
253
273
  - lib/scruber/core/page_format/base.rb
254
274
  - lib/scruber/core/page_format/html.rb
275
+ - lib/scruber/core/page_format/json.rb
255
276
  - lib/scruber/core/page_format/xml.rb
256
277
  - lib/scruber/fetcher.rb
257
278
  - lib/scruber/fetcher_adapters/abstract_adapter.rb