scruber 0.1.6 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b845332207b108efa91983b4721cf7631120ea36
4
- data.tar.gz: 4c9d931ccdbf777c9d469d7cf2697f00acfe94dc
3
+ metadata.gz: b74171eb49e730929f3b303cc1b65f7d55eb54f5
4
+ data.tar.gz: 67debe50e980a21e66fc9966e509b9c4fb65fc2b
5
5
  SHA512:
6
- metadata.gz: b7c9bc638e7f168401bfd15de02746b691c5477563404db66810f894e7a1925821064b096d47c065209223856782e787bdc3f6272a179068aa0fdfcb6c14994d
7
- data.tar.gz: f4c73d7e94e795b32c647285f320cf9e25b9f6efa0c528aaa3138ab99c263627f9ad47088851126d2df07421e633037fa3e8f434d7303a91ad350804d9a64903
6
+ metadata.gz: 2f997b9b072270c8a28dc00f6dcde03a9041adb8f1fc6245d862f4423e3d155bf9e6fe478c11fe0d6081a878407fdc35792fe3a17dbf93b3f566daeb3fd22a60
7
+ data.tar.gz: a5d4f2bafe347e9a775bf01cbfd2170678596fd133cb788a444099a03a95bcc7f72116bd725f32506766f9dbb0756c06f1e3dc909e526d684cfa14e3535f6ae2
@@ -23,6 +23,7 @@ require "scruber/core/page_format"
23
23
  require "scruber/core/page_format/base"
24
24
  require "scruber/core/page_format/xml"
25
25
  require "scruber/core/page_format/html"
26
+ require "scruber/core/page_format/json"
26
27
 
27
28
  require "scruber/core/extensions/base"
28
29
  require "scruber/core/extensions/loop"
@@ -58,7 +59,7 @@ module Scruber
58
59
 
59
60
  def run(*args, &block)
60
61
  raise "You need a block to build!" unless block_given?
61
-
62
+
62
63
  Core::Crawler.new(*args).run(&block)
63
64
  end
64
65
 
@@ -1,10 +1,10 @@
1
1
  module Scruber
2
2
  module Core
3
- #
3
+ #
4
4
  # Crawler class
5
- #
5
+ #
6
6
  # Main class-runner for scrapers.
7
- #
7
+ #
8
8
  # @example Simple scraper
9
9
  # Scruber::Core::Crawler.new(:simple) do
10
10
  # get 'http://example.com'
@@ -12,29 +12,29 @@ module Scruber
12
12
  # puts html.at('title').text
13
13
  # end
14
14
  # end
15
- #
15
+ #
16
16
  # @author Ivan Goncharov
17
- #
17
+ #
18
18
  class Crawler
19
19
  attr_reader :queue, :fetcher, :scraper_name
20
20
 
21
- #
21
+ #
22
22
  # Initialize crawler with scraper name and/or with options
23
- #
23
+ #
24
24
  # Crawler.new(:sample, fetcher_adapter: :custom)
25
25
  # Crawler.new(:sample)
26
26
  # Crawler.new(fetcher_adapter: :custom)
27
- #
27
+ #
28
28
  # @param args [Array] if first arg is a Symbol, it will be used as scraper_name, hash will be used as configuration options (see {Scruber::Core::Configuration})
29
- #
29
+ #
30
30
  # @return [Scruber::Core::Crawler] [description]
31
31
  def initialize(*args)
32
32
  if args.first.is_a?(Hash)
33
33
  scraper_name = nil
34
- options = args.first
34
+ @options = args.first
35
35
  else
36
- scraper_name, options = args
37
- options ||= {}
36
+ scraper_name, @options = args
37
+ @options ||= {}
38
38
  end
39
39
  @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
40
40
  raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
@@ -44,7 +44,7 @@ module Scruber
44
44
  @on_page_error_callback = nil
45
45
  @on_complete_callbacks = []
46
46
 
47
- Scruber.configuration.merge_options(options)
47
+ Scruber.configuration.merge_options(@options)
48
48
  ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
49
49
 
50
50
  @queue = Scruber::Queue.new(scraper_name: @scraper_name)
@@ -53,9 +53,9 @@ module Scruber
53
53
  load_extenstions
54
54
  end
55
55
 
56
- #
56
+ #
57
57
  # Crawling engine
58
- #
58
+ #
59
59
  # @param block [Proc] crawler body
60
60
  def run(&block)
61
61
  instance_eval &block
@@ -75,33 +75,33 @@ module Scruber
75
75
  end
76
76
  end
77
77
  end
78
- @on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
78
+ @on_complete_callbacks.sort_by{|c| -c[0] }.map do |(_,callback)|
79
79
  instance_exec &(callback)
80
- end
80
+ end.first
81
81
  end
82
82
 
83
- #
83
+ #
84
84
  # Register parser
85
- #
85
+ #
86
86
  # @param page_type [Symbol] type of page
87
87
  # @param options [Hash] options for parser
88
- # @option options [Symbol] :format format of page. Scruber automatically process
88
+ # @option options [Symbol] :format format of page. Scruber automatically process
89
89
  # page body depends on this format. For example :json or :html
90
90
  # @param block [Proc] body of parser
91
- #
91
+ #
92
92
  # @return [void]
93
93
  def parser(page_type, options={}, &block)
94
94
  register_callback(page_type, options, &block)
95
95
  end
96
96
 
97
- #
97
+ #
98
98
  # Method missing callback. Scruber allows to register
99
99
  # regexp and proc body to process calls
100
- #
100
+ #
101
101
  # @param method_sym [Symbol] missing method name
102
102
  # @param arguments [Array] arguments
103
103
  # @param block [Proc] block (if passed)
104
- #
104
+ #
105
105
  # @return [type] [description]
106
106
  def method_missing(method_sym, *arguments, &block)
107
107
  Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
@@ -124,27 +124,27 @@ module Scruber
124
124
 
125
125
  class << self
126
126
 
127
- #
127
+ #
128
128
  # Register method missing callback
129
- #
129
+ #
130
130
  # @param pattern [Regexp] Regexp to match missing name
131
131
  # @param block [Proc] Body to process missing method
132
- #
132
+ #
133
133
  # @return [void]
134
134
  def register_method_missing(pattern, &block)
135
135
  _registered_method_missings[pattern] = block
136
136
  end
137
137
 
138
- #
138
+ #
139
139
  # Registered method missing callbacks dictionary
140
- #
140
+ #
141
141
  # @return [Hash] callbacks
142
142
  def _registered_method_missings
143
143
  @registered_method_missings ||= {}
144
144
  end
145
145
  end
146
146
 
147
- #
147
+ #
148
148
  # Register callback which will be executed when
149
149
  # downloading and parsing will be completed.
150
150
  # For example when you need to write results to file,
@@ -153,16 +153,16 @@ module Scruber
153
153
  # on_complete -1 do
154
154
  # Scruber::Core::Extensions::CsvOutput.close_all
155
155
  # end
156
- #
156
+ #
157
157
  # @param priority [Integer] priority of this callback
158
158
  # @param block [Proc] body of callback
159
- #
159
+ #
160
160
  # @return [void]
161
161
  def on_complete(priority=1, &block)
162
162
  @on_complete_callbacks.push [priority,block]
163
163
  end
164
164
 
165
- #
165
+ #
166
166
  # Register callback which will be executed for
167
167
  # error pages, like 404 or 500
168
168
  # Attention! You should call one of these methods for page
@@ -178,9 +178,9 @@ module Scruber
178
178
  # page.delete
179
179
  # end
180
180
  # end
181
- #
181
+ #
182
182
  # @param block [Proc] body of callback
183
- #
183
+ #
184
184
  # @return [void]
185
185
  def on_page_error(&block)
186
186
  @on_page_error_callback = block
@@ -188,46 +188,46 @@ module Scruber
188
188
 
189
189
  private
190
190
 
191
- #
191
+ #
192
192
  # Register parser
193
- #
193
+ #
194
194
  # @param page_type [Symbol] type of page
195
195
  # @param options [Hash] options for parser
196
- # @option options [Symbol] :format format of page. Scruber automatically process
196
+ # @option options [Symbol] :format format of page. Scruber automatically process
197
197
  # page body depends on this format. For example :json or :html
198
198
  # @param block [Proc] body of parser
199
- #
199
+ #
200
200
  # @return [void]
201
201
  def register_callback(page_type, options, &block)
202
202
  @callbacks_options[page_type.to_sym] = options || {}
203
203
  @callbacks[page_type.to_sym] = block
204
204
  end
205
205
 
206
- #
206
+ #
207
207
  # Process page body depends on format of this page
208
208
  # For example, if page_format = :html, then
209
209
  # it will return Nokogiri::HTML(page.response_body)
210
- #
210
+ #
211
211
  # @param page [Page] page from queue
212
212
  # @param page_type [Symbol] name of parser
213
- #
213
+ #
214
214
  # @return [Object] depends on page_type it will return different objects
215
215
  def process_page(page, page_type)
216
216
  page_format = @callbacks_options[page_type].fetch(:format){ nil }
217
217
  Scruber::Core::PageFormat.process(page, page_format)
218
218
  end
219
219
 
220
- #
220
+ #
221
221
  # Loads all extensions
222
- #
222
+ #
223
223
  # @return [void]
224
224
  def load_extenstions
225
225
  Scruber::Core::Extensions::Base.descendants.each(&:register)
226
226
  end
227
227
 
228
- #
228
+ #
229
229
  # Initialize progressbar, that shows progress in console
230
- #
230
+ #
231
231
  # @return [void]
232
232
  def initialize_progressbar
233
233
  unless Scruber.configuration.silent
@@ -243,9 +243,9 @@ module Scruber
243
243
  end
244
244
  end
245
245
 
246
- #
246
+ #
247
247
  # Output progress to console
248
- #
248
+ #
249
249
  # @return [void]
250
250
  def show_progress
251
251
  if @progressbar
@@ -0,0 +1,13 @@
1
+ module Scruber
2
+ module Core
3
+ module PageFormat
4
+ class Json < Base
5
+ def self.process(page)
6
+ JSON.parse(page.response_body) rescue nil
7
+ end
8
+ end
9
+ end
10
+ end
11
+ end
12
+
13
+ Scruber::Core::PageFormat.add(:json, Scruber::Core::PageFormat::Json)
@@ -1,3 +1,5 @@
1
+ require 'charlock_holmes'
2
+
1
3
  module Scruber
2
4
  module FetcherAdapters
3
5
  class AbstractAdapter
@@ -45,9 +47,24 @@ module Scruber
45
47
  page.fetched_at = Time.now.to_i
46
48
  end
47
49
  end
50
+ if page.response_headers
51
+ page.response_headers = page.response_headers.inject({}) {|acc, (k,v)| acc[k.gsub('.', '_')] = v.is_a?(Array) ? v.map{|v1| convert_to_utf8(v1) } : convert_to_utf8(v); acc }
52
+ end
53
+ page.response_body = convert_to_utf8(page.response_body)
48
54
  page
49
55
  end
50
56
 
57
+ def convert_to_utf8(text)
58
+ unless text.to_s.empty?
59
+ detection = CharlockHolmes::EncodingDetector.detect(text)
60
+ if detection && detection[:encoding].present?
61
+ text = CharlockHolmes::Converter.convert(text, detection[:encoding], 'UTF-8') rescue text
62
+ end
63
+ end
64
+
65
+ text
66
+ end
67
+
51
68
  def headers_for(page)
52
69
  if page.fetcher_agent
53
70
  headers = page.fetcher_agent.headers
@@ -60,7 +60,7 @@ module Scruber
60
60
  def on_complete_callback(page, response)
61
61
  page.response_code = response.code
62
62
  page.response_body = response.body
63
- page.response_headers = response.response_headers
63
+ page.response_headers = response.headers
64
64
  page.response_total_time = response.total_time
65
65
 
66
66
  if response.timed_out?
@@ -134,6 +134,16 @@ module Scruber
134
134
  raise NotImplementedError
135
135
  end
136
136
 
137
+
138
+ #
139
+ # Join url of current page with another path or url
140
+ # @param link_url [String] link
141
+ #
142
+ # @return [String] joined url
143
+ def url_join(link_url)
144
+ URI.join(url, link_url).to_s
145
+ end
146
+
137
147
  def [](k)
138
148
  instance_variable_get("@#{k.to_s}")
139
149
  end
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.9"
3
3
  end
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
37
37
  spec.add_dependency "activesupport", '~> 5.1', '>= 5.1.5'
38
38
  spec.add_dependency "powerbar", '~> 2.0', '>= 2.0.1'
39
39
  spec.add_dependency "paint", '~> 2.0', '>= 2.0.1'
40
+ spec.add_dependency "charlock_holmes", '~> 0.7', '>= 0.7.6'
40
41
  spec.add_runtime_dependency "thor", "0.20.0"
41
42
  spec.add_development_dependency "bundler", "~> 1.15"
42
43
  spec.add_development_dependency "rake", "~> 10.0"
@@ -11,16 +11,19 @@ RSpec.describe Scruber::Core::Extensions::Loop do
11
11
 
12
12
  it "should add dictionary and read info" do
13
13
  Scruber::Core::Extensions::Loop.register
14
- $zip_codes = []
15
- Scruber.run :sample do
14
+ zip_codes = Scruber.run :sample do
16
15
  add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
16
+ @zip_codes = []
17
+
17
18
  seed do
18
19
  loop :zip_codes_usa, state: 'NY' do |row|
19
- $zip_codes.push row['zip']
20
+ @zip_codes.push row['zip']
20
21
  end
21
22
  end
23
+
24
+ on_complete { @zip_codes }
22
25
  end
23
- expect($zip_codes).to eq(['10001', '10002'])
26
+ expect(zip_codes).to eq(['10001', '10002'])
24
27
  end
25
28
  end
26
29
  end
@@ -19,15 +19,17 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
19
19
 
20
20
  stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
21
21
 
22
- Scruber.run :sample do
22
+ page = Scruber.run :sample do
23
23
  get "http://example.com"
24
-
24
+
25
25
  parse do |page|
26
- $page = page
26
+ @queue_page = page
27
27
  end
28
+
29
+ on_complete { @queue_page }
28
30
  end
29
- expect($page.url).to eq("http://example.com")
30
- expect($page.page_type.to_s).to eq("seed")
31
+ expect(page.url).to eq("http://example.com")
32
+ expect(page.page_type.to_s).to eq("seed")
31
33
  end
32
34
 
33
35
  it "should register parser with custom page_type" do
@@ -35,16 +37,18 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
35
37
 
36
38
  stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
37
39
 
38
- Scruber.run :sample do
40
+ page = Scruber.run :sample do
39
41
  post_product "http://example.com"
40
-
42
+
41
43
  parse_product do |page|
42
- $page = page
44
+ @queue_page = page
43
45
  end
46
+
47
+ on_complete { @queue_page }
44
48
  end
45
- expect($page.url).to eq("http://example.com")
46
- expect($page.method.to_s).to eq("post")
47
- expect($page.page_type.to_s).to eq("product")
49
+ expect(page.url).to eq("http://example.com")
50
+ expect(page.method.to_s).to eq("post")
51
+ expect(page.page_type.to_s).to eq("product")
48
52
  end
49
53
  end
50
54
 
@@ -54,17 +58,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
54
58
 
55
59
  stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
56
60
 
57
- Scruber.run :sample do
61
+ page, doc = Scruber.run :sample do
58
62
  get "http://example.com"
59
-
63
+
60
64
  parse :html do |page,doc|
61
- $page = page
62
- $doc = doc
65
+ @queue_page = page
66
+ @doc = doc
63
67
  end
68
+
69
+ on_complete { [@queue_page, @doc] }
64
70
  end
65
- expect($doc.at('span').text).to eq("Example Domain")
66
- expect($page.page_type.to_s).to eq("seed")
67
- expect($page.method.to_s).to eq("get")
71
+ expect(doc.at('span').text).to eq("Example Domain")
72
+ expect(page.page_type.to_s).to eq("seed")
73
+ expect(page.method.to_s).to eq("get")
68
74
  end
69
75
 
70
76
  it "should register parser with custom page_type" do
@@ -72,17 +78,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
72
78
 
73
79
  stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
74
80
 
75
- Scruber.run :sample do
81
+ page, doc = Scruber.run :sample do
76
82
  post_product "http://example.com"
77
-
83
+
78
84
  parse_product :html do |page,doc|
79
- $page = page
80
- $doc = doc
85
+ @queue_page = page
86
+ @doc = doc
81
87
  end
88
+
89
+ on_complete { [@queue_page, @doc] }
82
90
  end
83
- expect($doc.at('span').text).to eq("Example Post")
84
- expect($page.method.to_s).to eq("post")
85
- expect($page.page_type.to_s).to eq("product")
91
+ expect(doc.at('span').text).to eq("Example Post")
92
+ expect(page.method.to_s).to eq("post")
93
+ expect(page.page_type.to_s).to eq("product")
86
94
  end
87
95
  end
88
96
  end
@@ -19,25 +19,29 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
19
19
  it "should add page to queue" do
20
20
  described_class.register
21
21
 
22
- Scruber.run :sample do
22
+ page = Scruber.run :sample do
23
23
  get "http://example.com"
24
- $page = queue.fetch_pending
24
+
25
+ @queue_page = queue.fetch_pending
26
+ on_complete { @queue_page }
25
27
  end
26
- expect($page.url).to eq("http://example.com")
27
- expect($page.method.to_s).to eq("get")
28
- expect($page.page_type.to_s).to eq("seed")
28
+ expect(page.url).to eq("http://example.com")
29
+ expect(page.method.to_s).to eq("get")
30
+ expect(page.page_type.to_s).to eq("seed")
29
31
  end
30
32
 
31
33
  it "should add page to queue" do
32
34
  described_class.register
33
35
 
34
- Scruber.run :sample do
36
+ page = Scruber.run :sample do
35
37
  post_product "http://example.com"
36
- $page = queue.fetch_pending
38
+
39
+ @queue_page = queue.fetch_pending
40
+ on_complete { @queue_page }
37
41
  end
38
- expect($page.url).to eq("http://example.com")
39
- expect($page.method.to_s).to eq("post")
40
- expect($page.page_type).to eq("product")
42
+ expect(page.url).to eq("http://example.com")
43
+ expect(page.method.to_s).to eq("post")
44
+ expect(page.page_type).to eq("product")
41
45
  end
42
46
  end
43
47
 
@@ -45,27 +49,31 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
45
49
  it "should add page to queue" do
46
50
  described_class.register
47
51
 
48
- Scruber.run :sample do
52
+ page = Scruber.run :sample do
49
53
  get "http://example.com", user_agent: 'Agent 1'
50
- $page = queue.fetch_pending
54
+
55
+ @queue_page = queue.fetch_pending
56
+ on_complete { @queue_page }
51
57
  end
52
- expect($page.url).to eq("http://example.com")
53
- expect($page.method.to_s).to eq("get")
54
- expect($page.page_type.to_s).to eq("seed")
55
- expect($page.user_agent).to eq('Agent 1')
58
+ expect(page.url).to eq("http://example.com")
59
+ expect(page.method.to_s).to eq("get")
60
+ expect(page.page_type.to_s).to eq("seed")
61
+ expect(page.user_agent).to eq('Agent 1')
56
62
  end
57
63
 
58
64
  it "should add page to queue" do
59
65
  described_class.register
60
66
 
61
- Scruber.run :sample do
67
+ page = Scruber.run :sample do
62
68
  post_product "http://example.com", user_agent: 'Agent 1'
63
- $page = queue.fetch_pending
69
+
70
+ @queue_page = queue.fetch_pending
71
+ on_complete { @queue_page }
64
72
  end
65
- expect($page.url).to eq("http://example.com")
66
- expect($page.method.to_s).to eq("post")
67
- expect($page.page_type).to eq("product")
68
- expect($page.user_agent).to eq('Agent 1')
73
+ expect(page.url).to eq("http://example.com")
74
+ expect(page.method.to_s).to eq("post")
75
+ expect(page.page_type).to eq("product")
76
+ expect(page.user_agent).to eq('Agent 1')
69
77
  end
70
78
  end
71
79
  end
@@ -14,31 +14,32 @@ RSpec.describe Scruber::Core::Extensions::Seed do
14
14
  stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
15
15
  stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
16
16
  end
17
-
17
+
18
18
  it "should execute seed block" do
19
- $queue_size = 0
20
- Scruber.run :sample do
19
+ queue_size = Scruber.run :sample do
21
20
  seed do
22
21
  get 'http://example.com'
23
22
  end
24
- $queue_size = queue.size
23
+ @queue_size = queue.size
24
+ on_complete { @queue_size }
25
25
  end
26
- expect($queue_size).to eq(1)
26
+ expect(queue_size).to eq(1)
27
27
  end
28
28
 
29
29
  it "should not execute seed block" do
30
- $queue_size = 0
31
- Scruber.run :sample do
30
+ queue_size, page = Scruber.run :sample do
32
31
  seed do
33
32
  get 'http://example.com'
34
33
  end
35
34
  seed do
36
35
  get 'http://example.com/contacts'
37
36
  end
38
- $queue_size = queue.size
39
- $page = queue.fetch_pending
37
+ @queue_size = queue.size
38
+ @queue_page = queue.fetch_pending
39
+
40
+ on_complete { [@queue_size, @queue_page] }
40
41
  end
41
- expect($queue_size).to eq(1)
42
- expect($page.url).to eq("http://example.com")
42
+ expect(queue_size).to eq(1)
43
+ expect(page.url).to eq("http://example.com")
43
44
  end
44
45
  end
@@ -13,7 +13,7 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
13
13
  cookie_jar: cookie_jar_string,
14
14
  disable_proxy: true
15
15
  end
16
-
16
+
17
17
  it "set values" do
18
18
  expect(agent.id).to eq(1)
19
19
  expect(agent.user_agent).to eq('Scruber')
@@ -33,7 +33,8 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
33
33
  end
34
34
 
35
35
  it "parse cookies from page" do
36
- page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-18 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
36
+ page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-#{Date.today.year+1} 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
37
+ puts page.response_cookies.inspect
37
38
  agent.parse_cookies_from_page!(page)
38
39
  expect(agent.cookie_for('http://example.com')).to eq('__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; feed_flow=top')
39
40
  end
@@ -20,7 +20,7 @@ RSpec.describe Scruber do
20
20
  config.fetcher_adapter = :typhoeus_fetcher
21
21
  end
22
22
  end
23
-
23
+
24
24
  it "returns :typhoeus_fetcher as fetcher" do
25
25
  expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
26
26
  end
@@ -34,70 +34,70 @@ RSpec.describe Scruber do
34
34
 
35
35
  it "should set scraper name from ENV" do
36
36
  ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
37
- Scruber.run do
38
- $scraper_name = scraper_name
37
+ name = Scruber.run do
38
+ on_complete { scraper_name }
39
39
  end
40
- expect($scraper_name).to eq(:sample)
40
+ expect(name).to eq(:sample)
41
41
  end
42
42
  end
43
43
 
44
44
  context "with args" do
45
45
  it "should set scraper name from first arg" do
46
- Scruber.run :sample1 do
47
- $scraper_name = scraper_name
46
+ name = Scruber.run :sample1 do
47
+ on_complete { scraper_name }
48
48
  end
49
- expect($scraper_name).to eq(:sample1)
49
+ expect(name).to eq(:sample1)
50
50
  end
51
51
 
52
52
  it "should set scraper name from first arg, and options from second" do
53
- Scruber.run :sample2, queue_adapter: :test do
54
- $scraper_name = scraper_name
55
- $opt = Scruber.configuration.queue_adapter
53
+ name, opt = Scruber.run :sample2, queue_adapter: :test do
54
+ on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
56
55
  end
57
- expect($scraper_name).to eq(:sample2)
58
- expect($opt).to eq(:test)
56
+ expect(name).to eq(:sample2)
57
+ expect(opt).to eq(:test)
59
58
  end
60
59
 
61
60
  it "options from first arg and scraper_name from ENV" do
62
61
  ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
63
- Scruber.run queue_adapter: :test2 do
64
- $scraper_name = scraper_name
65
- $opt = Scruber.configuration.queue_adapter
62
+ name, opt = Scruber.run queue_adapter: :test2 do
63
+ on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
66
64
  end
67
- expect($scraper_name).to eq(:sample)
68
- expect($opt).to eq(:test2)
65
+ expect(name).to eq(:sample)
66
+ expect(opt).to eq(:test2)
69
67
  end
70
68
 
71
69
  it "should raise error if passed only options without ENV" do
72
70
  ENV['SCRUBER_SCRAPER_NAME'] = nil
73
- expect { Scruber.run(queue_adapter: :test2) { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
71
+ expect { Scruber.run(queue_adapter: :test2) { scraper_name } }.to raise_error(Scruber::ArgumentError)
74
72
  end
75
73
  end
76
74
 
77
75
  it "simple example" do
78
76
  stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
79
77
 
80
- Scruber.run :sample do
78
+ body = Scruber.run :sample do
81
79
  queue.add "http://example.com"
82
-
80
+
83
81
  parser :seed do |page|
84
- $title = page.response_body
82
+ @page_response_body = page.response_body
85
83
  end
84
+ on_complete { @page_response_body }
86
85
  end
87
- expect($title).to eq('Example Domain')
86
+ expect(body).to eq('Example Domain')
88
87
  end
89
88
 
90
89
  it "should return Nokogiri object" do
91
90
  stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
92
91
 
93
- Scruber.run :sample do
92
+ title = Scruber.run :sample do
94
93
  queue.add "http://example.com/contacts.html"
95
-
94
+
96
95
  parser :seed, format: :html do |page, html|
97
- $title = html.at('a').text
96
+ @title = html.at('a').text
98
97
  end
98
+ on_complete { @title }
99
99
  end
100
- expect($title).to eq('Contacts')
100
+ expect(title).to eq('Contacts')
101
101
  end
102
102
 
103
103
  context "complex example" do
@@ -107,10 +107,9 @@ RSpec.describe Scruber do
107
107
  stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
108
108
  stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
109
109
 
110
- $products = []
111
- Scruber.run :sample do
110
+ products = Scruber.run :sample do
112
111
  get "http://example.com/catalog"
113
-
112
+
114
113
  parse :html do |page, doc|
115
114
  doc.search('a').each do |a|
116
115
  get_product URI.join(page.url, a.attr('href')).to_s
@@ -118,29 +117,34 @@ RSpec.describe Scruber do
118
117
  end
119
118
 
120
119
  parse_product :html do |page,doc|
121
- $products.push doc.at('h1').text
120
+ @products ||= []
121
+ @products.push doc.at('h1').text
122
122
  end
123
+
124
+ on_complete { @products }
123
125
  end
124
- expect($products.sort).to eq((1..3).map{|i| "Product #{i}"}.sort)
126
+ expect(products.sort).to eq((1..3).map{|i| "Product #{i}"}.sort)
125
127
  end
126
128
 
127
129
  it "should redownload page and increase retry" do
128
130
  stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
129
131
 
130
- Scruber.run :sample do
132
+ title, retry_count = Scruber.run :sample do
131
133
  get "http://example.com/"
132
-
134
+
133
135
  parse :html do |page, doc|
134
136
  if page.response_body =~ /blocked/
135
137
  page.redownload!
136
138
  else
137
- $title = doc.at('h1').text
138
- $retry_count = page.retry_count
139
+ @title = doc.at('h1').text
140
+ @retry_count = page.retry_count
139
141
  end
140
142
  end
143
+
144
+ on_complete { [@title, @retry_count] }
141
145
  end
142
- expect($title).to eq('Product')
143
- expect($retry_count).to eq(2)
146
+ expect(title).to eq('Product')
147
+ expect(retry_count).to eq(2)
144
148
  end
145
149
  end
146
150
 
@@ -148,39 +152,41 @@ RSpec.describe Scruber do
148
152
  it "should process 500 error page" do
149
153
  stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
150
154
 
151
- $error_title = nil
152
- Scruber.run :sample do
155
+ error_title = Scruber.run :sample do
153
156
  get "http://example.com", max_retry_times: 1
154
157
 
155
158
  parse :html do |page,doc|
156
- $error_title = doc.at('h1').text
159
+ @error_title = doc.at('h1').text
157
160
  end
158
161
 
159
162
  on_page_error do |page|
160
- $error_title = page.response_body
163
+ @error_title = page.response_body
161
164
  page.processed!
162
165
  end
166
+
167
+ on_complete { @error_title }
163
168
  end
164
- expect($error_title).to eq('<div><h1>500</h1></div>')
169
+ expect(error_title).to eq('<div><h1>500</h1></div>')
165
170
  end
166
171
 
167
172
  it "should process 404 error page" do
168
173
  stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
169
174
 
170
- $error_title = nil
171
- Scruber.run :sample do
175
+ error_title = Scruber.run :sample do
172
176
  get "http://example.com", max_retry_times: 1
173
177
 
174
178
  parse :html do |page,doc|
175
- $error_title = doc.at('h1').text
179
+ @error_title = doc.at('h1').text
176
180
  end
177
181
 
178
182
  on_page_error do |page|
179
- $error_title = page.response_body
183
+ @error_title = page.response_body
180
184
  page.processed!
181
185
  end
186
+
187
+ on_complete { @error_title }
182
188
  end
183
- expect($error_title).to eq('<div><h1>404</h1></div>')
189
+ expect(error_title).to eq('<div><h1>404</h1></div>')
184
190
  end
185
191
  end
186
192
  end
@@ -167,5 +167,10 @@ RSpec.shared_examples "queue_adapter" do
167
167
  expect(page1.id).not_to be_blank
168
168
  expect(page1.id).not_to eq(page2.id)
169
169
  end
170
+
171
+ it "should join url" do
172
+ page1 = page_class.new queue, url: "http://example.com/product1"
173
+ expect(page1.url_join('/abc')).to eq("http://example.com/abc")
174
+ end
170
175
  end
171
176
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scruber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ivan Goncharov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-04-23 00:00:00.000000000 Z
11
+ date: 2018-09-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus
@@ -138,6 +138,26 @@ dependencies:
138
138
  - - ">="
139
139
  - !ruby/object:Gem::Version
140
140
  version: 2.0.1
141
+ - !ruby/object:Gem::Dependency
142
+ name: charlock_holmes
143
+ requirement: !ruby/object:Gem::Requirement
144
+ requirements:
145
+ - - "~>"
146
+ - !ruby/object:Gem::Version
147
+ version: '0.7'
148
+ - - ">="
149
+ - !ruby/object:Gem::Version
150
+ version: 0.7.6
151
+ type: :runtime
152
+ prerelease: false
153
+ version_requirements: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - "~>"
156
+ - !ruby/object:Gem::Version
157
+ version: '0.7'
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: 0.7.6
141
161
  - !ruby/object:Gem::Dependency
142
162
  name: thor
143
163
  requirement: !ruby/object:Gem::Requirement
@@ -252,6 +272,7 @@ files:
252
272
  - lib/scruber/core/page_format.rb
253
273
  - lib/scruber/core/page_format/base.rb
254
274
  - lib/scruber/core/page_format/html.rb
275
+ - lib/scruber/core/page_format/json.rb
255
276
  - lib/scruber/core/page_format/xml.rb
256
277
  - lib/scruber/fetcher.rb
257
278
  - lib/scruber/fetcher_adapters/abstract_adapter.rb