kimurai 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,38 +3,46 @@ require 'csv'
3
3
 
4
4
  module Kimurai
5
5
  class Base
6
- class SimpleSaver
7
- def initialize
6
+ class Saver
7
+ attr_reader :format, :path, :position, :append
8
+
9
# Builds a saver bound to a single output file.
#
# path     - destination file; stored as-is and read back through `path`.
# format   - one of :json, :pretty_json, :jsonlines, :csv.
# position - when truthy, `save` stamps each item with its running index.
# append   - when truthy, writers extend the file from the first item on
#            instead of starting it afresh.
#
# Raises RuntimeError when `format` is not one of the supported symbols.
def initialize(path, format:, position: true, append: false)
  unless %i(json pretty_json jsonlines csv).include?(format)
    # The class was renamed from SimpleSaver to Saver; the message now names
    # the class that actually raises.
    raise "Saver: wrong type of format: #{format}"
  end

  @path = path
  @format = format
  @position = position
  @append = append
  @index = 0           # number of items passed to `save` so far
  @mutex = Mutex.new   # held by `save` around the counter bump and the write
end
11
21
 
12
- def save(path, item, format:, position:)
22
# Records one item in the output file.
#
# While holding the saver's mutex: increments the running item counter,
# writes it into item[:position] when `position` is enabled (this modifies
# the object the caller passed in), then hands the item to the writer that
# matches `format`.
def save(item)
  @mutex.synchronize do
    @index += 1
    item[:position] = @index if position

    writer = {
      json: :save_to_json,
      pretty_json: :save_to_pretty_json,
      jsonlines: :save_to_jsonlines,
      csv: :save_to_csv
    }[format]

    # `send` because the writers sit below `private`; a format with no entry
    # is ignored, just like an unmatched `case` branch.
    send(writer, item) if writer
  end
end
31
39
 
32
40
  private
33
41
 
34
- def save_to_json(item, path)
42
+ def save_to_json(item)
35
43
  data = JSON.generate([item])
36
44
 
37
- if @index > 1
45
+ if append || @index > 1
38
46
  file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
39
47
  File.open(path, "w") do |f|
40
48
  f.write(file_content + data.sub(/\A\[/, ""))
@@ -44,10 +52,10 @@ module Kimurai
44
52
  end
45
53
  end
46
54
 
47
- def save_to_pretty_json(item, path)
55
+ def save_to_pretty_json(item)
48
56
  data = JSON.pretty_generate([item])
49
57
 
50
- if @index > 1
58
+ if append || @index > 1
51
59
  file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
52
60
  File.open(path, "w") do |f|
53
61
  f.write(file_content + data.sub(/\A\[\n/, ""))
@@ -57,20 +65,20 @@ module Kimurai
57
65
  end
58
66
  end
59
67
 
60
- def save_to_jsonlines(item, path)
68
# Writes `item` as one JSON document per line.
#
# The item is appended on a new line when the file is being continued
# (`append` is set, or this is not the first item of the run) AND the file
# already has content. Otherwise the file is (re)created holding just this
# item, so appending to a missing or empty file no longer starts it with a
# blank line.
def save_to_jsonlines(item)
  data = JSON.generate(item)

  # File.size? is nil for a missing or zero-byte file.
  if (append || @index > 1) && File.size?(path)
    File.open(path, "a") { |file| file.write("\n" + data) }
  else
    File.open(path, "w") { |file| file.write(data) }
  end
end
69
77
 
70
- def save_to_csv(item, path)
78
+ def save_to_csv(item)
71
79
  data = flatten_hash(item)
72
80
 
73
- if @index > 1
81
+ if append || @index > 1
74
82
  CSV.open(path, "a+", force_quotes: true) do |csv|
75
83
  csv << data.values
76
84
  end
@@ -0,0 +1,91 @@
1
+ require 'pstore'
2
+
3
module Kimurai
  class Base
    # Scoped collections of unique values, kept either in a plain Hash or in a
    # PStore file.
    #
    # When constructed with a path the data lives in PStore.new(path) and every
    # operation runs inside a PStore transaction; without a path it lives in an
    # in-process Hash. Either way each public operation runs while holding the
    # instance's mutex.
    class Storage
      attr_reader :database, :path

      # path - file for the PStore backend, or nil for the in-memory Hash.
      def initialize(path = nil)
        @path = path
        @mutex = Mutex.new
        @database = path ? PStore.new(path) : {}
      end

      # Returns the values stored under `scope` ([] when the scope is unknown).
      # With no scope, returns the underlying store object itself (the Hash, or
      # the PStore instance).
      def all(scope = nil)
        with_database { scope ? database.fetch(scope, []) : database }
      end

      # Returns true when `value` is already stored under `scope`.
      # Creates the scope (as an empty list) if it did not exist yet.
      def include?(scope, value)
        with_database do
          database[scope] ||= []
          database[scope].include?(value)
        end
      end

      # Stores `value` under `scope` unless it is already there.
      # Returns the scope's list when the value was added, nil otherwise.
      def add(scope, value)
        with_database do
          database[scope] ||= []
          database[scope].push(value) unless database[scope].include?(value)
        end
      end

      ###

      # Check-and-insert in one step: returns false when `value` is already
      # stored under `scope`; otherwise stores it and returns true.
      def unique?(scope, value)
        with_database do
          values = (database[scope] ||= [])

          if values.include?(value)
            false
          else
            values.push(value)
            true
          end
        end
      end

      ###

      # Removes every scope and value. The PStore file itself is kept.
      def clear!
        @mutex.synchronize do
          if path
            database.transaction do
              database.roots.each { |key| database.delete key }
            end
          else
            # Must assign the instance variable: a bare `database = {}` only
            # creates a local and leaves the stored data untouched.
            @database = {}
          end
        end
      end

      # Deletes the PStore file when one is configured and present.
      # Does nothing for in-memory storage.
      def delete!
        @mutex.synchronize do
          # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
          File.delete(path) if path && File.exist?(path)
        end
      end

      private

      # Runs the block under the mutex, wrapped in a PStore transaction when a
      # file backend is configured. Returns the block's value.
      def with_database
        @mutex.synchronize do
          path ? database.transaction { yield } : yield
        end
      end
    end
  end
end
@@ -13,6 +13,12 @@ module Kimurai
13
13
  raise "BrowserBuilder: wrong name of engine, available engines: #{AVAILABLE_ENGINES.join(', ')}"
14
14
  end
15
15
 
16
+ if config[:browser].present?
17
+ raise "++++++ BrowserBuilder: browser option is depricated. Now all sub-options inside " \
18
+ "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \
19
+ "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++"
20
+ end
21
+
16
22
  case engine
17
23
  when :mechanize
18
24
  require_relative 'browser_builder/mechanize_builder'
@@ -29,6 +29,10 @@ module Kimurai
29
29
  @browser.spider = spider
30
30
  logger.debug "BrowserBuilder (mechanize): created browser instance"
31
31
 
32
+ if @config[:extensions].present?
33
+ logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
34
+ end
35
+
32
36
  # Proxy
33
37
  if proxy = @config[:proxy].presence
34
38
  proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
@@ -63,7 +67,7 @@ module Kimurai
63
67
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
64
68
 
65
69
  @browser.driver.add_header("User-Agent", user_agent_string)
66
- logger.debug "BrowserBuilder (mechanize): enabled custom user-agent"
70
+ logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
67
71
  end
68
72
 
69
73
  # Cookies
@@ -77,59 +81,59 @@ module Kimurai
77
81
 
78
82
  # Browser instance options
79
83
  # retry_request_errors
80
- if errors = @config.dig(:browser, :retry_request_errors).presence
84
+ if errors = @config[:retry_request_errors].presence
81
85
  @browser.config.retry_request_errors = errors
82
- logger.debug "BrowserBuilder (mechanize): enabled `browser retry_request_errors`"
86
+ logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
83
87
  end
84
88
 
85
89
  # restart_if
86
- if @config.dig(:browser, :restart_if).present?
87
- logger.warn "BrowserBuilder (mechanize): `browser restart_if` options not supported by Mechanize, skipped"
90
+ if @config[:restart_if].present?
91
+ logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
88
92
  end
89
93
 
90
94
  # before_request clear_cookies
91
- if @config.dig(:browser, :before_request, :clear_cookies)
95
+ if @config.dig(:before_request, :clear_cookies)
92
96
  @browser.config.before_request[:clear_cookies] = true
93
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_cookies`"
97
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
94
98
  end
95
99
 
96
100
  # before_request clear_and_set_cookies
97
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
101
+ if @config.dig(:before_request, :clear_and_set_cookies)
98
102
  if cookies = @config[:cookies].presence
99
103
  @browser.config.cookies = cookies
100
104
  @browser.config.before_request[:clear_and_set_cookies] = true
101
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_and_set_cookies`"
105
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
102
106
  else
103
- logger.error "BrowserBuilder (mechanize): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
107
+ logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
104
108
  end
105
109
  end
106
110
 
107
111
  # before_request change_user_agent
108
- if @config.dig(:browser, :before_request, :change_user_agent)
112
+ if @config.dig(:before_request, :change_user_agent)
109
113
  if @config[:user_agent].present? && @config[:user_agent].class == Proc
110
114
  @browser.config.user_agent = @config[:user_agent]
111
115
  @browser.config.before_request[:change_user_agent] = true
112
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_user_agent`"
116
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
113
117
  else
114
- logger.error "BrowserBuilder (mechanize): `user_agent` should be present and has lambda format to enable `browser before_request change_user_agent`, skipped"
118
+ logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
115
119
  end
116
120
  end
117
121
 
118
122
  # before_request change_proxy
119
- if @config.dig(:browser, :before_request, :change_proxy)
123
+ if @config.dig(:before_request, :change_proxy)
120
124
  if @config[:proxy].present? && @config[:proxy].class == Proc
121
125
  @browser.config.proxy = @config[:proxy]
122
126
  @browser.config.before_request[:change_proxy] = true
123
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_proxy`"
127
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
124
128
  else
125
- logger.error "BrowserBuilder (mechanize): `proxy` should be present and has lambda format to enable `browser before_request change_proxy`, skipped"
129
+ logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
126
130
  end
127
131
  end
128
132
 
129
133
  # before_request delay
130
- if delay = @config.dig(:browser, :before_request, :delay).presence
134
+ if delay = @config.dig(:before_request, :delay).presence
131
135
  @browser.config.before_request[:delay] = delay
132
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request delay`"
136
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
133
137
  end
134
138
 
135
139
  # return Capybara session instance
@@ -23,6 +23,11 @@ module Kimurai
23
23
  js_errors: false, debug: false, inspector: false, phantomjs_options: []
24
24
  }
25
25
 
26
+ if extensions = @config[:extensions].presence
27
+ driver_options[:extensions] = extensions
28
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
29
+ end
30
+
26
31
  # Window size
27
32
  if size = @config[:window_size].presence
28
33
  driver_options[:window_size] = size
@@ -73,7 +78,7 @@ module Kimurai
73
78
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
74
79
 
75
80
  @browser.driver.add_header("User-Agent", user_agent_string)
76
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user-agent"
81
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
77
82
  end
78
83
 
79
84
  # Cookies
@@ -87,65 +92,65 @@ module Kimurai
87
92
 
88
93
  # Browser instance options
89
94
  # retry_request_errors
90
- if errors = @config.dig(:browser, :retry_request_errors).presence
95
+ if errors = @config[:retry_request_errors].presence
91
96
  @browser.config.retry_request_errors = errors
92
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser retry_request_errors`"
97
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
93
98
  end
94
99
 
95
100
  # restart_if
96
- if requests_limit = @config.dig(:browser, :restart_if, :requests_limit).presence
101
+ if requests_limit = @config.dig(:restart_if, :requests_limit).presence
97
102
  @browser.config.restart_if[:requests_limit] = requests_limit
98
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser restart_if requests_limit` >= #{requests_limit}"
103
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
99
104
  end
100
105
 
101
- if memory_limit = @config.dig(:browser, :restart_if, :memory_limit).presence
106
+ if memory_limit = @config.dig(:restart_if, :memory_limit).presence
102
107
  @browser.config.restart_if[:memory_limit] = memory_limit
103
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser restart_if memory_limit` >= #{memory_limit}"
108
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
104
109
  end
105
110
 
106
111
  # before_request clear_cookies
107
- if @config.dig(:browser, :before_request, :clear_cookies)
112
+ if @config.dig(:before_request, :clear_cookies)
108
113
  @browser.config.before_request[:clear_cookies] = true
109
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request clear_cookies`"
114
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
110
115
  end
111
116
 
112
117
  # before_request clear_and_set_cookies
113
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
118
+ if @config.dig(:before_request, :clear_and_set_cookies)
114
119
  if cookies = @config[:cookies].presence
115
120
  @browser.config.cookies = cookies
116
121
  @browser.config.before_request[:clear_and_set_cookies] = true
117
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request clear_and_set_cookies`"
122
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
118
123
  else
119
- logger.error "BrowserBuilder (poltergeist_phantomjs): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
124
+ logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
120
125
  end
121
126
  end
122
127
 
123
128
  # before_request change_user_agent
124
- if @config.dig(:browser, :before_request, :change_user_agent)
129
+ if @config.dig(:before_request, :change_user_agent)
125
130
  if @config[:user_agent].present? && @config[:user_agent].class == Proc
126
131
  @browser.config.user_agent = @config[:user_agent]
127
132
  @browser.config.before_request[:change_user_agent] = true
128
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request change_user_agent`"
133
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
129
134
  else
130
- logger.error "BrowserBuilder (poltergeist_phantomjs): `user_agent` should be present and has lambda format to enable `browser before_request change_user_agent`, skipped"
135
+ logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
131
136
  end
132
137
  end
133
138
 
134
139
  # before_request change_proxy
135
- if @config.dig(:browser, :before_request, :change_proxy)
140
+ if @config.dig(:before_request, :change_proxy)
136
141
  if @config[:proxy].present? && @config[:proxy].class == Proc
137
142
  @browser.config.proxy = @config[:proxy]
138
143
  @browser.config.before_request[:change_proxy] = true
139
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request change_proxy`"
144
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
140
145
  else
141
- logger.error "BrowserBuilder (poltergeist_phantomjs): `proxy` should be present and has lambda format to enable `browser before_request change_proxy`, skipped"
146
+ logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
142
147
  end
143
148
  end
144
149
 
145
150
  # before_request delay
146
- if delay = @config.dig(:browser, :before_request, :delay).presence
151
+ if delay = @config.dig(:before_request, :delay).presence
147
152
  @browser.config.before_request[:delay] = delay
148
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request delay`"
153
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
149
154
  end
150
155
 
151
156
  # return Capybara session instance
@@ -75,7 +75,7 @@ module Kimurai
75
75
  if user_agent = @config[:user_agent].presence
76
76
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
77
77
  driver_options.args << "--user-agent='#{user_agent_string}'"
78
- logger.debug "BrowserBuilder (selenium_chrome): enabled custom user-agent"
78
+ logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent"
79
79
  end
80
80
 
81
81
  # Headless mode
@@ -107,11 +107,9 @@ module Kimurai
107
107
  @browser.spider = spider
108
108
  logger.debug "BrowserBuilder (selenium_chrome): created browser instance"
109
109
 
110
- # Window size
111
- # if size = @config[:window_size].presence
112
- # @browser.current_window.resize_to(*size)
113
- # logger.debug "BrowserBuilder (selenium_chrome): enabled window_size"
114
- # end
110
+ if @config[:extensions].present?
111
+ logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped"
112
+ end
115
113
 
116
114
  # Cookies
117
115
  if cookies = @config[:cookies].presence
@@ -121,53 +119,53 @@ module Kimurai
121
119
 
122
120
  # Browser instance options
123
121
  # retry_request_errors
124
- if errors = @config.dig(:browser, :retry_request_errors).presence
122
+ if errors = @config[:retry_request_errors].presence
125
123
  @browser.config.retry_request_errors = errors
126
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser retry_request_errors`"
124
+ logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors"
127
125
  end
128
126
 
129
127
  # restart_if
130
- if requests_limit = @config.dig(:browser, :restart_if, :requests_limit).presence
128
+ if requests_limit = @config.dig(:restart_if, :requests_limit).presence
131
129
  @browser.config.restart_if[:requests_limit] = requests_limit
132
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser restart_if requests_limit` >= #{requests_limit}"
130
+ logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}"
133
131
  end
134
132
 
135
- if memory_limit = @config.dig(:browser, :restart_if, :memory_limit).presence
133
+ if memory_limit = @config.dig(:restart_if, :memory_limit).presence
136
134
  @browser.config.restart_if[:memory_limit] = memory_limit
137
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser restart_if memory_limit` >= #{memory_limit}"
135
+ logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}"
138
136
  end
139
137
 
140
138
  # before_request clear_cookies
141
- if @config.dig(:browser, :before_request, :clear_cookies)
139
+ if @config.dig(:before_request, :clear_cookies)
142
140
  @browser.config.before_request[:clear_cookies] = true
143
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser before_request clear_cookies`"
141
+ logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies"
144
142
  end
145
143
 
146
144
  # before_request clear_and_set_cookies
147
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
145
+ if @config.dig(:before_request, :clear_and_set_cookies)
148
146
  if cookies = @config[:cookies].presence
149
147
  @browser.config.cookies = cookies
150
148
  @browser.config.before_request[:clear_and_set_cookies] = true
151
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser before_request clear_and_set_cookies`"
149
+ logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies"
152
150
  else
153
- logger.error "BrowserBuilder (selenium_chrome): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
151
+ logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
154
152
  end
155
153
  end
156
154
 
157
155
  # before_request change_user_agent
158
- if @config.dig(:browser, :before_request, :change_user_agent)
159
- logger.error "BrowserBuilder (selenium_chrome): `browser before_request change_user_agent` option not supported by Selenium, skipped"
156
+ if @config.dig(:before_request, :change_user_agent)
157
+ logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped"
160
158
  end
161
159
 
162
160
  # before_request change_proxy
163
- if @config.dig(:browser, :before_request, :change_proxy)
164
- logger.error "BrowserBuilder (selenium_chrome): `browser before_request change_proxy` option not supported by Selenium, skipped"
161
+ if @config.dig(:before_request, :change_proxy)
162
+ logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped"
165
163
  end
166
164
 
167
165
  # before_request delay
168
- if delay = @config.dig(:browser, :before_request, :delay).presence
166
+ if delay = @config.dig(:before_request, :delay).presence
169
167
  @browser.config.before_request[:delay] = delay
170
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser before_request delay`"
168
+ logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay"
171
169
  end
172
170
 
173
171
  # return Capybara session instance