kimurai 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,38 +3,46 @@ require 'csv'
3
3
 
4
4
  module Kimurai
5
5
  class Base
6
- class SimpleSaver
7
- def initialize
6
+ class Saver
7
+ attr_reader :format, :path, :position, :append
8
+
9
+ def initialize(path, format:, position: true, append: false)
10
+ unless %i(json pretty_json jsonlines csv).include?(format)
11
+ raise "SimpleSaver: wrong type of format: #{format}"
12
+ end
13
+
14
+ @path = path
15
+ @format = format
16
+ @position = position
8
17
  @index = 0
18
+ @append = append
9
19
  @mutex = Mutex.new
10
20
  end
11
21
 
12
- def save(path, item, format:, position:)
22
+ def save(item)
13
23
  @mutex.synchronize do
14
24
  @index += 1
15
25
  item[:position] = @index if position
16
26
 
17
27
  case format
18
28
  when :json
19
- save_to_json(item, path)
29
+ save_to_json(item)
20
30
  when :pretty_json
21
- save_to_pretty_json(item, path)
31
+ save_to_pretty_json(item)
22
32
  when :jsonlines
23
- save_to_jsonlines(item, path)
33
+ save_to_jsonlines(item)
24
34
  when :csv
25
- save_to_csv(item, path)
26
- else
27
- raise "SimpleSaver: wrong type of format: #{format}"
35
+ save_to_csv(item)
28
36
  end
29
37
  end
30
38
  end
31
39
 
32
40
  private
33
41
 
34
- def save_to_json(item, path)
42
+ def save_to_json(item)
35
43
  data = JSON.generate([item])
36
44
 
37
- if @index > 1
45
+ if append || @index > 1
38
46
  file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
39
47
  File.open(path, "w") do |f|
40
48
  f.write(file_content + data.sub(/\A\[/, ""))
@@ -44,10 +52,10 @@ module Kimurai
44
52
  end
45
53
  end
46
54
 
47
- def save_to_pretty_json(item, path)
55
+ def save_to_pretty_json(item)
48
56
  data = JSON.pretty_generate([item])
49
57
 
50
- if @index > 1
58
+ if append || @index > 1
51
59
  file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
52
60
  File.open(path, "w") do |f|
53
61
  f.write(file_content + data.sub(/\A\[\n/, ""))
@@ -57,20 +65,20 @@ module Kimurai
57
65
  end
58
66
  end
59
67
 
60
- def save_to_jsonlines(item, path)
68
+ def save_to_jsonlines(item)
61
69
  data = JSON.generate(item)
62
70
 
63
- if @index > 1
71
+ if append || @index > 1
64
72
  File.open(path, "a") { |file| file.write("\n" + data) }
65
73
  else
66
74
  File.open(path, "w") { |file| file.write(data) }
67
75
  end
68
76
  end
69
77
 
70
- def save_to_csv(item, path)
78
+ def save_to_csv(item)
71
79
  data = flatten_hash(item)
72
80
 
73
- if @index > 1
81
+ if append || @index > 1
74
82
  CSV.open(path, "a+", force_quotes: true) do |csv|
75
83
  csv << data.values
76
84
  end
@@ -0,0 +1,91 @@
1
+ require 'pstore'
2
+
3
+ module Kimurai
4
+ class Base
5
# Mutex-guarded store of value lists grouped under named scopes.
#
# When a +path+ is given the data lives in a PStore file at that path and
# every operation runs inside a PStore transaction; without a path the data
# lives in a plain in-memory Hash.
class Storage
  attr_reader :database, :path

  # @param path [String, nil] file used for the PStore backend; nil selects
  #   the in-memory Hash backend.
  def initialize(path = nil)
    @path = path
    @mutex = Mutex.new
    @database = path ? PStore.new(path) : {}
  end

  # Returns the list stored under +scope+ (an empty array if the scope is
  # absent). Without a scope, returns the backing store object itself
  # (the Hash, or the PStore instance for the file backend).
  def all(scope = nil)
    with_store { |store| scope ? store.fetch(scope, []) : store }
  end

  # Returns true if +value+ is already stored under +scope+.
  # As in the original implementation, the scope is created (empty) on
  # first access.
  def include?(scope, value)
    with_store { |store| values_for(store, scope).include?(value) }
  end

  # Appends +value+ to +scope+ unless it is already there.
  # Returns the scope's array when the value was added, nil otherwise.
  def add(scope, value)
    with_store do |store|
      values = values_for(store, scope)
      values.push(value) unless values.include?(value)
    end
  end

  # Check-and-insert in one step: returns false if +value+ is already stored
  # under +scope+, otherwise stores it and returns true.
  def unique?(scope, value)
    with_store do |store|
      values = values_for(store, scope)
      next false if values.include?(value)

      values.push(value)
      true
    end
  end

  # Removes every scope from the store.
  def clear!
    @mutex.synchronize do
      if path
        database.transaction do
          database.roots.each { |key| database.delete key }
        end
      else
        # Assign the instance variable: a bare `database = {}` would only
        # create a method-local variable and leave the store untouched.
        @database = {}
      end
    end
  end

  # Deletes the backing file, if this store has one and it exists.
  # No-op for the in-memory backend.
  def delete!
    @mutex.synchronize do
      # File.exist? (not the removed File.exists? alias) keeps this working
      # on Ruby 3.2+.
      File.delete(path) if path && File.exist?(path)
    end
  end

  private

  # Yields the backing store while holding the mutex, wrapping the call in a
  # PStore transaction when the store is file-backed. Returns the block's value.
  def with_store
    @mutex.synchronize do
      if path
        database.transaction { yield database }
      else
        yield database
      end
    end
  end

  # Returns the array stored under +scope+, creating it when missing.
  def values_for(store, scope)
    store[scope] ||= []
  end
end
90
+ end
91
+ end
@@ -13,6 +13,12 @@ module Kimurai
13
13
  raise "BrowserBuilder: wrong name of engine, available engines: #{AVAILABLE_ENGINES.join(', ')}"
14
14
  end
15
15
 
16
+ if config[:browser].present?
17
+ raise "++++++ BrowserBuilder: browser option is depricated. Now all sub-options inside " \
18
+ "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \
19
+ "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++"
20
+ end
21
+
16
22
  case engine
17
23
  when :mechanize
18
24
  require_relative 'browser_builder/mechanize_builder'
@@ -29,6 +29,10 @@ module Kimurai
29
29
  @browser.spider = spider
30
30
  logger.debug "BrowserBuilder (mechanize): created browser instance"
31
31
 
32
+ if @config[:extensions].present?
33
+ logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
34
+ end
35
+
32
36
  # Proxy
33
37
  if proxy = @config[:proxy].presence
34
38
  proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
@@ -63,7 +67,7 @@ module Kimurai
63
67
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
64
68
 
65
69
  @browser.driver.add_header("User-Agent", user_agent_string)
66
- logger.debug "BrowserBuilder (mechanize): enabled custom user-agent"
70
+ logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
67
71
  end
68
72
 
69
73
  # Cookies
@@ -77,59 +81,59 @@ module Kimurai
77
81
 
78
82
  # Browser instance options
79
83
  # retry_request_errors
80
- if errors = @config.dig(:browser, :retry_request_errors).presence
84
+ if errors = @config[:retry_request_errors].presence
81
85
  @browser.config.retry_request_errors = errors
82
- logger.debug "BrowserBuilder (mechanize): enabled `browser retry_request_errors`"
86
+ logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
83
87
  end
84
88
 
85
89
  # restart_if
86
- if @config.dig(:browser, :restart_if).present?
87
- logger.warn "BrowserBuilder (mechanize): `browser restart_if` options not supported by Mechanize, skipped"
90
+ if @config[:restart_if].present?
91
+ logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped"
88
92
  end
89
93
 
90
94
  # before_request clear_cookies
91
- if @config.dig(:browser, :before_request, :clear_cookies)
95
+ if @config.dig(:before_request, :clear_cookies)
92
96
  @browser.config.before_request[:clear_cookies] = true
93
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_cookies`"
97
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
94
98
  end
95
99
 
96
100
  # before_request clear_and_set_cookies
97
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
101
+ if @config.dig(:before_request, :clear_and_set_cookies)
98
102
  if cookies = @config[:cookies].presence
99
103
  @browser.config.cookies = cookies
100
104
  @browser.config.before_request[:clear_and_set_cookies] = true
101
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_and_set_cookies`"
105
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
102
106
  else
103
- logger.error "BrowserBuilder (mechanize): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
107
+ logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
104
108
  end
105
109
  end
106
110
 
107
111
  # before_request change_user_agent
108
- if @config.dig(:browser, :before_request, :change_user_agent)
112
+ if @config.dig(:before_request, :change_user_agent)
109
113
  if @config[:user_agent].present? && @config[:user_agent].class == Proc
110
114
  @browser.config.user_agent = @config[:user_agent]
111
115
  @browser.config.before_request[:change_user_agent] = true
112
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_user_agent`"
116
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
113
117
  else
114
- logger.error "BrowserBuilder (mechanize): `user_agent` should be present and has lambda format to enable `browser before_request change_user_agent`, skipped"
118
+ logger.error "BrowserBuilder (mechanize): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
115
119
  end
116
120
  end
117
121
 
118
122
  # before_request change_proxy
119
- if @config.dig(:browser, :before_request, :change_proxy)
123
+ if @config.dig(:before_request, :change_proxy)
120
124
  if @config[:proxy].present? && @config[:proxy].class == Proc
121
125
  @browser.config.proxy = @config[:proxy]
122
126
  @browser.config.before_request[:change_proxy] = true
123
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_proxy`"
127
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
124
128
  else
125
- logger.error "BrowserBuilder (mechanize): `proxy` should be present and has lambda format to enable `browser before_request change_proxy`, skipped"
129
+ logger.error "BrowserBuilder (mechanize): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
126
130
  end
127
131
  end
128
132
 
129
133
  # before_request delay
130
- if delay = @config.dig(:browser, :before_request, :delay).presence
134
+ if delay = @config.dig(:before_request, :delay).presence
131
135
  @browser.config.before_request[:delay] = delay
132
- logger.debug "BrowserBuilder (mechanize): enabled `browser before_request delay`"
136
+ logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
133
137
  end
134
138
 
135
139
  # return Capybara session instance
@@ -23,6 +23,11 @@ module Kimurai
23
23
  js_errors: false, debug: false, inspector: false, phantomjs_options: []
24
24
  }
25
25
 
26
+ if extensions = @config[:extensions].presence
27
+ driver_options[:extensions] = extensions
28
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
29
+ end
30
+
26
31
  # Window size
27
32
  if size = @config[:window_size].presence
28
33
  driver_options[:window_size] = size
@@ -73,7 +78,7 @@ module Kimurai
73
78
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
74
79
 
75
80
  @browser.driver.add_header("User-Agent", user_agent_string)
76
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user-agent"
81
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
77
82
  end
78
83
 
79
84
  # Cookies
@@ -87,65 +92,65 @@ module Kimurai
87
92
 
88
93
  # Browser instance options
89
94
  # retry_request_errors
90
- if errors = @config.dig(:browser, :retry_request_errors).presence
95
+ if errors = @config[:retry_request_errors].presence
91
96
  @browser.config.retry_request_errors = errors
92
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser retry_request_errors`"
97
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
93
98
  end
94
99
 
95
100
  # restart_if
96
- if requests_limit = @config.dig(:browser, :restart_if, :requests_limit).presence
101
+ if requests_limit = @config.dig(:restart_if, :requests_limit).presence
97
102
  @browser.config.restart_if[:requests_limit] = requests_limit
98
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser restart_if requests_limit` >= #{requests_limit}"
103
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
99
104
  end
100
105
 
101
- if memory_limit = @config.dig(:browser, :restart_if, :memory_limit).presence
106
+ if memory_limit = @config.dig(:restart_if, :memory_limit).presence
102
107
  @browser.config.restart_if[:memory_limit] = memory_limit
103
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser restart_if memory_limit` >= #{memory_limit}"
108
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
104
109
  end
105
110
 
106
111
  # before_request clear_cookies
107
- if @config.dig(:browser, :before_request, :clear_cookies)
112
+ if @config.dig(:before_request, :clear_cookies)
108
113
  @browser.config.before_request[:clear_cookies] = true
109
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request clear_cookies`"
114
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
110
115
  end
111
116
 
112
117
  # before_request clear_and_set_cookies
113
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
118
+ if @config.dig(:before_request, :clear_and_set_cookies)
114
119
  if cookies = @config[:cookies].presence
115
120
  @browser.config.cookies = cookies
116
121
  @browser.config.before_request[:clear_and_set_cookies] = true
117
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request clear_and_set_cookies`"
122
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
118
123
  else
119
- logger.error "BrowserBuilder (poltergeist_phantomjs): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
124
+ logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
120
125
  end
121
126
  end
122
127
 
123
128
  # before_request change_user_agent
124
- if @config.dig(:browser, :before_request, :change_user_agent)
129
+ if @config.dig(:before_request, :change_user_agent)
125
130
  if @config[:user_agent].present? && @config[:user_agent].class == Proc
126
131
  @browser.config.user_agent = @config[:user_agent]
127
132
  @browser.config.before_request[:change_user_agent] = true
128
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request change_user_agent`"
133
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
129
134
  else
130
- logger.error "BrowserBuilder (poltergeist_phantomjs): `user_agent` should be present and has lambda format to enable `browser before_request change_user_agent`, skipped"
135
+ logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
131
136
  end
132
137
  end
133
138
 
134
139
  # before_request change_proxy
135
- if @config.dig(:browser, :before_request, :change_proxy)
140
+ if @config.dig(:before_request, :change_proxy)
136
141
  if @config[:proxy].present? && @config[:proxy].class == Proc
137
142
  @browser.config.proxy = @config[:proxy]
138
143
  @browser.config.before_request[:change_proxy] = true
139
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request change_proxy`"
144
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
140
145
  else
141
- logger.error "BrowserBuilder (poltergeist_phantomjs): `proxy` should be present and has lambda format to enable `browser before_request change_proxy`, skipped"
146
+ logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
142
147
  end
143
148
  end
144
149
 
145
150
  # before_request delay
146
- if delay = @config.dig(:browser, :before_request, :delay).presence
151
+ if delay = @config.dig(:before_request, :delay).presence
147
152
  @browser.config.before_request[:delay] = delay
148
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled `browser before_request delay`"
153
+ logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
149
154
  end
150
155
 
151
156
  # return Capybara session instance
@@ -75,7 +75,7 @@ module Kimurai
75
75
  if user_agent = @config[:user_agent].presence
76
76
  user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
77
77
  driver_options.args << "--user-agent='#{user_agent_string}'"
78
- logger.debug "BrowserBuilder (selenium_chrome): enabled custom user-agent"
78
+ logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent"
79
79
  end
80
80
 
81
81
  # Headless mode
@@ -107,11 +107,9 @@ module Kimurai
107
107
  @browser.spider = spider
108
108
  logger.debug "BrowserBuilder (selenium_chrome): created browser instance"
109
109
 
110
- # Window size
111
- # if size = @config[:window_size].presence
112
- # @browser.current_window.resize_to(*size)
113
- # logger.debug "BrowserBuilder (selenium_chrome): enabled window_size"
114
- # end
110
+ if @config[:extensions].present?
111
+ logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped"
112
+ end
115
113
 
116
114
  # Cookies
117
115
  if cookies = @config[:cookies].presence
@@ -121,53 +119,53 @@ module Kimurai
121
119
 
122
120
  # Browser instance options
123
121
  # retry_request_errors
124
- if errors = @config.dig(:browser, :retry_request_errors).presence
122
+ if errors = @config[:retry_request_errors].presence
125
123
  @browser.config.retry_request_errors = errors
126
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser retry_request_errors`"
124
+ logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors"
127
125
  end
128
126
 
129
127
  # restart_if
130
- if requests_limit = @config.dig(:browser, :restart_if, :requests_limit).presence
128
+ if requests_limit = @config.dig(:restart_if, :requests_limit).presence
131
129
  @browser.config.restart_if[:requests_limit] = requests_limit
132
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser restart_if requests_limit` >= #{requests_limit}"
130
+ logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}"
133
131
  end
134
132
 
135
- if memory_limit = @config.dig(:browser, :restart_if, :memory_limit).presence
133
+ if memory_limit = @config.dig(:restart_if, :memory_limit).presence
136
134
  @browser.config.restart_if[:memory_limit] = memory_limit
137
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser restart_if memory_limit` >= #{memory_limit}"
135
+ logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}"
138
136
  end
139
137
 
140
138
  # before_request clear_cookies
141
- if @config.dig(:browser, :before_request, :clear_cookies)
139
+ if @config.dig(:before_request, :clear_cookies)
142
140
  @browser.config.before_request[:clear_cookies] = true
143
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser before_request clear_cookies`"
141
+ logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies"
144
142
  end
145
143
 
146
144
  # before_request clear_and_set_cookies
147
- if @config.dig(:browser, :before_request, :clear_and_set_cookies)
145
+ if @config.dig(:before_request, :clear_and_set_cookies)
148
146
  if cookies = @config[:cookies].presence
149
147
  @browser.config.cookies = cookies
150
148
  @browser.config.before_request[:clear_and_set_cookies] = true
151
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser before_request clear_and_set_cookies`"
149
+ logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies"
152
150
  else
153
- logger.error "BrowserBuilder (selenium_chrome): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
151
+ logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
154
152
  end
155
153
  end
156
154
 
157
155
  # before_request change_user_agent
158
- if @config.dig(:browser, :before_request, :change_user_agent)
159
- logger.error "BrowserBuilder (selenium_chrome): `browser before_request change_user_agent` option not supported by Selenium, skipped"
156
+ if @config.dig(:before_request, :change_user_agent)
157
+ logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped"
160
158
  end
161
159
 
162
160
  # before_request change_proxy
163
- if @config.dig(:browser, :before_request, :change_proxy)
164
- logger.error "BrowserBuilder (selenium_chrome): `browser before_request change_proxy` option not supported by Selenium, skipped"
161
+ if @config.dig(:before_request, :change_proxy)
162
+ logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped"
165
163
  end
166
164
 
167
165
  # before_request delay
168
- if delay = @config.dig(:browser, :before_request, :delay).presence
166
+ if delay = @config.dig(:before_request, :delay).presence
169
167
  @browser.config.before_request[:delay] = delay
170
- logger.debug "BrowserBuilder (selenium_chrome): enabled `browser before_request delay`"
168
+ logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay"
171
169
  end
172
170
 
173
171
  # return Capybara session instance