dataflow-rb 0.12.1 → 0.13.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 17c81e73d137d2c613e6cc676346970b3473c2ad
- data.tar.gz: febcd66be2ba8004d374422a23cf39c33af4dc63
+ metadata.gz: 72840e2477fe869fb06b0299c96d5ae2a57c7713
+ data.tar.gz: f9f03314f23473585a9e742740c0e809f2d99bc7
  SHA512:
- metadata.gz: 6200fd8715421654f1c8a62c812fa8914d78d11f2c002d07279ac49d29f0fde8cb11325530cea56d9dc707ba651bf632c45c74af6d7dad64bc0143e68f6099ca
- data.tar.gz: 70bbf3c790c6d47dddac9e932fb24b5159d3039a7a327066fff84af9feb26e23804216e94a72155a164844119a5eed7f30006a1111e2febc772fd973557daca0
+ metadata.gz: 43f7cef4b2150017871cb7b3c0f21602a01f385e40d07ddf7000f455a4adc007669974fd4e7170e4acc3807feae907f6114e3b3cbfbdbbf36f96348c3a06f60c
+ data.tar.gz: d16411f178fa8ccc00cc9dbaefd0905040a5f8354874f7b00ad5080681d912fa8051dd16e3975d25aed8c737fdf70594f3bb77a948e95665d92a198f0e65206c
@@ -1,5 +1,23 @@
  # Changelog
 
+
+ #### 0.13.0
+ - [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
+ - [a17f071] Add runtime query node. Make the ops transformation public.
+ - [8c78aa2] Added support for a per-node backup/restore
+ - [6069ec0] Moved the db settings to the settings class
+ - [b5a77fc] Set the last update time using a query directly on the DB. Do not return unneeded information from the recompute/explain method
+ - [cc77366] Explain why a node needs an update
+ - [e87ba14] Add logging to the sql query node
+ - [5d82dfc] Fix logging during the sql table creation.
+ - [7390264] Add a read-only data node
+ - [dbb14ed] Refactor the debugging implementation
+ - [38925a3] Added parameters on the data node to flexibly connect to any database
+ - [7aac1eb] Add support for partial (where clause) parallel queries generation.
+
+ #### 0.12.1
+ - [110ded7] Fix compute node not processing in parallel
+
  #### 0.12.0
  - [4a510df] Add support for case insensitive regex matching on mysql
  - [63b0771] Add logging to understand the current computation batch progress
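
The per-node backup/restore from [8c78aa2] surfaces as `dump_dataset`/`restore_dataset` on data nodes (see the `DataNode` hunk further down). A minimal usage sketch, assuming a node named 'users' already exists and the `mongodump`/`pg_dump` binaries are on the PATH:

```ruby
require 'dataflow-rb'

node = Dataflow::Nodes::DataNode.find_by(name: 'users')

# Writes <base_folder>/<db_name>.<dataset_name>.gz (mongodb) or .dump (postgresql)
archive_path = node.dump_dataset(base_folder: './backups')

# Restore the same dataset from that archive later on
node.restore_dataset(filepath: archive_path)
```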
@@ -36,6 +36,8 @@ require 'dataflow/nodes/compute_node'
  require 'dataflow/nodes/join_node'
  require 'dataflow/nodes/map_node'
  require 'dataflow/nodes/merge_node'
+ require 'dataflow/nodes/read_only_data_node'
+ require 'dataflow/nodes/runtime_query_node'
  require 'dataflow/nodes/select_keys_node'
  require 'dataflow/nodes/snapshot_node'
  require 'dataflow/nodes/sql_query_node'
@@ -9,9 +9,9 @@ module Dataflow
  class << self
  def client(settings, db_name: nil)
  @clients ||= {}
- host = ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
- port = '27017'
- connection_uri = settings.connection_uri || "#{host}:#{port}"
+
+ settings.adapter_type = 'mongodb'
+ connection_uri = settings.connection_uri_or_default
  db_name ||= settings.db_name
  @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
  end
@@ -113,8 +113,8 @@ module Dataflow
  end
 
  # Create queries that permit processing the whole dataset in parallel without using offsets.
- def ordered_system_id_queries(batch_size:)
- ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+ def ordered_system_id_queries(batch_size:, where: {})
+ ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
  queries_count = (ids.size / batch_size.to_f).ceil
  Array.new(queries_count) do |i|
  from = ids[i * batch_size]
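
The new `where:` parameter makes it possible to parallelize over a filtered subset of the dataset. A sketch of the intended pattern, mirroring the usage comment in the `DataNode` hunk below; `node` and `process` are placeholders:

```ruby
where = { 'status' => 'active' }
queries = node.ordered_system_id_queries(batch_size: 1000, where: where)

Parallel.each(queries) do |query|
  # each query covers a disjoint _id range; merge the original filter back in
  process(node.all(where: query.merge(where)))
end
```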
@@ -225,14 +225,25 @@ module Dataflow
  }
  end
 
- private
-
- def write_dataset_name
- settings.write_dataset_name
+ def dump(base_folder:)
+ archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.gz"
+ options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+ options += " --host=#{@settings.db_host}" if @settings.db_host.present?
+ options += " --port=#{@settings.db_port}" if @settings.db_port.present?
+ options += " --username=#{@settings.db_user}" if @settings.db_user.present?
+ options += " --password=#{@settings.db_password}" if @settings.db_password.present?
+ `mkdir -p #{base_folder}`
+ `mongodump #{options} --gzip`
+ archive_path
  end
 
- def read_dataset_name
- settings.read_dataset_name
+ def restore(filepath:)
+ options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+ options += " --host=#{@settings.db_host}" if @settings.db_host.present?
+ options += " --port=#{@settings.db_port}" if @settings.db_port.present?
+ options += " --username=#{@settings.db_user}" if @settings.db_user.present?
+ options += " --password=#{@settings.db_password}" if @settings.db_password.present?
+ `mongorestore #{options} --gzip`
  end
 
  def transform_to_query(opts)
@@ -275,6 +286,16 @@ module Dataflow
  sanitized_opts
  end
 
+ private
+
+ def write_dataset_name
+ settings.write_dataset_name
+ end
+
+ def read_dataset_name
+ settings.read_dataset_name
+ end
+
  def try_cast_value(field, value)
  # cast to time when querying on _mojaco_updated_at
  return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
@@ -24,6 +24,28 @@ module Dataflow
  def regex_case_insensitive_op
  '~*'
  end
+
+ def dump(base_folder:)
+ archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.dump"
+ options = "--table=public.#{@settings.read_dataset_name}"
+ options += " --host=#{@settings.db_host}" if @settings.db_host.present?
+ options += " --port=#{@settings.db_port}" if @settings.db_port.present?
+ options += " --username=#{@settings.db_user}" if @settings.db_user.present?
+ password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+ `mkdir -p #{base_folder}`
+ `#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
+ archive_path
+ end
+
+ def restore(filepath:)
+ options = "--table=#{@settings.read_dataset_name}"
+ options += " --host=#{@settings.db_host}" if @settings.db_host.present?
+ options += " --port=#{@settings.db_port}" if @settings.db_port.present?
+ options += " --username=#{@settings.db_user}" if @settings.db_user.present?
+ password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+ `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
+ end
  end
  end
  end
@@ -2,16 +2,23 @@
  module Dataflow
  module Adapters
  class Settings
- attr_accessor :connection_uri, :db_name, :indexes, :adapter_type,
- :dataset_name, :read_dataset_name, :write_dataset_name, :schema
+ attr_accessor :connection_uri, :db_name,
+ :db_host, :db_port, :db_user, :db_password,
+ :dataset_name, :read_dataset_name, :write_dataset_name,
+ :indexes, :adapter_type, :schema
 
  def initialize(data_node: nil, connection_uri: nil, db_name: nil,
+ db_host: nil, db_port: nil, db_user: nil, db_password: nil,
  dataset_name: nil, indexes: nil, adapter_type: nil, schema: nil)
  @connection_uri = connection_uri
 
  # first try to set the options based on the data node settings
  if data_node.present?
  @db_name = data_node.db_name
+ @db_host = data_node.db_host
+ @db_port = data_node.db_port
+ @db_user = data_node.db_user
+ @db_password = data_node.db_password
  @dataset_name = data_node.name
  @read_dataset_name = data_node.read_dataset_name
  @write_dataset_name = data_node.write_dataset_name
@@ -21,6 +28,10 @@ module Dataflow
 
  # override if needed
  @db_name ||= db_name
+ @db_host ||= db_host
+ @db_port ||= db_port
+ @db_user ||= db_user
+ @db_password ||= db_password
  @dataset_name ||= dataset_name
  @read_dataset_name ||= dataset_name
  @write_dataset_name ||= dataset_name
@@ -28,6 +39,62 @@ module Dataflow
  @adapter_type ||= adapter_type
  @schema ||= schema
  end
+
+ def set_mongodb_defaults_if_needed!
+ @db_host ||= ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+ @db_port ||= ENV['MOJACO_MONGO_PORT'] || '27017'
+ @db_user ||= ENV['MOJACO_MONGO_USER']
+ @db_password ||= ENV['MOJACO_MONGO_PASSWORD']
+ end
+
+ def set_postgresql_defaults_if_needed!
+ @db_host ||= ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
+ @db_port ||= ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
+ @db_user ||= ENV['MOJACO_POSTGRESQL_USER']
+ @db_password ||= ENV['MOJACO_POSTGRESQL_PASSWORD']
+ end
+
+ def set_mysql_defaults_if_needed!
+ @db_host ||= ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
+ @db_port ||= ENV['MOJACO_MYSQL_PORT'] || '3306'
+ @db_user ||= ENV['MOJACO_MYSQL_USER']
+ @db_password ||= ENV['MOJACO_MYSQL_PASSWORD']
+ end
+
+ def connection_uri_or_default
+ return @connection_uri if @connection_uri.present?
+
+ send("#{@adapter_type}_default_connection_uri")
+ end
+
+ def mongodb_default_connection_uri
+ set_mongodb_defaults_if_needed!
+
+ # if user/password are empty, the user_password will be empty as well
+ user_password = @db_user
+ user_password += ":#{@db_password}" if @db_password.present?
+ user_password += '@' if user_password.present?
+
+ # [username:password@]host1[:port1]
+ "#{user_password}#{@db_host}:#{@db_port}"
+ end
+
+ def mysql_default_connection_uri
+ set_mysql_defaults_if_needed!
+ sql_default_connection_uri('mysql2')
+ end
+
+ def postgresql_default_connection_uri
+ set_postgresql_defaults_if_needed!
+ sql_default_connection_uri('postgresql')
+ end
+
+ def sql_default_connection_uri(scheme)
+ user_password = @db_user
+ user_password += ":#{@db_password}" if @db_password.present?
+
+ "#{scheme}://#{user_password}@#{@db_host}:#{@db_port}"
+ end
  end
  end
  end
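
With the credentials centralized in `Settings`, each adapter can derive a default connection URI from the MOJACO_* environment variables or explicit values. A sketch of the behavior, assuming the environment variables are unset:

```ruby
settings = Dataflow::Adapters::Settings.new(
  db_name: 'dataflow_test',
  db_user: 'app',
  db_password: 'secret',
  adapter_type: 'postgresql'
)

settings.connection_uri_or_default
# => "postgresql://app:secret@127.0.0.1:5432"
```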
@@ -9,37 +9,17 @@ module Dataflow
  # @param settings [Hash] Represents the connection settings to the DB.
  # @param db_name [String] The database name to which the client will connect.
  # @return [Sequel::Database] a sequel database object.
- def client(settings, db_name: nil)
+ def client(settings)
  @clients ||= {}
-
- case settings.adapter_type
- when 'mysql2'
- host = ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
- port = ENV['MOJACO_MYSQL_PORT'] || '3306'
- user = ENV['MOJACO_MYSQL_USER']
- password = ENV['MOJACO_MYSQL_PASSWORD']
- when 'postgresql'
- host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
- port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
- user = ENV['MOJACO_POSTGRESQL_USER']
- password = ENV['MOJACO_POSTGRESQL_PASSWORD']
- end
-
- db_name ||= settings.db_name
- user_password = user
- user_password += ":#{password}" if password.present?
-
- uri = "#{settings.adapter_type}://#{user_password}@#{host}:#{port}"
- connection_uri = settings.connection_uri || "#{uri}/#{db_name}"
-
+ connection_uri = settings.connection_uri_or_default
  return @clients[connection_uri] if @clients[connection_uri].present?
 
  # first, make sure the DB is created (if it is not an external db)
  is_external_db = settings.connection_uri.present?
- try_create_db(uri, db_name, user, password) unless is_external_db
+ try_create_db(connection_uri, settings.db_name) unless is_external_db
 
  # then, create the connection object
- db = Sequel.connect("#{connection_uri}?encoding=utf8")
+ db = Sequel.connect("#{connection_uri}/#{settings.db_name}?encoding=utf8")
  add_extensions(settings, db)
  @clients[connection_uri] = db
  end
@@ -48,8 +28,8 @@ module Dataflow
  # @param uri [String] the connection uri to the DB.
  # @param db_name [String] the database name.
  # @return [Boolean] whether the db was created or not.
- def try_create_db(uri, db_name, user, password)
- Sequel.connect(uri, user: user, password: password) do |db|
+ def try_create_db(uri, db_name)
+ Sequel.connect(uri) do |db|
  db.run("CREATE DATABASE #{db_name}")
  true
  end
@@ -108,7 +88,7 @@ module Dataflow
 
  (sort || {}).each do |k, v|
  sort_value = v == 1 ? k.to_sym : Sequel.desc(k.to_sym)
- res = res.order(sort_value)
+ res = res.order_append(sort_value)
  end
 
  res = res.offset(offset) if offset > 0
@@ -127,8 +107,8 @@ module Dataflow
  end
 
  # Create queries that permit processing the whole dataset in parallel without using offsets.
- def ordered_system_id_queries(batch_size:)
- ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
+ def ordered_system_id_queries(batch_size:, where: {})
+ ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
  queries_count = (ids.size / batch_size.to_f).ceil
  Array.new(queries_count) do |i|
  from = ids[i * batch_size]
@@ -195,7 +175,7 @@ module Dataflow
  def recreate_dataset(dataset: nil)
  dataset ||= settings.write_dataset_name.to_sym
  drop_dataset(dataset)
- create_table(dataset, @schema)
+ create_table(dataset, @schema, logger)
  end
 
  # drops the given dataset
@@ -248,12 +228,40 @@ module Dataflow
  table_usage.merge(effective_indexes: indexes)
  end
 
+ def transform_to_query(opts)
+ # map to a series of AND clause queries
+ opts.flat_map do |k, v|
+ if v.is_a? Hash
+ v.map do |operator, value|
+ case operator
+ when '!='
+ if value.is_a? Array
+ Sequel.lit("#{k} NOT IN ?", value)
+ else
+ Sequel.lit("#{k} <> ?", value)
+ end
+ when '<', '<=', '>', '>='
+ Sequel.lit("#{k} #{operator} ?", value)
+ when '~'
+ Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
+ when '~*'
+ Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
+ end
+ end
+ else
+ # e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
+ # are supported with simple hashes
+ [[{ k.to_sym => v }]]
+ end
+ end
+ end
+
  private
 
  MAX_INT = 2_147_483_647
  MAX_VARCHAR = 255
 
- def create_table(dataset, schema)
+ def create_table(dataset, schema, logger)
  client.create_table(dataset.to_sym) do
  # always add an _id field to be used internally
  primary_key SYSTEM_ID
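
Since [a17f071] makes the ops transformation public, callers can inspect how a where-hash maps onto Sequel expressions. A rough sketch of input and output, with `adapter` standing in for any SqlAdapter instance:

```ruby
adapter.transform_to_query('age' => { '>=' => 21 }, 'name' => { '~*' => 'smith' })
# => [Sequel.lit("age >= ?", 21), Sequel.lit("name ~* ?", "smith")]

# simple values map to equality / IN clauses
adapter.transform_to_query('id' => [1, 2, 3])
# => [[{ id: [1, 2, 3] }]]
```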
@@ -309,34 +317,6 @@ module Dataflow
  res
  end
 
- def transform_to_query(opts)
- # map to a serie of AND clauses queries
- opts.flat_map do |k, v|
- if v.is_a? Hash
- v.map do |operator, value|
- case operator
- when '!='
- if value.is_a? Array
- Sequel.lit("#{k} NOT IN ?", value)
- else
- Sequel.lit("#{k} <> ?", value)
- end
- when '<', '<=', '>', '>='
- Sequel.lit("#{k} #{operator} ?", value)
- when '~'
- Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
- when '~*'
- Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
- end
- end
- else
- # e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
- # are supported with simples hashes
- [[{ k.to_sym => v }]]
- end
- end
- end
-
  # Required index format for sequel:
  # :keys, unique: true
  def index_parameters(index)
@@ -14,10 +14,6 @@ module Dataflow
  Dataflow::Nodes::ComputeNode.find(id)
  end
 
- def updated?
- true
- end
-
  def recompute(*args)
  # Interface only, for recursion purposes
  end
@@ -156,6 +156,22 @@ module Dataflow
  true
  end
 
+ # Logs the dependency tree's update times and whether
+ # or not each node should be updated. Useful to understand
+ # why a given node had to be recomputed.
+ def explain_update(depth: 0, verbose: false)
+ if depth == 0 || !updated? || verbose
+ logger.log("#{'>' * (depth + 1)} #{name} [COMPUTE] | #{updated? ? 'UPDATED' : 'OLD'} = #{updated_at}")
+ end
+
+ return if updated? && !verbose
+
+ dependencies.each do |dependency|
+ dependency.explain_update(depth: depth + 1, verbose: verbose)
+ end
+ true
+ end
+
  # Keep a uniform interface with a DataNode.
  def updated_at
  last_compute_starting_time
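
A sketch of how [cc77366] reads in practice, assuming a compute node with one stale dependency; the output format follows the logger calls above:

```ruby
node.explain_update(verbose: true)
# > reports_node [COMPUTE] | OLD = 2017-05-20 10:00:00 UTC
# >> users_node [Dataset] | UPDATED = 2017-05-22 09:30:00 UTC
```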
@@ -183,11 +199,11 @@ module Dataflow
  # even if the node is already up to date.
  def recompute(depth: 0, force_recompute: false)
  send_heartbeat
- logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
+ logger.log("#{'>' * (depth + 1)} #{name} started recomputing...")
  start_time = Time.now
 
  parallel_each(dependencies) do |dependency|
- logger.log "#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}..."
+ logger.log("#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}...")
  if !dependency.updated? || force_recompute
  dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
  end
@@ -196,11 +212,11 @@ module Dataflow
 
  # Dependencies data may have changed in a child process.
  # Reload to make sure we have the latest metadata.
- logger.log "#{'>' * (depth + 1)} #{name} reloading dependencies..."
+ logger.log("#{'>' * (depth + 1)} #{name} reloading dependencies...")
  dependencies(reload: true)
 
  compute(depth: depth, force_compute: force_recompute)
- logger.log "#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute."
+ logger.log("#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute.")
 
  true
  end
@@ -216,13 +232,13 @@ module Dataflow
  validate!
 
  if updated? && !force_compute
- logger.log "#{'>' * (depth + 1)} #{name} is up-to-date."
+ logger.log("#{'>' * (depth + 1)} #{name} is up-to-date.")
  return
  end
 
  has_compute_lock = acquire_computing_lock!
  if has_compute_lock
- logger.log "#{'>' * (depth + 1)} #{name} started computing."
+ logger.log("#{'>' * (depth + 1)} #{name} started computing.")
  on_computing_started
  start_time = Time.now
 
@@ -254,15 +270,15 @@ module Dataflow
  data_node&.swap_read_write_datasets!
  end
 
- self.last_compute_starting_time = start_time
- save
+ set_last_compute_starting_time(start_time)
  duration = Time.now - start_time
- logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
+ logger.log("#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute.")
  on_computing_finished(state: 'computed')
+ true
  else
- logger.log "#{'>' * (depth + 1)} [IS AWAITING] #{name}."
+ logger.log("#{'>' * (depth + 1)} [IS AWAITING] #{name}.")
  await_computing!
- logger.log "#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}."
+ logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
  end
 
  rescue StandardError => e
@@ -412,6 +428,17 @@ module Dataflow
  .find_one_and_update(update_query)
  end
 
+ def set_last_compute_starting_time(time)
+ # this is just to avoid a reload,
+ # but the change will not be propagated across processes
+ self.last_compute_starting_time = time
+ # update directly on the DB
+ update_query = { '$set' => { last_compute_starting_time: time } }
+ Dataflow::Nodes::ComputeNode.where(_id: _id)
+ .find_one_and_update(update_query)
+ end
+
  ##############################
  # Dependency validations
  ##############################
@@ -505,24 +532,21 @@ module Dataflow
  Mongoid.disconnect_clients
 
  # set the DEBUG_DATAFLOW env variable to debug code in the iteration
- is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
- if is_debugging_impl
- itr.each do |*args|
- yield(*args)
- end
- else
- opts = if max_parallel_processes > 0
- { in_processes: max_parallel_processes }
- else
- {}
- end
-
- Parallel.each(itr, opts) do |*args|
- yield(*args)
- Dataflow::Adapters::SqlAdapter.disconnect_clients
- Dataflow::Adapters::MongoDbAdapter.disconnect_clients
- Mongoid.disconnect_clients
- end
+ is_debugging_impl = ENV['DEBUG_DATAFLOW']
+ opts = if is_debugging_impl
+ # this will turn off the parallel processing
+ { in_processes: 0 }
+ elsif max_parallel_processes > 0
+ { in_processes: max_parallel_processes }
+ else
+ {}
+ end
+
+ Parallel.each(itr, opts) do |*args|
+ yield(*args)
+ Dataflow::Adapters::SqlAdapter.disconnect_clients
+ Dataflow::Adapters::MongoDbAdapter.disconnect_clients
+ Mongoid.disconnect_clients
  end
  end
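
The refactored switch from [dbb14ed] replaces the RACK_ENV/DEBUG pair with a single DEBUG_DATAFLOW variable. A sketch of debugging a computation serially, with `node` as a placeholder:

```ruby
# Forces { in_processes: 0 }, so Parallel yields in the current process
# and breakpoints inside the block behave normally.
ENV['DEBUG_DATAFLOW'] = '1'
node.recompute
```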
@@ -24,11 +24,19 @@ module Dataflow
  # make sure we have only one node per db/table combination
  index({ db_name: 1, name: 1 }, unique: true)
 
+ # The dataset name used by this node for storage.
+ field :name, type: String, editable: false
+
  # The database name used by this node
  field :db_name, type: String, editable: false
- # The dataset name used by this node for storage.
- field :name, type: String
+ # The database host (uses the ENV settings by default)
+ field :db_host, type: String, editable: false
+ # The database port (uses the ENV settings by default)
+ field :db_port, type: String, editable: false
+ # The database user (uses the ENV settings by default)
+ field :db_user, type: String, editable: false
+ # The database password (uses the ENV settings by default)
+ field :db_password, type: String, editable: false
 
  # The schema of this node
  field :schema, type: Hash, editable: false
@@ -163,8 +171,8 @@ module Dataflow
  # Parallel.each(queries) do |query|
  # process(node.all(where: query))
  # end
- def ordered_system_id_queries(batch_size:)
- db_adapter.ordered_system_id_queries(batch_size: batch_size)
+ def ordered_system_id_queries(batch_size:, where: {})
+ db_adapter.ordered_system_id_queries(batch_size: batch_size, where: where)
  end
 
  # Counts how many records match the condition, or all if no condition is given.
@@ -297,6 +305,13 @@ module Dataflow
  (db_backend.to_s =~ /sql/).present?
  end
 
+ def updated?
+ true
+ end
+
+ def explain_update(depth: 0, verbose: false)
+ logger.log("#{'>' * (depth + 1)} #{name} [Dataset] | UPDATED = #{updated_at}")
+ end
 
  def required_by
  super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
@@ -323,6 +338,18 @@ module Dataflow
  db_adapter.drop_dataset(read_dataset_name)
  end
 
+ # Dump a backup of this dataset to a file.
+ # @return [String] the filepath to the dump file.
+ def dump_dataset(base_folder: './dump')
+ db_adapter.dump(base_folder: base_folder)
+ end
+
+ # Restore a dump of this dataset.
+ # @param filepath [String] the filepath to the dump file.
+ def restore_dataset(filepath:)
+ db_adapter.restore(filepath: filepath)
+ end
+
  private
 
  def db_adapter(connection_opts = {})
@@ -342,9 +369,9 @@ module Dataflow
  @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
  return @csv_adapter
  when 'mysql'
- opts[:adapter_type] = 'mysql2'
+ opts[:adapter_type] = 'mysql'
  return Adapters::SqlAdapter.new(opts) if has_options
- @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql2')
+ @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql')
  return @mysql_adapter
  when 'postgresql'
  opts[:adapter_type] = 'postgresql'
@@ -0,0 +1,62 @@
+ # frozen_string_literal: true
+ module Dataflow
+ module Nodes
+ # Only supports read operations
+ class ReadOnlyDataNode < DataNode
+
+ def set_defaults
+ super
+ self.use_double_buffering = false
+ end
+
+ def handle_dataset_settings_changed
+ # ignore - do not do anything
+ end
+
+ def add(*args)
+ raise_read_only_error!
+ end
+
+ def clear(*args)
+ raise_read_only_error!
+ end
+
+ def recreate_dataset(*args)
+ raise_read_only_error!
+ end
+
+ def create_unique_indexes(*args)
+ raise_read_only_error!
+ end
+
+ def create_non_unique_indexes(*args)
+ raise_read_only_error!
+ end
+
+ def read_dataset_name=(*args)
+ raise_read_only_error!
+ end
+
+ def swap_read_write_datasets!
+ raise_read_only_error!
+ end
+
+ def import(*args)
+ raise_read_only_error!
+ end
+
+ def drop_dataset!
+ raise_read_only_error!
+ end
+
+ private
+
+ def raise_read_only_error!
+ raise NotImplementedError, 'External data nodes are read only'
+ end
+
+ end # class ReadOnlyDataNode
+ end # module Nodes
+ end # module Dataflow
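
A sketch of pointing a read-only node at an existing table; the names are hypothetical, and reads go through the regular DataNode interface while writes raise:

```ruby
node = Dataflow::Nodes::ReadOnlyDataNode.create(
  db_name: 'legacy_db',
  name: 'imported_users' # an existing dataset
)

node.all(where: { 'active' => true }) # reads work as on any DataNode
node.add(records: [{ 'id' => 1 }])    # => NotImplementedError: External data nodes are read only
```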
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+ module Dataflow
+ # Interface for a node that behaves as a dataset.
+ # Does not support any operation.
+ # Inherit and override to implement custom behavior.
+ module Nodes
+ class RuntimeQueryNode < DataNode
+
+ after_initialize do
+ self.db_backend = :none
+ end
+
+ def handle_dataset_settings_changed
+ # do not do anything, there is no real dataset
+ end
+
+ def all(*_args)
+ raise NotImplementedError, 'this node does not support #all'
+ end
+
+ def count(*_args)
+ raise NotImplementedError, 'this node does not support #count'
+ end
+
+ def find(*_args)
+ raise NotImplementedError, 'this node does not support #find'
+ end
+
+ def all_paginated(*_args)
+ raise NotImplementedError, 'this node does not support #all_paginated'
+ end
+
+ def add(*_args)
+ raise NotImplementedError, 'this node does not support #add'
+ end
+
+ def clear(*_args)
+ raise NotImplementedError, 'this node does not support #clear'
+ end
+ end
+ end
+ end
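
Per the header comment, the runtime query node is meant to be subclassed so results are produced when the graph runs instead of being read from storage. A hypothetical override:

```ruby
class CurrentTimeNode < Dataflow::Nodes::RuntimeQueryNode
  # Support only the operations that make sense for this node.
  def all(*_args)
    [{ 'now' => Time.now.utc }]
  end
end
```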
@@ -35,7 +35,9 @@ module Dataflow
  end
 
  def execute_query
- data_node.send(:db_adapter).client[computed_query].to_a
+ query = computed_query
+ logger.log(query)
+ data_node.send(:db_adapter).client[query].to_a
  end
 
  private
@@ -60,7 +60,7 @@ module Dataflow
  equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
  count_per_process = [max_per_process, equal_split_per_process].min
 
- queries = ordered_system_id_queries(batch_size: count_per_process)
+ queries = ordered_system_id_queries(batch_size: count_per_process, where: where)
 
  sch = schema_inferrer.infer_schema(batch_count: queries.count, extended: extended) do |idx|
  all(where: queries[idx].merge(where))
@@ -1,4 +1,4 @@
  # frozen_string_literal: true
  module Dataflow
- VERSION = '0.12.1'
+ VERSION = '0.13.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: dataflow-rb
  version: !ruby/object:Gem::Version
- version: 0.12.1
+ version: 0.13.0
  platform: ruby
  authors:
  - Eurico Doirado
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2017-05-13 00:00:00.000000000 Z
+ date: 2017-05-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -336,6 +336,8 @@ files:
  - lib/dataflow/nodes/merge_node.rb
  - lib/dataflow/nodes/mixin/add_internal_timestamp.rb
  - lib/dataflow/nodes/mixin/rename_dotted_fields.rb
+ - lib/dataflow/nodes/read_only_data_node.rb
+ - lib/dataflow/nodes/runtime_query_node.rb
  - lib/dataflow/nodes/select_keys_node.rb
  - lib/dataflow/nodes/snapshot_node.rb
  - lib/dataflow/nodes/sql_query_node.rb