dataflow-rb 0.12.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 17c81e73d137d2c613e6cc676346970b3473c2ad
- data.tar.gz: febcd66be2ba8004d374422a23cf39c33af4dc63
+ metadata.gz: 72840e2477fe869fb06b0299c96d5ae2a57c7713
+ data.tar.gz: f9f03314f23473585a9e742740c0e809f2d99bc7
  SHA512:
- metadata.gz: 6200fd8715421654f1c8a62c812fa8914d78d11f2c002d07279ac49d29f0fde8cb11325530cea56d9dc707ba651bf632c45c74af6d7dad64bc0143e68f6099ca
- data.tar.gz: 70bbf3c790c6d47dddac9e932fb24b5159d3039a7a327066fff84af9feb26e23804216e94a72155a164844119a5eed7f30006a1111e2febc772fd973557daca0
+ metadata.gz: 43f7cef4b2150017871cb7b3c0f21602a01f385e40d07ddf7000f455a4adc007669974fd4e7170e4acc3807feae907f6114e3b3cbfbdbbf36f96348c3a06f60c
+ data.tar.gz: d16411f178fa8ccc00cc9dbaefd0905040a5f8354874f7b00ad5080681d912fa8051dd16e3975d25aed8c737fdf70594f3bb77a948e95665d92a198f0e65206c
@@ -1,5 +1,23 @@
  # Changelog

+
+ #### 0.13.0
+ - [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
+ - [a17f071] Add runtime query node. Make the ops transformation public.
+ - [8c78aa2] Added support for a per-node backup/restore
+ - [6069ec0] Moved the db settings to the settings class
+ - [b5a77fc] Set the last update time using a query directly on the DB. Do not return unneeded information from the recompute/explain method
+ - [cc77366] Explain why a node needs an update
+ - [e87ba14] Add logging to the sql query node
+ - [5d82dfc] Fix logging during the sql table creation.
+ - [7390264] Add a read-only data node
+ - [dbb14ed] Refactor the debugging implementation
+ - [38925a3] Added parameters on the data node to flexibly connec to any database
+ - [7aac1eb] Add support for partial (where clause) parallel queries generation.
+
+ #### 0.12.1
+ - [110ded7] Fix compute node not processing in parallel
+
  #### 0.12.0
  - [4a510df] Add support for case insentive regex matching on mysql
  - [63b0771] Add logging to understand the current computation batch progress
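
The headline 0.13.0 additions, in a minimal usage sketch (the node names `users` and `users_report` are hypothetical; the methods are the ones introduced in the hunks below):

```ruby
require 'dataflow-rb'

# Hypothetical nodes; any persisted DataNode / ComputeNode works the same way.
users  = Dataflow::Nodes::DataNode.find_by(name: 'users')
report = Dataflow::Nodes::ComputeNode.find_by(name: 'users_report')

# Per-node backup/restore (delegates to mongodump/mongorestore or pg_dump/pg_restore).
path = users.dump_dataset(base_folder: './backups')
users.restore_dataset(filepath: path)

# Explain why a node (and its dependency tree) is considered out of date.
report.explain_update(verbose: true)
```
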
@@ -36,6 +36,8 @@ require 'dataflow/nodes/compute_node'
  require 'dataflow/nodes/join_node'
  require 'dataflow/nodes/map_node'
  require 'dataflow/nodes/merge_node'
+ require 'dataflow/nodes/read_only_data_node'
+ require 'dataflow/nodes/runtime_query_node'
  require 'dataflow/nodes/select_keys_node'
  require 'dataflow/nodes/snapshot_node'
  require 'dataflow/nodes/sql_query_node'
@@ -9,9 +9,9 @@ module Dataflow
  class << self
  def client(settings, db_name: nil)
  @clients ||= {}
- host = ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
- port = '27017'
- connection_uri = settings.connection_uri || "#{host}:#{port}"
+
+ settings.adapter_type = 'mongodb'
+ connection_uri = settings.connection_uri_or_default
  db_name ||= settings.db_name
  @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
  end
@@ -113,8 +113,8 @@ module Dataflow
  end

  # Create queries that permit processing the whole dataset in parallel without using offsets.
- def ordered_system_id_queries(batch_size:)
- ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+ def ordered_system_id_queries(batch_size:, where: {})
+ ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
  queries_count = (ids.size / batch_size.to_f).ceil
  Array.new(queries_count) do |i|
  from = ids[i * batch_size]
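
The new `where:` keyword pre-filters the ids before they are sliced into `_id` ranges; this is the "partial (where clause) parallel queries generation" entry in the changelog. A sketch of the intended consumption pattern, following the usage comment on `DataNode#ordered_system_id_queries` later in this diff (node name, filter, and the `process` helper are hypothetical):

```ruby
node   = Dataflow::Nodes::DataNode.find_by(name: 'events')
filter = { 'status' => 'active' }

# Each query is an _id range covering at most batch_size records that match the filter.
queries = node.ordered_system_id_queries(batch_size: 10_000, where: filter)

Parallel.each(queries) do |query|
  process(node.all(where: query.merge(filter))) # re-apply the filter when reading the batch
end
```
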
@@ -225,14 +225,25 @@ module Dataflow
  }
  end

- private
-
- def write_dataset_name
- settings.write_dataset_name
+ def dump(base_folder:)
+ archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.gz"
+ options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+ options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+ options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+ options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+ options += "--password=#{@settings.db_password}" if @settings.db_password.present?
+ `mkdir -p #{base_folder}`
+ `mongodump #{options} --gzip`
+ archive_path
  end

- def read_dataset_name
- settings.read_dataset_name
+ def restore(filepath:)
+ options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+ options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+ options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+ options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+ options += "--password=#{@settings.db_password}" if @settings.db_password.present?
+ `mongorestore #{options} --gzip`
  end

  def transform_to_query(opts)
@@ -275,6 +286,16 @@ module Dataflow
  sanitized_opts
  end

+ private
+
+ def write_dataset_name
+ settings.write_dataset_name
+ end
+
+ def read_dataset_name
+ settings.read_dataset_name
+ end
+
  def try_cast_value(field, value)
  # cast to time when querying on _mojaco_updated_at
  return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
@@ -24,6 +24,28 @@ module Dataflow
  def regex_case_insensitive_op
  '~*'
  end
+
+ def dump(base_folder:)
+ archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.dump"
+ options = "--table=public.#{@settings.read_dataset_name}"
+ options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+ options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+ options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+ password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+ `mkdir -p #{base_folder}`
+ `#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
+ archive_path
+ end
+
+ def restore(filepath:)
+ options = "--table=#{@settings.read_dataset_name}"
+ options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+ options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+ options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+ password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+ p "#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}"
+ `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
+ end
  end
  end
  end
@@ -2,16 +2,23 @@
  module Dataflow
  module Adapters
  class Settings
- attr_accessor :connection_uri, :db_name, :indexes, :adapter_type,
- :dataset_name, :read_dataset_name, :write_dataset_name, :schema
+ attr_accessor :connection_uri, :db_name,
+ :db_host, :db_port, :db_user, :db_password,
+ :dataset_name, :read_dataset_name, :write_dataset_name,
+ :indexes, :adapter_type, :schema

  def initialize(data_node: nil, connection_uri: nil, db_name: nil,
+ db_host: nil, db_port: nil, db_user: nil, db_password: nil,
  dataset_name: nil, indexes: nil, adapter_type: nil, schema: nil)
  @connection_uri = connection_uri

  # first try to set the options based on the data node settings
  if data_node.present?
  @db_name = data_node.db_name
+ @db_host = data_node.db_host
+ @db_port = data_node.db_port
+ @db_user = data_node.db_user
+ @db_password = data_node.db_password
  @dataset_name = data_node.name
  @read_dataset_name = data_node.read_dataset_name
  @write_dataset_name = data_node.write_dataset_name
@@ -21,6 +28,10 @@ module Dataflow

  # override if needed
  @db_name ||= db_name
+ @db_host ||= db_host
+ @db_port ||= db_port
+ @db_user ||= db_user
+ @db_password ||= db_password
  @dataset_name ||= dataset_name
  @read_dataset_name ||= dataset_name
  @write_dataset_name ||= dataset_name
@@ -28,6 +39,62 @@ module Dataflow
  @adapter_type ||= adapter_type
  @schema ||= schema
  end
+
+ def set_mongodb_defaults_if_needed!
+ @db_host ||= ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+ @db_port ||= ENV['MOJACO_MONGO_PORT'] || '27017'
+ @db_user ||= ENV['MOJACO_MONGO_USER']
+ @db_password ||= ENV['MOJACO_MONGO_USER']
+ end
+
+ def set_postgresql_defaults_if_needed!
+ @db_host ||= ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
+ @db_port ||= ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
+ @db_user ||= ENV['MOJACO_POSTGRESQL_USER']
+ @db_password ||= ENV['MOJACO_POSTGRESQL_PASSWORD']
+ end
+
+ def set_mysql_defaults_if_needed!
+ @db_host ||= ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
+ @db_port ||= ENV['MOJACO_MYSQL_PORT'] || '3306'
+ @db_user ||= ENV['MOJACO_MYSQL_USER']
+ @db_password ||= ENV['MOJACO_MYSQL_PASSWORD']
+ end
+
+ def connection_uri_or_default
+ return @connection_uri if @connection_uri.present?
+
+ send("#{@adapter_type}_default_connection_uri")
+ end
+
+ def mongodb_default_connection_uri
+ set_mongodb_defaults_if_needed!
+
+ # if user/password are empty, the user_password will be empty as well
+ user_password = @db_user
+ user_password += ":#{@db_password}" if @db_password.present?
+ user_password += '@' if user_password.present?
+
+ # [username:password@]host1[:port1]
+ "#{user_password}#{@db_host}:#{@db_port}"
+ end
+
+ def mysql_default_connection_uri
+ set_mysql_defaults_if_needed!
+ sql_default_connection_uri('mysql2')
+ end
+
+ def postgresql_default_connection_uri
+ set_postgresql_defaults_if_needed!
+ sql_default_connection_uri('postgresql')
+ end
+
+ def sql_default_connection_uri(scheme)
+ user_password = @db_user
+ user_password += ":#{@db_password}" if @db_password.present?
+
+ "#{scheme}://#{user_password}@#{@db_host}:#{@db_port}"
+ end
  end
  end
  end
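
How the new defaults resolve, in a short sketch (values assume no `MOJACO_*` environment variables are set; an explicit `connection_uri` always wins):

```ruby
pg = Dataflow::Adapters::Settings.new(db_name: 'dataflow_test', adapter_type: 'postgresql')
pg.connection_uri_or_default
# => "postgresql://@127.0.0.1:5432"   scheme://[user[:password]]@host:port; the db name is appended by the adapter

mongo = Dataflow::Adapters::Settings.new(db_name: 'dataflow_test', adapter_type: 'mongodb')
mongo.connection_uri_or_default
# => "127.0.0.1:27017"                [username:password@]host:port

explicit = Dataflow::Adapters::Settings.new(connection_uri: 'localhost:27017', adapter_type: 'mongodb')
explicit.connection_uri_or_default
# => "localhost:27017"
```
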
@@ -9,37 +9,17 @@ module Dataflow
  # @param settings [Hash] Represents the connection settings to the DB.
  # @param db_name [String] The database name to which the client will connect.
  # @return [Sequel::Database] a sequel database object.
- def client(settings, db_name: nil)
+ def client(settings)
  @clients ||= {}
-
- case settings.adapter_type
- when 'mysql2'
- host = ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
- port = ENV['MOJACO_MYSQL_PORT'] || '3306'
- user = ENV['MOJACO_MYSQL_USER']
- password = ENV['MOJACO_MYSQL_PASSWORD']
- when 'postgresql'
- host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
- port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
- user = ENV['MOJACO_POSTGRESQL_USER']
- password = ENV['MOJACO_POSTGRESQL_PASSWORD']
- end
-
- db_name ||= settings.db_name
- user_password = user
- user_password += ":#{password}" if password.present?
-
- uri = "#{settings.adapter_type}://#{user_password}@#{host}:#{port}"
- connection_uri = settings.connection_uri || "#{uri}/#{db_name}"
-
+ connection_uri = settings.connection_uri_or_default
  return @clients[connection_uri] if @clients[connection_uri].present?

  # first, make sure the DB is created (if it is not an external db)
  is_external_db = settings.connection_uri.present?
- try_create_db(uri, db_name, user, password) unless is_external_db
+ try_create_db(connection_uri, settings.db_name) unless is_external_db

  # then, create the connection object
- db = Sequel.connect("#{connection_uri}?encoding=utf8")
+ db = Sequel.connect("#{connection_uri}/#{settings.db_name}?encoding=utf8")
  add_extensions(settings, db)
  @clients[connection_uri] = db
  end
@@ -48,8 +28,8 @@ module Dataflow
  # @param uri [String] the connection uri to the DB.
  # @param db_name [String] the database name.
  # @return [Boolean] whether the db was created or not.
- def try_create_db(uri, db_name, user, password)
- Sequel.connect(uri, user: user, password: password) do |db|
+ def try_create_db(uri, db_name)
+ Sequel.connect(uri) do |db|
  db.run("CREATE DATABASE #{db_name}")
  true
  end
@@ -108,7 +88,7 @@ module Dataflow

  (sort || {}).each do |k, v|
  sort_value = v == 1 ? k.to_sym : Sequel.desc(k.to_sym)
- res = res.order(sort_value)
+ res = res.order_append(sort_value)
  end

  res = res.offset(offset) if offset > 0
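
This one-word change is the "support multiple ORDER BY clauses" fix: Sequel's `order` replaces any previously set ordering, so only the last key of a multi-key sort hash survived, while `order_append` accumulates them. A sketch of the difference (table and column names are hypothetical, exact SQL quoting varies by adapter):

```ruby
ds = DB[:users]   # DB is a Sequel database handle

ds.order(:last_name).order(:first_name).sql
# ORDER BY first_name            -- the second call replaced the first

ds.order(:last_name).order_append(:first_name).sql
# ORDER BY last_name, first_name -- what a sort hash with two keys now produces
```
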
@@ -127,8 +107,8 @@ module Dataflow
  end

  # Create queries that permit processing the whole dataset in parallel without using offsets.
- def ordered_system_id_queries(batch_size:)
- ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
+ def ordered_system_id_queries(batch_size:, where: {})
+ ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
  queries_count = (ids.size / batch_size.to_f).ceil
  Array.new(queries_count) do |i|
  from = ids[i * batch_size]
@@ -195,7 +175,7 @@ module Dataflow
  def recreate_dataset(dataset: nil)
  dataset ||= settings.write_dataset_name.to_sym
  drop_dataset(dataset)
- create_table(dataset, @schema)
+ create_table(dataset, @schema, logger)
  end

  # drops the given dataset
@@ -248,12 +228,40 @@ module Dataflow
  table_usage.merge(effective_indexes: indexes)
  end

+ def transform_to_query(opts)
+ # map to a serie of AND clauses queries
+ opts.flat_map do |k, v|
+ if v.is_a? Hash
+ v.map do |operator, value|
+ case operator
+ when '!='
+ if value.is_a? Array
+ Sequel.lit("#{k} NOT IN ?", value)
+ else
+ Sequel.lit("#{k} <> ?", value)
+ end
+ when '<', '<=', '>', '>='
+ Sequel.lit("#{k} #{operator} ?", value)
+ when '~'
+ Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
+ when '~*'
+ Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
+ end
+ end
+ else
+ # e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
+ # are supported with simples hashes
+ [[{ k.to_sym => v }]]
+ end
+ end
+ end
+
  private

  MAX_INT = 2_147_483_647
  MAX_VARCHAR = 255

- def create_table(dataset, schema)
+ def create_table(dataset, schema, logger)
  client.create_table(dataset.to_sym) do
  # always add an _id field to be used internally
  primary_key SYSTEM_ID
@@ -309,34 +317,6 @@ module Dataflow
  res
  end

- def transform_to_query(opts)
- # map to a serie of AND clauses queries
- opts.flat_map do |k, v|
- if v.is_a? Hash
- v.map do |operator, value|
- case operator
- when '!='
- if value.is_a? Array
- Sequel.lit("#{k} NOT IN ?", value)
- else
- Sequel.lit("#{k} <> ?", value)
- end
- when '<', '<=', '>', '>='
- Sequel.lit("#{k} #{operator} ?", value)
- when '~'
- Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
- when '~*'
- Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
- end
- end
- else
- # e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
- # are supported with simples hashes
- [[{ k.to_sym => v }]]
- end
- end
- end
-
  # Required index format for sequel:
  # :keys, unique: true
  def index_parameters(index)
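
With `transform_to_query` moved above `private` (the "Make the ops transformation public" commit), callers can convert the hash-based `where` syntax into Sequel conditions themselves. A sketch of the mapping (the adapter lookup and field names are hypothetical):

```ruby
adapter = node.send(:db_adapter)   # a Dataflow::Adapters::SqlAdapter

adapter.transform_to_query('age' => { '>=' => 18, '!=' => 99 })
# => [Sequel.lit("age >= ?", 18), Sequel.lit("age <> ?", 99)]

adapter.transform_to_query('id' => [1, 2, 3])
# => [[{ id: [1, 2, 3] }]]   # simple matches and IN clauses stay as hash conditions
```
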
@@ -14,10 +14,6 @@ module Dataflow
  Dataflow::Nodes::ComputeNode.find(id)
  end

- def updated?
- true
- end
-
  def recompute(*args)
  # Interface only, for recursion purposes
  end
@@ -156,6 +156,22 @@ module Dataflow
  true
  end

+ # Logs out the dependencies tree update time and whether
+ # it should or not be updated. Useful to understand
+ # why a given nodes had to be recomputed.
+ def explain_update(depth: 0, verbose: false)
+ if depth == 0 || !updated? || verbose
+ logger.log("#{'>' * (depth + 1)} #{name} [COMPUTE] | #{updated? ? 'UPDATED' : 'OLD'} = #{updated_at}")
+ end
+
+ return if updated? && !verbose
+
+ dependencies.each do |dependency|
+ dependency.explain_update(depth: depth + 1, verbose: verbose)
+ end
+ true
+ end
+
  # Keep a uniform interface with a DataNode.
  def updated_at
  last_compute_starting_time
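
`explain_update` only logs; based on the format strings added here and on the `DataNode` counterpart later in this diff, the output has roughly this shape (node names and timestamps are illustrative):

```ruby
report = Dataflow::Nodes::ComputeNode.find_by(name: 'users_report')   # hypothetical node
report.explain_update(verbose: true)
# > users_report [COMPUTE] | OLD = 2017-05-01 10:00:00 UTC
# >> users [Dataset] | UPDATED = 2017-05-20 09:12:43 UTC
```
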
@@ -183,11 +199,11 @@ module Dataflow
  # even if the node is already up to date.
  def recompute(depth: 0, force_recompute: false)
  send_heartbeat
- logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
+ logger.log("#{'>' * (depth + 1)} #{name} started recomputing...")
  start_time = Time.now

  parallel_each(dependencies) do |dependency|
- logger.log "#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}..."
+ logger.log("#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}...")
  if !dependency.updated? || force_recompute
  dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
  end
@@ -196,11 +212,11 @@ module Dataflow

  # Dependencies data may have changed in a child process.
  # Reload to make sure we have the latest metadata.
- logger.log "#{'>' * (depth + 1)} #{name} reloading dependencies..."
+ logger.log("#{'>' * (depth + 1)} #{name} reloading dependencies...")
  dependencies(reload: true)

  compute(depth: depth, force_compute: force_recompute)
- logger.log "#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute."
+ logger.log("#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute.")

  true
  end
@@ -216,13 +232,13 @@ module Dataflow
  validate!

  if updated? && !force_compute
- logger.log "#{'>' * (depth + 1)} #{name} is up-to-date."
+ logger.log("#{'>' * (depth + 1)} #{name} is up-to-date.")
  return
  end

  has_compute_lock = acquire_computing_lock!
  if has_compute_lock
- logger.log "#{'>' * (depth + 1)} #{name} started computing."
+ logger.log("#{'>' * (depth + 1)} #{name} started computing.")
  on_computing_started
  start_time = Time.now

@@ -254,15 +270,15 @@ module Dataflow
  data_node&.swap_read_write_datasets!
  end

- self.last_compute_starting_time = start_time
- save
+ set_last_compute_starting_time(start_time)
  duration = Time.now - start_time
- logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
+ logger.log("#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute.")
  on_computing_finished(state: 'computed')
+ true
  else
- logger.log "#{'>' * (depth + 1)} [IS AWAITING] #{name}."
+ logger.log("#{'>' * (depth + 1)} [IS AWAITING] #{name}.")
  await_computing!
- logger.log "#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}."
+ logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
  end

  rescue StandardError => e
@@ -412,6 +428,17 @@ module Dataflow
  .find_one_and_update(update_query)
  end

+ def set_last_compute_starting_time(time)
+ # this is just to avoid the reload.
+ # But this change will not be propagated across processes
+ self.last_compute_starting_time = time
+ # update directly on the DB
+ update_query = { '$set' => { last_compute_starting_time: time } }
+ Dataflow::Nodes::ComputeNode.where(_id: _id)
+ .find_one_and_update(update_query)
+
+ end
+
  ##############################
  # Dependency validations
  ##############################
@@ -505,24 +532,21 @@ module Dataflow
  Mongoid.disconnect_clients

  # set to true to debug code in the iteration
- is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
- if is_debugging_impl
- itr.each do |*args|
- yield(*args)
- end
- else
- opts = if max_parallel_processes > 0
- { in_processes: max_parallel_processes }
- else
- {}
- end
-
- Parallel.each(itr, opts) do |*args|
- yield(*args)
- Dataflow::Adapters::SqlAdapter.disconnect_clients
- Dataflow::Adapters::MongoDbAdapter.disconnect_clients
- Mongoid.disconnect_clients
- end
+ is_debugging_impl = ENV['DEBUG_DATAFLOW']
+ opts = if is_debugging_impl
+ # this will turn of the parallel processing
+ { in_processes: 0 }
+ elsif max_parallel_processes > 0
+ { in_processes: max_parallel_processes }
+ else
+ {}
+ end
+
+ Parallel.each(itr, opts) do |*args|
+ yield(*args)
+ Dataflow::Adapters::SqlAdapter.disconnect_clients
+ Dataflow::Adapters::MongoDbAdapter.disconnect_clients
+ Mongoid.disconnect_clients
  end
  end

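
The rewritten `parallel_each` now turns parallelism off whenever `DEBUG_DATAFLOW` is set (not only under `RACK_ENV=test`), by handing `in_processes: 0` to the Parallel gem, which runs the block in the calling process so breakpoints and debuggers work. For example (the script name is hypothetical):

```ruby
# DEBUG_DATAFLOW=1 bundle exec ruby recompute.rb
#
# in_processes: 0 is Parallel's escape hatch for sequential, same-process execution:
require 'parallel'

Parallel.each([1, 2, 3], in_processes: 0) do |i|
  puts i   # runs sequentially in the current process, so byebug/pry sessions behave normally
end
```
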
@@ -24,11 +24,19 @@ module Dataflow
  # make sure we have only one node per db/table combination
  index({ db_name: 1, name: 1 }, unique: true)

+ # The dataset name used by this node for storage.
+ field :name, type: String, editable: false
+
  # The database name used by this node
  field :db_name, type: String, editable: false
-
- # The dataset name used by this node for storage.
- field :name, type: String
+ # The database host (used the ENV settings by default)
+ field :db_host, type: String, editable: false
+ # The database port (used the ENV settings by default)
+ field :db_port, type: String, editable: false
+ # The database user (used the ENV settings by default)
+ field :db_user, type: String, editable: false
+ # The database password (used the ENV settings by default)
+ field :db_password, type: String, editable: false

  # The schema of this node
  field :schema, type: Hash, editable: false
@@ -163,8 +171,8 @@ module Dataflow
  # Parallel.each(queries) do |query|
  # process(node.all(where: query))
  # end
- def ordered_system_id_queries(batch_size:)
- db_adapter.ordered_system_id_queries(batch_size: batch_size)
+ def ordered_system_id_queries(batch_size:, where: {})
+ db_adapter.ordered_system_id_queries(batch_size: batch_size, where: {})
  end

  # Counts how many records matches the condition or all if no condition is given.
@@ -297,6 +305,13 @@ module Dataflow
  (db_backend.to_s =~ /sql/).present?
  end

+ def updated?
+ true
+ end
+
+ def explain_update(depth: 0, verbose: false)
+ logger.log("#{'>' * (depth + 1)} #{name} [Dataset] | UPDATED = #{updated_at}")
+ end

  def required_by
  super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
@@ -323,6 +338,18 @@ module Dataflow
  db_adapter.drop_dataset(read_dataset_name)
  end

+ # Dump a backup of this dataset to a file.
+ # @return [String] the filepath to the dump file.
+ def dump_dataset(base_folder: './dump')
+ db_adapter.dump(base_folder: base_folder)
+ end
+
+ # Restore a dump of this dataset
+ # @param files [String] the filepath to the dump file.
+ def restore_dataset(filepath:)
+ db_adapter.restore(filepath: filepath)
+ end
+
  private

  def db_adapter(connection_opts = {})
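
The new public wrappers delegate to the adapter `dump`/`restore` methods shown earlier (mongodump/mongorestore for Mongo-backed nodes, pg_dump/pg_restore for PostgreSQL ones), so the matching CLI tools must be on the PATH. Usage sketch (node name hypothetical):

```ruby
node = Dataflow::Nodes::DataNode.find_by(name: 'users')

# Writes <base_folder>/<db_name>.<dataset_name>.gz (Mongo) or .dump (PostgreSQL)
# and returns the archive path.
path = node.dump_dataset(base_folder: '/var/backups/dataflow')

# Later, or on another machine with the same node definition:
node.restore_dataset(filepath: path)
```
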
@@ -342,9 +369,9 @@ module Dataflow
  @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
  return @csv_adapter
  when 'mysql'
- opts[:adapter_type] = 'mysql2'
+ opts[:adapter_type] = 'mysql'
  return Adapters::SqlAdapter.new(opts) if has_options
- @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql2')
+ @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql')
  return @mysql_adapter
  when 'postgresql'
  opts[:adapter_type] = 'postgresql'
@@ -0,0 +1,62 @@
+ # frozen_string_literal: true
+ module Dataflow
+ module Nodes
+ # Only supports read operations
+ class ReadOnlyDataNode < DataNode
+
+ def set_defaults
+ super
+ self.use_double_buffering = false
+ end
+
+
+ def handle_dataset_settings_changed
+ # ignore - do not do anyhing
+ end
+
+ def add(*args)
+ raise_read_only_error!
+ end
+
+ def clear(*args)
+ raise_read_only_error!
+ end
+
+ def recreate_dataset(*args)
+ raise_read_only_error!
+ end
+
+ def create_unique_indexes(*args)
+ raise_read_only_error!
+ end
+
+ def create_non_unique_indexes(*args)
+ raise_read_only_error!
+ end
+
+ def read_dataset_name=(*args)
+ raise_read_only_error!
+ end
+
+ def swap_read_write_datasets!
+ raise_read_only_error!
+ end
+
+ def import(*args)
+ raise_read_only_error!
+ end
+
+
+ def drop_dataset!
+ raise_read_only_error!
+ end
+
+ private
+
+ def raise_read_only_error!
+ raise NotImplementedError, 'External data nodes are read only'
+ end
+
+ end # class ExternalDataNode
+ end # module Nodes
+ end # module Dataflow
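
`ReadOnlyDataNode` keeps `DataNode`'s read interface but raises on anything that would write, which makes it a way to expose a table owned by another system to the dataflow graph. A sketch using the per-node connection fields added in this release (all values are hypothetical):

```ruby
external = Dataflow::Nodes::ReadOnlyDataNode.create(
  db_name: 'legacy_warehouse',
  name: 'customers',        # existing table/collection to read from
  db_backend: :postgresql,
  db_host: 'db.internal',
  db_port: '5432'
)

external.all(where: { 'country' => 'JP' })  # reads behave like any DataNode
external.recreate_dataset                   # => raises NotImplementedError (read only)
```
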
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+ module Dataflow
+ # Interface for a node that behaves as a dataset.
+ # Does not support any operation.
+ # Inherit and override to implement custom behavior.
+ module Nodes
+ class RuntimeQueryNode < DataNode
+
+ after_initialize do
+ self.db_backend = :none
+ end
+
+ def handle_dataset_settings_changed
+ # dot not do anything, there is no real dataset
+ end
+
+ def all(*_args)
+ raise NotImplementedError, 'this node does not support #all'
+ end
+
+ def count(*_args)
+ raise NotImplementedError, 'this node does not support #count'
+ end
+
+ def find(*_args)
+ raise NotImplementedError, 'this node does not support #find'
+ end
+
+ def all_paginated(*_args)
+ raise NotImplementedError, 'this node does not support #all_paginated'
+ end
+
+ def add(*_args)
+ raise NotImplementedError, 'this node does not support #add'
+ end
+
+ def clear(*_args)
+ raise NotImplementedError, 'this node does not support #clear'
+ end
+ end
+ end
+ end
@@ -35,7 +35,9 @@ module Dataflow
  end

  def execute_query
- data_node.send(:db_adapter).client[computed_query].to_a
+ query = computed_query
+ logger.log(query)
+ data_node.send(:db_adapter).client[query].to_a
  end

  private
@@ -60,7 +60,7 @@ module Dataflow
  equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
  count_per_process = [max_per_process, equal_split_per_process].min

- queries = ordered_system_id_queries(batch_size: count_per_process)
+ queries = ordered_system_id_queries(batch_size: count_per_process, where: where)

  sch = schema_inferrer.infer_schema(batch_count: queries.count, extended: extended) do |idx|
  all(where: queries[idx].merge(where))
@@ -1,4 +1,4 @@
  # frozen_string_literal: true
  module Dataflow
- VERSION = '0.12.1'
+ VERSION = '0.13.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: dataflow-rb
  version: !ruby/object:Gem::Version
- version: 0.12.1
+ version: 0.13.0
  platform: ruby
  authors:
  - Eurico Doirado
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2017-05-13 00:00:00.000000000 Z
+ date: 2017-05-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -336,6 +336,8 @@ files:
  - lib/dataflow/nodes/merge_node.rb
  - lib/dataflow/nodes/mixin/add_internal_timestamp.rb
  - lib/dataflow/nodes/mixin/rename_dotted_fields.rb
+ - lib/dataflow/nodes/read_only_data_node.rb
+ - lib/dataflow/nodes/runtime_query_node.rb
  - lib/dataflow/nodes/select_keys_node.rb
  - lib/dataflow/nodes/snapshot_node.rb
  - lib/dataflow/nodes/sql_query_node.rb