dataflow-rb 0.12.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/lib/dataflow-rb.rb +2 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +32 -11
- data/lib/dataflow/adapters/psql_adapter.rb +22 -0
- data/lib/dataflow/adapters/settings.rb +69 -2
- data/lib/dataflow/adapters/sql_adapter.rb +39 -59
- data/lib/dataflow/node.rb +0 -4
- data/lib/dataflow/nodes/compute_node.rb +53 -29
- data/lib/dataflow/nodes/data_node.rb +34 -7
- data/lib/dataflow/nodes/read_only_data_node.rb +62 -0
- data/lib/dataflow/nodes/runtime_query_node.rb +42 -0
- data/lib/dataflow/nodes/sql_query_node.rb +3 -1
- data/lib/dataflow/schema_mixin.rb +1 -1
- data/lib/dataflow/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 72840e2477fe869fb06b0299c96d5ae2a57c7713
+  data.tar.gz: f9f03314f23473585a9e742740c0e809f2d99bc7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 43f7cef4b2150017871cb7b3c0f21602a01f385e40d07ddf7000f455a4adc007669974fd4e7170e4acc3807feae907f6114e3b3cbfbdbbf36f96348c3a06f60c
+  data.tar.gz: d16411f178fa8ccc00cc9dbaefd0905040a5f8354874f7b00ad5080681d912fa8051dd16e3975d25aed8c737fdf70594f3bb77a948e95665d92a198f0e65206c
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,23 @@
 # Changelog

+
+#### 0.13.0
+- [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
+- [a17f071] Add runtime query node. Make the ops transformation public.
+- [8c78aa2] Added support for a per-node backup/restore
+- [6069ec0] Moved the db settings to the settings class
+- [b5a77fc] Set the last update time using a query directly on the DB. Do not return unneeded information from the recompute/explain method
+- [cc77366] Explain why a node needs an update
+- [e87ba14] Add logging to the sql query node
+- [5d82dfc] Fix logging during the sql table creation.
+- [7390264] Add a read-only data node
+- [dbb14ed] Refactor the debugging implementation
+- [38925a3] Added parameters on the data node to flexibly connec to any database
+- [7aac1eb] Add support for partial (where clause) parallel queries generation.
+
+#### 0.12.1
+- [110ded7] Fix compute node not processing in parallel
+
 #### 0.12.0
 - [4a510df] Add support for case insentive regex matching on mysql
 - [63b0771] Add logging to understand the current computation batch progress
data/lib/dataflow-rb.rb
CHANGED
@@ -36,6 +36,8 @@ require 'dataflow/nodes/compute_node'
 require 'dataflow/nodes/join_node'
 require 'dataflow/nodes/map_node'
 require 'dataflow/nodes/merge_node'
+require 'dataflow/nodes/read_only_data_node'
+require 'dataflow/nodes/runtime_query_node'
 require 'dataflow/nodes/select_keys_node'
 require 'dataflow/nodes/snapshot_node'
 require 'dataflow/nodes/sql_query_node'
data/lib/dataflow/adapters/mongo_db_adapter.rb
CHANGED
@@ -9,9 +9,9 @@ module Dataflow
 class << self
 def client(settings, db_name: nil)
 @clients ||= {}
-
-
-connection_uri = settings.
+
+settings.adapter_type = 'mongodb'
+connection_uri = settings.connection_uri_or_default
 db_name ||= settings.db_name
 @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
 end
@@ -113,8 +113,8 @@ module Dataflow
 end

 # Create queries that permit processing the whole dataset in parallel without using offsets.
-def ordered_system_id_queries(batch_size:)
-ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+def ordered_system_id_queries(batch_size:, where: {})
+ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
 queries_count = (ids.size / batch_size.to_f).ceil
 Array.new(queries_count) do |i|
 from = ids[i * batch_size]
@@ -225,14 +225,25 @@ module Dataflow
 }
 end

-
-
-
-settings.
+def dump(base_folder:)
+archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.gz"
+options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+options += "--password=#{@settings.db_password}" if @settings.db_password.present?
+`mkdir -p #{base_folder}`
+`mongodump #{options} --gzip`
+archive_path
 end

-def
-settings.read_dataset_name
+def restore(filepath:)
+options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+options += "--password=#{@settings.db_password}" if @settings.db_password.present?
+`mongorestore #{options} --gzip`
 end

 def transform_to_query(opts)
@@ -275,6 +286,16 @@ module Dataflow
 sanitized_opts
 end

+private
+
+def write_dataset_name
+settings.write_dataset_name
+end
+
+def read_dataset_name
+settings.read_dataset_name
+end
+
 def try_cast_value(field, value)
 # cast to time when querying on _mojaco_updated_at
 return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
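
The where: parameter added above backs the changelog entry [7aac1eb] (partial parallel query generation). A minimal usage sketch, not part of the diff, assuming a configured dataflow-rb data node; the node name and filter are hypothetical:

# Hypothetical node and filter: build _id-range queries covering only the
# matching records, then process the batches in parallel.
node = Dataflow::Nodes::DataNode.find_by(name: 'users')
queries = node.ordered_system_id_queries(batch_size: 1_000, where: { 'active' => true })

Parallel.each(queries) do |query|
  # Each query bounds _id, so batches do not overlap and no OFFSET pagination is needed.
  batch = node.all(where: query.merge('active' => true))
  # ... process the batch ...
end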
data/lib/dataflow/adapters/psql_adapter.rb
CHANGED
@@ -24,6 +24,28 @@ module Dataflow
 def regex_case_insensitive_op
 '~*'
 end
+
+def dump(base_folder:)
+archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.dump"
+options = "--table=public.#{@settings.read_dataset_name}"
+options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+`mkdir -p #{base_folder}`
+`#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
+archive_path
+end
+
+def restore(filepath:)
+options = "--table=#{@settings.read_dataset_name}"
+options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+p "#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}"
+`#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
+end
 end
 end
 end
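
These shell-outs back the changelog entry [8c78aa2] (per-node backup/restore). In practice they are reached through the DataNode API added later in this diff; a hedged sketch, with a hypothetical node name and folder:

# Hypothetical postgresql-backed node: dump_dataset shells out to pg_dump
# (mongodump for mongodb-backed nodes) and returns the archive path.
node = Dataflow::Nodes::DataNode.find_by(name: 'events')
archive = node.dump_dataset(base_folder: './backups')

# Restoring feeds the same archive back through pg_restore/mongorestore.
node.restore_dataset(filepath: archive)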
data/lib/dataflow/adapters/settings.rb
CHANGED
@@ -2,16 +2,23 @@
 module Dataflow
 module Adapters
 class Settings
-attr_accessor :connection_uri, :db_name,
-:
+attr_accessor :connection_uri, :db_name,
+:db_host, :db_port, :db_user, :db_password,
+:dataset_name, :read_dataset_name, :write_dataset_name,
+:indexes, :adapter_type, :schema

 def initialize(data_node: nil, connection_uri: nil, db_name: nil,
+db_host: nil, db_port: nil, db_user: nil, db_password: nil,
 dataset_name: nil, indexes: nil, adapter_type: nil, schema: nil)
 @connection_uri = connection_uri

 # first try to set the options based on the data node settings
 if data_node.present?
 @db_name = data_node.db_name
+@db_host = data_node.db_host
+@db_port = data_node.db_port
+@db_user = data_node.db_user
+@db_password = data_node.db_password
 @dataset_name = data_node.name
 @read_dataset_name = data_node.read_dataset_name
 @write_dataset_name = data_node.write_dataset_name
@@ -21,6 +28,10 @@ module Dataflow

 # override if needed
 @db_name ||= db_name
+@db_host ||= db_host
+@db_port ||= db_port
+@db_user ||= db_user
+@db_password ||= db_password
 @dataset_name ||= dataset_name
 @read_dataset_name ||= dataset_name
 @write_dataset_name ||= dataset_name
@@ -28,6 +39,62 @@ module Dataflow
 @adapter_type ||= adapter_type
 @schema ||= schema
 end
+
+def set_mongodb_defaults_if_needed!
+@db_host ||= ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+@db_port ||= ENV['MOJACO_MONGO_PORT'] || '27017'
+@db_user ||= ENV['MOJACO_MONGO_USER']
+@db_password ||= ENV['MOJACO_MONGO_USER']
+end
+
+def set_postgresql_defaults_if_needed!
+@db_host ||= ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
+@db_port ||= ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
+@db_user ||= ENV['MOJACO_POSTGRESQL_USER']
+@db_password ||= ENV['MOJACO_POSTGRESQL_PASSWORD']
+end
+
+def set_mysql_defaults_if_needed!
+@db_host ||= ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
+@db_port ||= ENV['MOJACO_MYSQL_PORT'] || '3306'
+@db_user ||= ENV['MOJACO_MYSQL_USER']
+@db_password ||= ENV['MOJACO_MYSQL_PASSWORD']
+end
+
+def connection_uri_or_default
+return @connection_uri if @connection_uri.present?
+
+send("#{@adapter_type}_default_connection_uri")
+end
+
+def mongodb_default_connection_uri
+set_mongodb_defaults_if_needed!
+
+# if user/password are empty, the user_password will be empty as well
+user_password = @db_user
+user_password += ":#{@db_password}" if @db_password.present?
+user_password += '@' if user_password.present?
+
+# [username:password@]host1[:port1]
+"#{user_password}#{@db_host}:#{@db_port}"
+end
+
+def mysql_default_connection_uri
+set_mysql_defaults_if_needed!
+sql_default_connection_uri('mysql2')
+end
+
+def postgresql_default_connection_uri
+set_postgresql_defaults_if_needed!
+sql_default_connection_uri('postgresql')
+end
+
+def sql_default_connection_uri(scheme)
+user_password = @db_user
+user_password += ":#{@db_password}" if @db_password.present?
+
+"#{scheme}://#{user_password}@#{@db_host}:#{@db_port}"
+end
 end
 end
 end
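
A short sketch (outside the diff) of how the new defaults resolve: when no explicit connection URI or host/port/user/password are given, the MOJACO_* environment variables and the usual default ports are used. The values below are illustrative.

# With no MOJACO_POSTGRESQL_* variables set, this falls back to 127.0.0.1:5432.
settings = Dataflow::Adapters::Settings.new(
  db_name: 'dataflow',
  dataset_name: 'users',
  adapter_type: 'postgresql'
)
settings.connection_uri_or_default  # => e.g. "postgresql://@127.0.0.1:5432"

# An explicit URI always wins over the per-field defaults.
settings.connection_uri = 'postgresql://user:pass@db.internal:5432'
settings.connection_uri_or_default  # => "postgresql://user:pass@db.internal:5432"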
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -9,37 +9,17 @@ module Dataflow
 # @param settings [Hash] Represents the connection settings to the DB.
 # @param db_name [String] The database name to which the client will connect.
 # @return [Sequel::Database] a sequel database object.
-def client(settings
+def client(settings)
 @clients ||= {}
-
-case settings.adapter_type
-when 'mysql2'
-host = ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
-port = ENV['MOJACO_MYSQL_PORT'] || '3306'
-user = ENV['MOJACO_MYSQL_USER']
-password = ENV['MOJACO_MYSQL_PASSWORD']
-when 'postgresql'
-host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
-port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
-user = ENV['MOJACO_POSTGRESQL_USER']
-password = ENV['MOJACO_POSTGRESQL_PASSWORD']
-end
-
-db_name ||= settings.db_name
-user_password = user
-user_password += ":#{password}" if password.present?
-
-uri = "#{settings.adapter_type}://#{user_password}@#{host}:#{port}"
-connection_uri = settings.connection_uri || "#{uri}/#{db_name}"
-
+connection_uri = settings.connection_uri_or_default
 return @clients[connection_uri] if @clients[connection_uri].present?

 # first, make sure the DB is created (if it is not an external db)
 is_external_db = settings.connection_uri.present?
-try_create_db(
+try_create_db(connection_uri, settings.db_name) unless is_external_db

 # then, create the connection object
-db = Sequel.connect("#{connection_uri}?encoding=utf8")
+db = Sequel.connect("#{connection_uri}/#{settings.db_name}?encoding=utf8")
 add_extensions(settings, db)
 @clients[connection_uri] = db
 end
@@ -48,8 +28,8 @@ module Dataflow
 # @param uri [String] the connection uri to the DB.
 # @param db_name [String] the database name.
 # @return [Boolean] whether the db was created or not.
-def try_create_db(uri, db_name
-Sequel.connect(uri
+def try_create_db(uri, db_name)
+Sequel.connect(uri) do |db|
 db.run("CREATE DATABASE #{db_name}")
 true
 end
@@ -108,7 +88,7 @@ module Dataflow

 (sort || {}).each do |k, v|
 sort_value = v == 1 ? k.to_sym : Sequel.desc(k.to_sym)
-res = res.
+res = res.order_append(sort_value)
 end

 res = res.offset(offset) if offset > 0
@@ -127,8 +107,8 @@ module Dataflow
 end

 # Create queries that permit processing the whole dataset in parallel without using offsets.
-def ordered_system_id_queries(batch_size:)
-ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
+def ordered_system_id_queries(batch_size:, where: {})
+ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
 queries_count = (ids.size / batch_size.to_f).ceil
 Array.new(queries_count) do |i|
 from = ids[i * batch_size]
@@ -195,7 +175,7 @@ module Dataflow
 def recreate_dataset(dataset: nil)
 dataset ||= settings.write_dataset_name.to_sym
 drop_dataset(dataset)
-create_table(dataset, @schema)
+create_table(dataset, @schema, logger)
 end

 # drops the given dataset
@@ -248,12 +228,40 @@ module Dataflow
 table_usage.merge(effective_indexes: indexes)
 end

+def transform_to_query(opts)
+# map to a serie of AND clauses queries
+opts.flat_map do |k, v|
+if v.is_a? Hash
+v.map do |operator, value|
+case operator
+when '!='
+if value.is_a? Array
+Sequel.lit("#{k} NOT IN ?", value)
+else
+Sequel.lit("#{k} <> ?", value)
+end
+when '<', '<=', '>', '>='
+Sequel.lit("#{k} #{operator} ?", value)
+when '~'
+Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
+when '~*'
+Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
+end
+end
+else
+# e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
+# are supported with simples hashes
+[[{ k.to_sym => v }]]
+end
+end
+end
+
 private

 MAX_INT = 2_147_483_647
 MAX_VARCHAR = 255

-def create_table(dataset, schema)
+def create_table(dataset, schema, logger)
 client.create_table(dataset.to_sym) do
 # always add an _id field to be used internally
 primary_key SYSTEM_ID
@@ -309,34 +317,6 @@ module Dataflow
 res
 end

-def transform_to_query(opts)
-# map to a serie of AND clauses queries
-opts.flat_map do |k, v|
-if v.is_a? Hash
-v.map do |operator, value|
-case operator
-when '!='
-if value.is_a? Array
-Sequel.lit("#{k} NOT IN ?", value)
-else
-Sequel.lit("#{k} <> ?", value)
-end
-when '<', '<=', '>', '>='
-Sequel.lit("#{k} #{operator} ?", value)
-when '~'
-Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
-when '~*'
-Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
-end
-end
-else
-# e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
-# are supported with simples hashes
-[[{ k.to_sym => v }]]
-end
-end
-end
-
 # Required index format for sequel:
 # :keys, unique: true
 def index_parameters(index)
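
transform_to_query is now public (changelog entry [a17f071], "make the ops transformation public"). A rough sketch of the operator hash it accepts; the adapter variable is hypothetical and assumed to be a connected SqlAdapter:

# Plain values become equality / IN clauses; nested hashes select an operator,
# which the method maps to the Sequel literals shown above and ANDs together.
where = {
  'status' => 'active',          # status = 'active'
  'id'     => [1, 2, 3],         # id IN (1, 2, 3)
  'age'    => { '>=' => 21 },    # age >= 21
  'name'   => { '~*' => '^bob' } # case-insensitive regex (~* on postgres)
}
clauses = adapter.transform_to_query(where)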
data/lib/dataflow/node.rb
CHANGED
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -156,6 +156,22 @@ module Dataflow
 true
 end

+# Logs out the dependencies tree update time and whether
+# it should or not be updated. Useful to understand
+# why a given nodes had to be recomputed.
+def explain_update(depth: 0, verbose: false)
+if depth == 0 || !updated? || verbose
+logger.log("#{'>' * (depth + 1)} #{name} [COMPUTE] | #{updated? ? 'UPDATED' : 'OLD'} = #{updated_at}")
+end
+
+return if updated? && !verbose
+
+dependencies.each do |dependency|
+dependency.explain_update(depth: depth + 1, verbose: verbose)
+end
+true
+end
+
 # Keep a uniform interface with a DataNode.
 def updated_at
 last_compute_starting_time
@@ -183,11 +199,11 @@ module Dataflow
 # even if the node is already up to date.
 def recompute(depth: 0, force_recompute: false)
 send_heartbeat
-logger.log
+logger.log("#{'>' * (depth + 1)} #{name} started recomputing...")
 start_time = Time.now

 parallel_each(dependencies) do |dependency|
-logger.log
+logger.log("#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}...")
 if !dependency.updated? || force_recompute
 dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
 end
@@ -196,11 +212,11 @@ module Dataflow

 # Dependencies data may have changed in a child process.
 # Reload to make sure we have the latest metadata.
-logger.log
+logger.log("#{'>' * (depth + 1)} #{name} reloading dependencies...")
 dependencies(reload: true)

 compute(depth: depth, force_compute: force_recompute)
-logger.log
+logger.log("#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute.")

 true
 end
@@ -216,13 +232,13 @@ module Dataflow
 validate!

 if updated? && !force_compute
-logger.log
+logger.log("#{'>' * (depth + 1)} #{name} is up-to-date.")
 return
 end

 has_compute_lock = acquire_computing_lock!
 if has_compute_lock
-logger.log
+logger.log("#{'>' * (depth + 1)} #{name} started computing.")
 on_computing_started
 start_time = Time.now

@@ -254,15 +270,15 @@ module Dataflow
 data_node&.swap_read_write_datasets!
 end

-
-save
+set_last_compute_starting_time(start_time)
 duration = Time.now - start_time
-logger.log
+logger.log("#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute.")
 on_computing_finished(state: 'computed')
+true
 else
-logger.log
+logger.log("#{'>' * (depth + 1)} [IS AWAITING] #{name}.")
 await_computing!
-logger.log
+logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
 end

 rescue StandardError => e
@@ -412,6 +428,17 @@ module Dataflow
 .find_one_and_update(update_query)
 end

+def set_last_compute_starting_time(time)
+# this is just to avoid the reload.
+# But this change will not be propagated across processes
+self.last_compute_starting_time = time
+# update directly on the DB
+update_query = { '$set' => { last_compute_starting_time: time } }
+Dataflow::Nodes::ComputeNode.where(_id: _id)
+.find_one_and_update(update_query)
+
+end
+
 ##############################
 # Dependency validations
 ##############################
@@ -505,24 +532,21 @@ module Dataflow
 Mongoid.disconnect_clients

 # set to true to debug code in the iteration
-is_debugging_impl =
-if is_debugging_impl
-
-
-
-
-
-
-
-
-
-
-
-
-Dataflow::Adapters::MongoDbAdapter.disconnect_clients
-Mongoid.disconnect_clients
-end
+is_debugging_impl = ENV['DEBUG_DATAFLOW']
+opts = if is_debugging_impl
+# this will turn of the parallel processing
+{ in_processes: 0 }
+elsif max_parallel_processes > 0
+{ in_processes: max_parallel_processes }
+else
+{}
+end
+
+Parallel.each(itr, opts) do |*args|
+yield(*args)
+Dataflow::Adapters::SqlAdapter.disconnect_clients
+Dataflow::Adapters::MongoDbAdapter.disconnect_clients
+Mongoid.disconnect_clients
 end
 end

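
Two of the additions above in one hedged usage sketch (not from the package): explain_update logs, per dependency, whether the node is considered up to date and why, and setting DEBUG_DATAFLOW makes parallel_each run its batches in-process so they can be debugged. The node name is hypothetical.

# Walk the dependency tree and log UPDATED/OLD plus updated_at for each node,
# recursing into stale branches (or everywhere with verbose: true).
node = Dataflow::Nodes::ComputeNode.find_by(name: 'daily_report')
node.explain_update(verbose: true)

# Any non-empty value disables forking (in_processes: 0), so breakpoints
# inside the per-batch block are reachable during recompute.
ENV['DEBUG_DATAFLOW'] = '1'
node.recompute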
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -24,11 +24,19 @@
 # make sure we have only one node per db/table combination
 index({ db_name: 1, name: 1 }, unique: true)

+# The dataset name used by this node for storage.
+field :name, type: String, editable: false
+
 # The database name used by this node
 field :db_name, type: String, editable: false
-
-
-
+# The database host (used the ENV settings by default)
+field :db_host, type: String, editable: false
+# The database port (used the ENV settings by default)
+field :db_port, type: String, editable: false
+# The database user (used the ENV settings by default)
+field :db_user, type: String, editable: false
+# The database password (used the ENV settings by default)
+field :db_password, type: String, editable: false

 # The schema of this node
 field :schema, type: Hash, editable: false
@@ -163,8 +171,8 @@
 # Parallel.each(queries) do |query|
 # process(node.all(where: query))
 # end
-def ordered_system_id_queries(batch_size:)
-db_adapter.ordered_system_id_queries(batch_size: batch_size)
+def ordered_system_id_queries(batch_size:, where: {})
+db_adapter.ordered_system_id_queries(batch_size: batch_size, where: {})
 end

 # Counts how many records matches the condition or all if no condition is given.
@@ -297,6 +305,13 @@
 (db_backend.to_s =~ /sql/).present?
 end

+def updated?
+true
+end
+
+def explain_update(depth: 0, verbose: false)
+logger.log("#{'>' * (depth + 1)} #{name} [Dataset] | UPDATED = #{updated_at}")
+end

 def required_by
 super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
@@ -323,6 +338,18 @@
 db_adapter.drop_dataset(read_dataset_name)
 end

+# Dump a backup of this dataset to a file.
+# @return [String] the filepath to the dump file.
+def dump_dataset(base_folder: './dump')
+db_adapter.dump(base_folder: base_folder)
+end
+
+# Restore a dump of this dataset
+# @param files [String] the filepath to the dump file.
+def restore_dataset(filepath:)
+db_adapter.restore(filepath: filepath)
+end
+
 private

 def db_adapter(connection_opts = {})
@@ -342,9 +369,9 @@
 @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
 return @csv_adapter
 when 'mysql'
-opts[:adapter_type] = '
+opts[:adapter_type] = 'mysql'
 return Adapters::SqlAdapter.new(opts) if has_options
-@mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: '
+@mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql')
 return @mysql_adapter
 when 'postgresql'
 opts[:adapter_type] = 'postgresql'
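
A hedged sketch of the new per-node connection fields (changelog entry [38925a3]); previously host/port/user/password always came from the MOJACO_* environment variables. All values below are made up, and fields left nil still fall back to those defaults via Adapters::Settings.

# Point one specific dataset at a different database server.
node = Dataflow::Nodes::DataNode.create(
  name:        'legacy_orders',
  db_name:     'legacy',
  db_backend:  :postgresql,
  db_host:     'legacy-db.internal',
  db_port:     '5432',
  db_user:     'readonly',
  db_password: 'secret'
)
node.count  # queries legacy-db.internal instead of the default server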
data/lib/dataflow/nodes/read_only_data_node.rb
ADDED
@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+module Dataflow
+module Nodes
+# Only supports read operations
+class ReadOnlyDataNode < DataNode
+
+def set_defaults
+super
+self.use_double_buffering = false
+end
+
+
+def handle_dataset_settings_changed
+# ignore - do not do anyhing
+end
+
+def add(*args)
+raise_read_only_error!
+end
+
+def clear(*args)
+raise_read_only_error!
+end
+
+def recreate_dataset(*args)
+raise_read_only_error!
+end
+
+def create_unique_indexes(*args)
+raise_read_only_error!
+end
+
+def create_non_unique_indexes(*args)
+raise_read_only_error!
+end
+
+def read_dataset_name=(*args)
+raise_read_only_error!
+end
+
+def swap_read_write_datasets!
+raise_read_only_error!
+end
+
+def import(*args)
+raise_read_only_error!
+end
+
+
+def drop_dataset!
+raise_read_only_error!
+end
+
+private
+
+def raise_read_only_error!
+raise NotImplementedError, 'External data nodes are read only'
+end
+
+end # class ExternalDataNode
+end # module Nodes
+end # module Dataflow
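
A brief sketch of what the read-only node is for: exposing an existing, externally managed table as a dependency while rejecting writes. Names are hypothetical and the example is not part of the gem.

# Reads behave like a normal DataNode...
external = Dataflow::Nodes::ReadOnlyDataNode.create(
  name:       'warehouse_customers',
  db_name:    'warehouse',
  db_backend: :postgresql
)
external.all(where: { 'country' => 'JP' })

# ...but anything that would modify the dataset raises.
external.recreate_dataset  # => NotImplementedError: External data nodes are read only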
data/lib/dataflow/nodes/runtime_query_node.rb
ADDED
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+module Dataflow
+# Interface for a node that behaves as a dataset.
+# Does not support any operation.
+# Inherit and override to implement custom behavior.
+module Nodes
+class RuntimeQueryNode < DataNode
+
+after_initialize do
+self.db_backend = :none
+end
+
+def handle_dataset_settings_changed
+# dot not do anything, there is no real dataset
+end
+
+def all(*_args)
+raise NotImplementedError, 'this node does not support #all'
+end
+
+def count(*_args)
+raise NotImplementedError, 'this node does not support #count'
+end
+
+def find(*_args)
+raise NotImplementedError, 'this node does not support #find'
+end
+
+def all_paginated(*_args)
+raise NotImplementedError, 'this node does not support #all_paginated'
+end
+
+def add(*_args)
+raise NotImplementedError, 'this node does not support #add'
+end
+
+def clear(*_args)
+raise NotImplementedError, 'this node does not support #clear'
+end
+end
+end
+end
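
RuntimeQueryNode is a base class: it opts out of a backing dataset (db_backend :none) and raises on every dataset operation until a subclass overrides what it needs. An illustrative subclass, not part of the gem:

# Answers #all and #count at query time instead of reading a stored table.
class CurrentTimeNode < Dataflow::Nodes::RuntimeQueryNode
  def all(*_args)
    [{ 'now' => Time.now.utc }]
  end

  def count(*_args)
    1
  end
end

node = CurrentTimeNode.create(name: 'current_time', db_name: 'dataflow')
node.all    # => [{ 'now' => ... }]
node.clear  # still raises NotImplementedError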
data/lib/dataflow/schema_mixin.rb
CHANGED
@@ -60,7 +60,7 @@ module Dataflow
 equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
 count_per_process = [max_per_process, equal_split_per_process].min

-queries = ordered_system_id_queries(batch_size: count_per_process)
+queries = ordered_system_id_queries(batch_size: count_per_process, where: where)

 sch = schema_inferrer.infer_schema(batch_count: queries.count, extended: extended) do |idx|
 all(where: queries[idx].merge(where))
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-version: 0.
+version: 0.13.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-05-
+date: 2017-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: bundler
@@ -336,6 +336,8 @@ files:
 - lib/dataflow/nodes/merge_node.rb
 - lib/dataflow/nodes/mixin/add_internal_timestamp.rb
 - lib/dataflow/nodes/mixin/rename_dotted_fields.rb
+- lib/dataflow/nodes/read_only_data_node.rb
+- lib/dataflow/nodes/runtime_query_node.rb
 - lib/dataflow/nodes/select_keys_node.rb
 - lib/dataflow/nodes/snapshot_node.rb
 - lib/dataflow/nodes/sql_query_node.rb