dataflow-rb 0.12.1 → 0.13.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/lib/dataflow-rb.rb +2 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +32 -11
- data/lib/dataflow/adapters/psql_adapter.rb +22 -0
- data/lib/dataflow/adapters/settings.rb +69 -2
- data/lib/dataflow/adapters/sql_adapter.rb +39 -59
- data/lib/dataflow/node.rb +0 -4
- data/lib/dataflow/nodes/compute_node.rb +53 -29
- data/lib/dataflow/nodes/data_node.rb +34 -7
- data/lib/dataflow/nodes/read_only_data_node.rb +62 -0
- data/lib/dataflow/nodes/runtime_query_node.rb +42 -0
- data/lib/dataflow/nodes/sql_query_node.rb +3 -1
- data/lib/dataflow/schema_mixin.rb +1 -1
- data/lib/dataflow/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 72840e2477fe869fb06b0299c96d5ae2a57c7713
+  data.tar.gz: f9f03314f23473585a9e742740c0e809f2d99bc7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 43f7cef4b2150017871cb7b3c0f21602a01f385e40d07ddf7000f455a4adc007669974fd4e7170e4acc3807feae907f6114e3b3cbfbdbbf36f96348c3a06f60c
+  data.tar.gz: d16411f178fa8ccc00cc9dbaefd0905040a5f8354874f7b00ad5080681d912fa8051dd16e3975d25aed8c737fdf70594f3bb77a948e95665d92a198f0e65206c
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,23 @@
 # Changelog

+
+ #### 0.13.0
+ - [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
+ - [a17f071] Add runtime query node. Make the ops transformation public.
+ - [8c78aa2] Added support for a per-node backup/restore
+ - [6069ec0] Moved the db settings to the settings class
+ - [b5a77fc] Set the last update time using a query directly on the DB. Do not return unneeded information from the recompute/explain method
+ - [cc77366] Explain why a node needs an update
+ - [e87ba14] Add logging to the sql query node
+ - [5d82dfc] Fix logging during the sql table creation.
+ - [7390264] Add a read-only data node
+ - [dbb14ed] Refactor the debugging implementation
+ - [38925a3] Added parameters on the data node to flexibly connect to any database
+ - [7aac1eb] Add support for partial (where clause) parallel queries generation.
+
+ #### 0.12.1
+ - [110ded7] Fix compute node not processing in parallel
+
 #### 0.12.0
 - [4a510df] Add support for case insensitive regex matching on mysql
 - [63b0771] Add logging to understand the current computation batch progress
data/lib/dataflow-rb.rb
CHANGED
@@ -36,6 +36,8 @@ require 'dataflow/nodes/compute_node'
 require 'dataflow/nodes/join_node'
 require 'dataflow/nodes/map_node'
 require 'dataflow/nodes/merge_node'
+ require 'dataflow/nodes/read_only_data_node'
+ require 'dataflow/nodes/runtime_query_node'
 require 'dataflow/nodes/select_keys_node'
 require 'dataflow/nodes/snapshot_node'
 require 'dataflow/nodes/sql_query_node'
data/lib/dataflow/adapters/mongo_db_adapter.rb
CHANGED
@@ -9,9 +9,9 @@ module Dataflow
     class << self
       def client(settings, db_name: nil)
         @clients ||= {}
-
-
-         connection_uri = settings.
+
+         settings.adapter_type = 'mongodb'
+         connection_uri = settings.connection_uri_or_default
         db_name ||= settings.db_name
         @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
       end
@@ -113,8 +113,8 @@ module Dataflow
     end

     # Create queries that permit processing the whole dataset in parallel without using offsets.
-     def ordered_system_id_queries(batch_size:)
-       ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+     def ordered_system_id_queries(batch_size:, where: {})
+       ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
       queries_count = (ids.size / batch_size.to_f).ceil
       Array.new(queries_count) do |i|
         from = ids[i * batch_size]
@@ -225,14 +225,25 @@ module Dataflow
       }
     end

-
-
-
-       settings.
+     def dump(base_folder:)
+       archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.gz"
+       options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+       options += "--password=#{@settings.db_password}" if @settings.db_password.present?
+       `mkdir -p #{base_folder}`
+       `mongodump #{options} --gzip`
+       archive_path
     end

-     def
-       settings.read_dataset_name
+     def restore(filepath:)
+       options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
+       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+       options += "--password=#{@settings.db_password}" if @settings.db_password.present?
+       `mongorestore #{options} --gzip`
     end

     def transform_to_query(opts)
@@ -275,6 +286,16 @@ module Dataflow
       sanitized_opts
     end

+     private
+
+     def write_dataset_name
+       settings.write_dataset_name
+     end
+
+     def read_dataset_name
+       settings.read_dataset_name
+     end
+
     def try_cast_value(field, value)
       # cast to time when querying on _mojaco_updated_at
       return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
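The ordered_system_id_queries change above is what the changelog calls "partial (where clause) parallel queries generation": the _id-range batches can now be restricted to a subset of the dataset. A minimal usage sketch, assuming `node` is a Dataflow::Nodes::DataNode backed by this adapter and `process` is a hypothetical helper of yours:

# Split only the records matching the where clause into _id ranges,
# then work on each range in parallel (sketch; `process` is hypothetical).
where = { 'status' => 'pending' }
queries = node.ordered_system_id_queries(batch_size: 1000, where: where)

Parallel.each(queries) do |query|
  process(node.all(where: query.merge(where)))
end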
data/lib/dataflow/adapters/psql_adapter.rb
CHANGED
@@ -24,6 +24,28 @@ module Dataflow
       def regex_case_insensitive_op
         '~*'
       end
+
+     def dump(base_folder:)
+       archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.dump"
+       options = "--table=public.#{@settings.read_dataset_name}"
+       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+       password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+       `mkdir -p #{base_folder}`
+       `#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
+       archive_path
+     end
+
+     def restore(filepath:)
+       options = "--table=#{@settings.read_dataset_name}"
+       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
+       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
+       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+       password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
+       p "#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}"
+       `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
+     end
     end
   end
 end
data/lib/dataflow/adapters/settings.rb
CHANGED
@@ -2,16 +2,23 @@
 module Dataflow
   module Adapters
     class Settings
-       attr_accessor :connection_uri, :db_name,
-                     :
+       attr_accessor :connection_uri, :db_name,
+                     :db_host, :db_port, :db_user, :db_password,
+                     :dataset_name, :read_dataset_name, :write_dataset_name,
+                     :indexes, :adapter_type, :schema

      def initialize(data_node: nil, connection_uri: nil, db_name: nil,
+                    db_host: nil, db_port: nil, db_user: nil, db_password: nil,
                     dataset_name: nil, indexes: nil, adapter_type: nil, schema: nil)
        @connection_uri = connection_uri

        # first try to set the options based on the data node settings
        if data_node.present?
          @db_name = data_node.db_name
+          @db_host = data_node.db_host
+          @db_port = data_node.db_port
+          @db_user = data_node.db_user
+          @db_password = data_node.db_password
          @dataset_name = data_node.name
          @read_dataset_name = data_node.read_dataset_name
          @write_dataset_name = data_node.write_dataset_name
@@ -21,6 +28,10 @@ module Dataflow

        # override if needed
        @db_name ||= db_name
+        @db_host ||= db_host
+        @db_port ||= db_port
+        @db_user ||= db_user
+        @db_password ||= db_password
        @dataset_name ||= dataset_name
        @read_dataset_name ||= dataset_name
        @write_dataset_name ||= dataset_name
@@ -28,6 +39,62 @@ module Dataflow
        @adapter_type ||= adapter_type
        @schema ||= schema
      end
+
+      def set_mongodb_defaults_if_needed!
+        @db_host ||= ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+        @db_port ||= ENV['MOJACO_MONGO_PORT'] || '27017'
+        @db_user ||= ENV['MOJACO_MONGO_USER']
+        @db_password ||= ENV['MOJACO_MONGO_USER']
+      end
+
+      def set_postgresql_defaults_if_needed!
+        @db_host ||= ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
+        @db_port ||= ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
+        @db_user ||= ENV['MOJACO_POSTGRESQL_USER']
+        @db_password ||= ENV['MOJACO_POSTGRESQL_PASSWORD']
+      end
+
+      def set_mysql_defaults_if_needed!
+        @db_host ||= ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
+        @db_port ||= ENV['MOJACO_MYSQL_PORT'] || '3306'
+        @db_user ||= ENV['MOJACO_MYSQL_USER']
+        @db_password ||= ENV['MOJACO_MYSQL_PASSWORD']
+      end
+
+      def connection_uri_or_default
+        return @connection_uri if @connection_uri.present?
+
+        send("#{@adapter_type}_default_connection_uri")
+      end
+
+      def mongodb_default_connection_uri
+        set_mongodb_defaults_if_needed!
+
+        # if user/password are empty, the user_password will be empty as well
+        user_password = @db_user
+        user_password += ":#{@db_password}" if @db_password.present?
+        user_password += '@' if user_password.present?
+
+        # [username:password@]host1[:port1]
+        "#{user_password}#{@db_host}:#{@db_port}"
+      end
+
+      def mysql_default_connection_uri
+        set_mysql_defaults_if_needed!
+        sql_default_connection_uri('mysql2')
+      end
+
+      def postgresql_default_connection_uri
+        set_postgresql_defaults_if_needed!
+        sql_default_connection_uri('postgresql')
+      end
+
+      def sql_default_connection_uri(scheme)
+        user_password = @db_user
+        user_password += ":#{@db_password}" if @db_password.present?
+
+        "#{scheme}://#{user_password}@#{@db_host}:#{@db_port}"
+      end
     end
   end
 end
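A rough sketch of how the new Settings fields resolve to a connection URI, assuming none of the MOJACO_* environment variables are set (the host and port shown are the hard-coded fallbacks from set_postgresql_defaults_if_needed!; the database name is illustrative):

settings = Dataflow::Adapters::Settings.new(
  adapter_type: 'postgresql',
  db_name: 'dataflow_example',   # hypothetical database name
  db_user: 'postgres',
  db_password: 'secret'
)

# Explicit connection_uri wins; otherwise the adapter-specific default is built.
settings.connection_uri_or_default
# => "postgresql://postgres:secret@127.0.0.1:5432"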
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -9,37 +9,17 @@ module Dataflow
       # @param settings [Hash] Represents the connection settings to the DB.
       # @param db_name [String] The database name to which the client will connect.
       # @return [Sequel::Database] a sequel database object.
-       def client(settings
+       def client(settings)
         @clients ||= {}
-
-         case settings.adapter_type
-         when 'mysql2'
-           host = ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
-           port = ENV['MOJACO_MYSQL_PORT'] || '3306'
-           user = ENV['MOJACO_MYSQL_USER']
-           password = ENV['MOJACO_MYSQL_PASSWORD']
-         when 'postgresql'
-           host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
-           port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
-           user = ENV['MOJACO_POSTGRESQL_USER']
-           password = ENV['MOJACO_POSTGRESQL_PASSWORD']
-         end
-
-         db_name ||= settings.db_name
-         user_password = user
-         user_password += ":#{password}" if password.present?
-
-         uri = "#{settings.adapter_type}://#{user_password}@#{host}:#{port}"
-         connection_uri = settings.connection_uri || "#{uri}/#{db_name}"
-
+         connection_uri = settings.connection_uri_or_default
         return @clients[connection_uri] if @clients[connection_uri].present?

         # first, make sure the DB is created (if it is not an external db)
         is_external_db = settings.connection_uri.present?
-         try_create_db(
+         try_create_db(connection_uri, settings.db_name) unless is_external_db

         # then, create the connection object
-         db = Sequel.connect("#{connection_uri}?encoding=utf8")
+         db = Sequel.connect("#{connection_uri}/#{settings.db_name}?encoding=utf8")
         add_extensions(settings, db)
         @clients[connection_uri] = db
       end
@@ -48,8 +28,8 @@ module Dataflow
       # @param uri [String] the connection uri to the DB.
       # @param db_name [String] the database name.
       # @return [Boolean] whether the db was created or not.
-       def try_create_db(uri, db_name
-         Sequel.connect(uri
+       def try_create_db(uri, db_name)
+         Sequel.connect(uri) do |db|
           db.run("CREATE DATABASE #{db_name}")
           true
         end
@@ -108,7 +88,7 @@ module Dataflow

       (sort || {}).each do |k, v|
         sort_value = v == 1 ? k.to_sym : Sequel.desc(k.to_sym)
-         res = res.
+         res = res.order_append(sort_value)
       end

       res = res.offset(offset) if offset > 0
@@ -127,8 +107,8 @@ module Dataflow
     end

     # Create queries that permit processing the whole dataset in parallel without using offsets.
-     def ordered_system_id_queries(batch_size:)
-       ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
+     def ordered_system_id_queries(batch_size:, where: {})
+       ids = all(fields: [SYSTEM_ID], where: where, sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
       queries_count = (ids.size / batch_size.to_f).ceil
       Array.new(queries_count) do |i|
         from = ids[i * batch_size]
@@ -195,7 +175,7 @@ module Dataflow
     def recreate_dataset(dataset: nil)
       dataset ||= settings.write_dataset_name.to_sym
       drop_dataset(dataset)
-       create_table(dataset, @schema)
+       create_table(dataset, @schema, logger)
     end

     # drops the given dataset
@@ -248,12 +228,40 @@ module Dataflow
       table_usage.merge(effective_indexes: indexes)
     end

+     def transform_to_query(opts)
+       # map to a serie of AND clauses queries
+       opts.flat_map do |k, v|
+         if v.is_a? Hash
+           v.map do |operator, value|
+             case operator
+             when '!='
+               if value.is_a? Array
+                 Sequel.lit("#{k} NOT IN ?", value)
+               else
+                 Sequel.lit("#{k} <> ?", value)
+               end
+             when '<', '<=', '>', '>='
+               Sequel.lit("#{k} #{operator} ?", value)
+             when '~'
+               Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
+             when '~*'
+               Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
+             end
+           end
+         else
+           # e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
+           # are supported with simples hashes
+           [[{ k.to_sym => v }]]
+         end
+       end
+     end
+
     private

     MAX_INT = 2_147_483_647
     MAX_VARCHAR = 255

-     def create_table(dataset, schema)
+     def create_table(dataset, schema, logger)
       client.create_table(dataset.to_sym) do
         # always add an _id field to be used internally
         primary_key SYSTEM_ID
@@ -309,34 +317,6 @@ module Dataflow
       res
     end

-     def transform_to_query(opts)
-       # map to a serie of AND clauses queries
-       opts.flat_map do |k, v|
-         if v.is_a? Hash
-           v.map do |operator, value|
-             case operator
-             when '!='
-               if value.is_a? Array
-                 Sequel.lit("#{k} NOT IN ?", value)
-               else
-                 Sequel.lit("#{k} <> ?", value)
-               end
-             when '<', '<=', '>', '>='
-               Sequel.lit("#{k} #{operator} ?", value)
-             when '~'
-               Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
-             when '~*'
-               Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
-             end
-           end
-         else
-           # e.g. simple match { 'id' => 1} or IN clauses { 'id' => [1,2] }
-           # are supported with simples hashes
-           [[{ k.to_sym => v }]]
-         end
-       end
-     end
-
     # Required index format for sequel:
     # :keys, unique: true
     def index_parameters(index)
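transform_to_query was moved out of the private section above ("[a17f071] ... Make the ops transformation public"), so callers can turn the gem's hash-style conditions into Sequel expressions directly. A small sketch of the supported operator shapes, assuming `adapter` is a Dataflow::Adapters::SqlAdapter instance obtained from a SQL-backed data node:

# Simple equality or IN clause:
adapter.transform_to_query('id' => [1, 2, 3])

# Operator hashes: !=, <, <=, >, >=, ~ (regex), ~* (case-insensitive regex)
adapter.transform_to_query(
  'updated_at' => { '>=' => '2017-01-01' },
  'status'     => { '!=' => 'archived' }
)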
data/lib/dataflow/node.rb
CHANGED
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -156,6 +156,22 @@ module Dataflow
       true
     end

+     # Logs out the dependencies tree update time and whether
+     # it should or not be updated. Useful to understand
+     # why a given nodes had to be recomputed.
+     def explain_update(depth: 0, verbose: false)
+       if depth == 0 || !updated? || verbose
+         logger.log("#{'>' * (depth + 1)} #{name} [COMPUTE] | #{updated? ? 'UPDATED' : 'OLD'} = #{updated_at}")
+       end
+
+       return if updated? && !verbose
+
+       dependencies.each do |dependency|
+         dependency.explain_update(depth: depth + 1, verbose: verbose)
+       end
+       true
+     end
+
     # Keep a uniform interface with a DataNode.
     def updated_at
       last_compute_starting_time
@@ -183,11 +199,11 @@ module Dataflow
     # even if the node is already up to date.
     def recompute(depth: 0, force_recompute: false)
       send_heartbeat
-       logger.log
+       logger.log("#{'>' * (depth + 1)} #{name} started recomputing...")
       start_time = Time.now

       parallel_each(dependencies) do |dependency|
-         logger.log
+         logger.log("#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}...")
         if !dependency.updated? || force_recompute
           dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
         end
@@ -196,11 +212,11 @@ module Dataflow

       # Dependencies data may have changed in a child process.
       # Reload to make sure we have the latest metadata.
-       logger.log
+       logger.log("#{'>' * (depth + 1)} #{name} reloading dependencies...")
       dependencies(reload: true)

       compute(depth: depth, force_compute: force_recompute)
-       logger.log
+       logger.log("#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute.")

       true
     end
@@ -216,13 +232,13 @@ module Dataflow
       validate!

       if updated? && !force_compute
-         logger.log
+         logger.log("#{'>' * (depth + 1)} #{name} is up-to-date.")
         return
       end

       has_compute_lock = acquire_computing_lock!
       if has_compute_lock
-         logger.log
+         logger.log("#{'>' * (depth + 1)} #{name} started computing.")
         on_computing_started
         start_time = Time.now

@@ -254,15 +270,15 @@ module Dataflow
         data_node&.swap_read_write_datasets!
       end

-
-         save
+         set_last_compute_starting_time(start_time)
       duration = Time.now - start_time
-         logger.log
+         logger.log("#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute.")
       on_computing_finished(state: 'computed')
+         true
     else
-         logger.log
+         logger.log("#{'>' * (depth + 1)} [IS AWAITING] #{name}.")
       await_computing!
-         logger.log
+         logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
     end

   rescue StandardError => e
@@ -412,6 +428,17 @@ module Dataflow
         .find_one_and_update(update_query)
     end

+     def set_last_compute_starting_time(time)
+       # this is just to avoid the reload.
+       # But this change will not be propagated across processes
+       self.last_compute_starting_time = time
+       # update directly on the DB
+       update_query = { '$set' => { last_compute_starting_time: time } }
+       Dataflow::Nodes::ComputeNode.where(_id: _id)
+                                   .find_one_and_update(update_query)
+
+     end
+
     ##############################
     # Dependency validations
     ##############################
@@ -505,24 +532,21 @@ module Dataflow
       Mongoid.disconnect_clients

       # set to true to debug code in the iteration
-       is_debugging_impl =
-       if is_debugging_impl
-
-
-
-
-
-
-
-
-
-
-
-
-         Dataflow::Adapters::MongoDbAdapter.disconnect_clients
-         Mongoid.disconnect_clients
-       end
+       is_debugging_impl = ENV['DEBUG_DATAFLOW']
+       opts = if is_debugging_impl
+                # this will turn of the parallel processing
+                { in_processes: 0 }
+              elsif max_parallel_processes > 0
+                { in_processes: max_parallel_processes }
+              else
+                {}
+              end
+
+       Parallel.each(itr, opts) do |*args|
+         yield(*args)
+         Dataflow::Adapters::SqlAdapter.disconnect_clients
+         Dataflow::Adapters::MongoDbAdapter.disconnect_clients
+         Mongoid.disconnect_clients
       end
     end

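Two user-facing consequences of the compute node changes above, sketched with a hypothetical `node`: explain_update walks the dependency tree and logs why a recompute is (or is not) needed, and the refactored parallel_each falls back to single-process execution when the DEBUG_DATAFLOW environment variable is set:

# Log the update state of this node and of any stale dependencies.
node.explain_update(verbose: true)

# Disable the Parallel worker processes, which makes the per-batch
# blocks easier to step through in a debugger.
ENV['DEBUG_DATAFLOW'] = '1'
node.recompute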
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -24,11 +24,19 @@ module Dataflow
     # make sure we have only one node per db/table combination
     index({ db_name: 1, name: 1 }, unique: true)

+     # The dataset name used by this node for storage.
+     field :name, type: String, editable: false
+
     # The database name used by this node
     field :db_name, type: String, editable: false
-
-
-
+     # The database host (used the ENV settings by default)
+     field :db_host, type: String, editable: false
+     # The database port (used the ENV settings by default)
+     field :db_port, type: String, editable: false
+     # The database user (used the ENV settings by default)
+     field :db_user, type: String, editable: false
+     # The database password (used the ENV settings by default)
+     field :db_password, type: String, editable: false

     # The schema of this node
     field :schema, type: Hash, editable: false
@@ -163,8 +171,8 @@ module Dataflow
     # Parallel.each(queries) do |query|
     #   process(node.all(where: query))
     # end
-     def ordered_system_id_queries(batch_size:)
-       db_adapter.ordered_system_id_queries(batch_size: batch_size)
+     def ordered_system_id_queries(batch_size:, where: {})
+       db_adapter.ordered_system_id_queries(batch_size: batch_size, where: {})
     end

     # Counts how many records matches the condition or all if no condition is given.
@@ -297,6 +305,13 @@ module Dataflow
       (db_backend.to_s =~ /sql/).present?
     end

+     def updated?
+       true
+     end
+
+     def explain_update(depth: 0, verbose: false)
+       logger.log("#{'>' * (depth + 1)} #{name} [Dataset] | UPDATED = #{updated_at}")
+     end

     def required_by
       super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
@@ -323,6 +338,18 @@ module Dataflow
       db_adapter.drop_dataset(read_dataset_name)
     end

+     # Dump a backup of this dataset to a file.
+     # @return [String] the filepath to the dump file.
+     def dump_dataset(base_folder: './dump')
+       db_adapter.dump(base_folder: base_folder)
+     end
+
+     # Restore a dump of this dataset
+     # @param files [String] the filepath to the dump file.
+     def restore_dataset(filepath:)
+       db_adapter.restore(filepath: filepath)
+     end
+
     private

     def db_adapter(connection_opts = {})
@@ -342,9 +369,9 @@ module Dataflow
       @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
       return @csv_adapter
     when 'mysql'
-       opts[:adapter_type] = '
+       opts[:adapter_type] = 'mysql'
       return Adapters::SqlAdapter.new(opts) if has_options
-       @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: '
+       @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql')
       return @mysql_adapter
     when 'postgresql'
       opts[:adapter_type] = 'postgresql'
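A minimal sketch of the new per-node backup/restore API added above, assuming a data node backed by MongoDB or PostgreSQL (the adapters shell out to mongodump/mongorestore or pg_dump/pg_restore respectively); the node name is hypothetical:

node = Dataflow::Nodes::DataNode.find_by(name: 'my_dataset')  # hypothetical node

# Writes an archive such as ./dump/<db_name>.my_dataset.gz (mongo)
# or .dump (postgres) and returns its path.
path = node.dump_dataset(base_folder: './dump')

# Later, restore the dataset from that archive.
node.restore_dataset(filepath: path)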
data/lib/dataflow/nodes/read_only_data_node.rb
ADDED
@@ -0,0 +1,62 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     # Only supports read operations
+     class ReadOnlyDataNode < DataNode
+
+       def set_defaults
+         super
+         self.use_double_buffering = false
+       end
+
+
+       def handle_dataset_settings_changed
+         # ignore - do not do anyhing
+       end
+
+       def add(*args)
+         raise_read_only_error!
+       end
+
+       def clear(*args)
+         raise_read_only_error!
+       end
+
+       def recreate_dataset(*args)
+         raise_read_only_error!
+       end
+
+       def create_unique_indexes(*args)
+         raise_read_only_error!
+       end
+
+       def create_non_unique_indexes(*args)
+         raise_read_only_error!
+       end
+
+       def read_dataset_name=(*args)
+         raise_read_only_error!
+       end
+
+       def swap_read_write_datasets!
+         raise_read_only_error!
+       end
+
+       def import(*args)
+         raise_read_only_error!
+       end
+
+
+       def drop_dataset!
+         raise_read_only_error!
+       end
+
+       private
+
+       def raise_read_only_error!
+         raise NotImplementedError, 'External data nodes are read only'
+       end
+
+     end # class ExternalDataNode
+   end # module Nodes
+ end # module Dataflow
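The new ReadOnlyDataNode can be pointed at an existing table or collection that dataflow-rb should consume but never modify: handle_dataset_settings_changed is a no-op and every mutating method raises NotImplementedError. A sketch with hypothetical names (the database and dataset are assumed to already exist):

external = Dataflow::Nodes::ReadOnlyDataNode.create(
  db_name: 'legacy_db',   # hypothetical existing database
  name: 'customers'       # hypothetical existing collection/table
)

external.all(where: { 'country' => 'JP' })   # reads behave like a regular DataNode
external.add(records: [{ 'id' => 1 }])       # => raises NotImplementedError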
data/lib/dataflow/nodes/runtime_query_node.rb
ADDED
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+ module Dataflow
+   # Interface for a node that behaves as a dataset.
+   # Does not support any operation.
+   # Inherit and override to implement custom behavior.
+   module Nodes
+     class RuntimeQueryNode < DataNode
+
+       after_initialize do
+         self.db_backend = :none
+       end
+
+       def handle_dataset_settings_changed
+         # dot not do anything, there is no real dataset
+       end
+
+       def all(*_args)
+         raise NotImplementedError, 'this node does not support #all'
+       end
+
+       def count(*_args)
+         raise NotImplementedError, 'this node does not support #count'
+       end
+
+       def find(*_args)
+         raise NotImplementedError, 'this node does not support #find'
+       end
+
+       def all_paginated(*_args)
+         raise NotImplementedError, 'this node does not support #all_paginated'
+       end
+
+       def add(*_args)
+         raise NotImplementedError, 'this node does not support #add'
+       end
+
+       def clear(*_args)
+         raise NotImplementedError, 'this node does not support #clear'
+       end
+     end
+   end
+ end
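RuntimeQueryNode is only an interface: it sets db_backend to :none and raises on every dataset operation, so it is meant to be subclassed with the query behaviour you need at runtime. A hypothetical subclass overriding #all:

# Hypothetical node that computes its records at query time
# instead of reading them from a stored dataset.
class CurrentTimeNode < Dataflow::Nodes::RuntimeQueryNode
  def all(*)
    [{ 'now' => Time.now.utc }]
  end
end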
data/lib/dataflow/schema_mixin.rb
CHANGED
@@ -60,7 +60,7 @@ module Dataflow
     equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
     count_per_process = [max_per_process, equal_split_per_process].min

-     queries = ordered_system_id_queries(batch_size: count_per_process)
+     queries = ordered_system_id_queries(batch_size: count_per_process, where: where)

     sch = schema_inferrer.infer_schema(batch_count: queries.count, extended: extended) do |idx|
       all(where: queries[idx].merge(where))
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-   version: 0.12.1
+   version: 0.13.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
- date: 2017-05-
+ date: 2017-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -336,6 +336,8 @@ files:
 - lib/dataflow/nodes/merge_node.rb
 - lib/dataflow/nodes/mixin/add_internal_timestamp.rb
 - lib/dataflow/nodes/mixin/rename_dotted_fields.rb
+ - lib/dataflow/nodes/read_only_data_node.rb
+ - lib/dataflow/nodes/runtime_query_node.rb
 - lib/dataflow/nodes/select_keys_node.rb
 - lib/dataflow/nodes/snapshot_node.rb
 - lib/dataflow/nodes/sql_query_node.rb