dataflow-rb 0.9.0
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0
data/lib/dataflow/nodes/join_node.rb
@@ -0,0 +1,151 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Performs a join operation on 2 dependencies.
    class JoinNode < ComputeNode
      VALID_TYPES = %w(inner left).freeze
      field :join_type, type: String, required_for_computing: true, values: VALID_TYPES, default: VALID_TYPES[0]
      field :key1, type: String, required_for_computing: true
      field :key2, type: String, required_for_computing: true
      # Support joining on multiple keys by setting them in the other keys.
      # other_keys_1 and 2 must match in length
      field :other_keys1, type: Array, default: []
      field :other_keys2, type: Array, default: []
      field :prefix1, type: String, default: ''
      field :prefix2, type: String, default: ''

      ensure_data_node_exists
      ensure_dependencies exactly: 2

      def valid_for_computation?
        # We need an equivalent number of keys as they will be matched with each others
        if other_keys1.count != other_keys2.count
          errors.add(:other_keys2, "#{self.class} other_keys2 must match other_keys1's length")
        end

        super
      end

      def required_schema
        return {} unless dependencies.count == 2

        # merge both dependencies schemas
        sch = dependencies.first.schema || {}
        sch.merge(dependencies.second.schema || {})
      end

      def compute_impl
        all_same_postgresql = db_backend == :postgresql
        all_same_postgresql &&= dependencies[1..-1].all? do |dep|
          dep.db_backend == :postgresql && dep.db_name == db_name
        end

        if all_same_postgresql
          # use SQL join
          execute_sql_join
          self.updated_at = Time.now
        else
          # use software join
          super
        end
      end

      private

      def execute_sql_join
        fields = required_schema.keys
        select_keys = dependencies[0].schema.keys.map { |x| "d1.#{x}" } + (dependencies[1].schema.keys - dependencies[0].schema.keys).map { |x| "d2.#{x}" }
        query = "INSERT INTO #{write_dataset_name} (#{fields.join(',')})
                 SELECT #{select_keys.join(', ')}
                 FROM #{dependencies[0].read_dataset_name} as d1
                 INNER JOIN #{dependencies[1].read_dataset_name} as d2
                 ON d1.#{key1} = d2.#{key2}"
        p query
        db_adapter.client[query].to_a
      end

      def compute_batch(records:)
        join(n1_records: records)
      end

      def join(n1_records:)
        tokens_key1 = record_dig_tokens(key: key1, use_sym: dependencies.first.use_symbols?)
        tokens_key2 = record_dig_tokens(key: key2, use_sym: dependencies.second.use_symbols?)
        other_tokens_key1 = (other_keys1 || []).map do |key|
          record_dig_tokens(key: key, use_sym: dependencies.second.use_symbols?)
        end
        other_tokens_key2 = (other_keys2 || []).map do |key|
          record_dig_tokens(key: key, use_sym: dependencies.second.use_symbols?)
        end

        # fetch necessary records from node2
        node2 = dependencies.second
        n2_ids = n1_records.map { |x| x.dig(*tokens_key1) }.compact.uniq
        n2_records = node2.all(where: { key2 => n2_ids })

        # preload and map dataset2 by the key we want to lookup
        mapped_data2 = {}
        if has_multiple_keys?
          n2_records.each do |datum2|
            lookup_value = datum2.dig(*tokens_key2)
            mapped_data2[lookup_value] ||= []
            mapped_data2[lookup_value] << datum2
          end
        else
          n2_records.each do |datum2|
            lookup_value = datum2.dig(*tokens_key2)
            mapped_data2[lookup_value] = datum2
          end
        end

        # for each datum in dataset1, find the corresponding datum in dataset2
        n1_records.map do |d1|
          join_value = d1.dig(*tokens_key1)
          next if join_value.nil?

          d2 = mapped_data2[join_value]
          if has_multiple_keys? && !d2.nil?
            # in this case, it will be an array,
            # so we need to further search the correct datum
            d2 = find_matching_record(d1, d2, other_tokens_key1, other_tokens_key2)
          end

          # if there is no d2, only continue based on the type of join we want.
          next if d2.blank? && join_type == 'inner'

          # there might be the case that nothing was found after-all
          d2 ||= {}

          # prefix if needed
          d1 = Hash[d1.map { |k, v| ["#{prefix1}#{k}", v] }] if prefix1.present?
          d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }] if prefix2.present?

          d1.reverse_merge(d2)
        end.compact
      end

      def has_multiple_keys?
        other_keys1.present? && other_keys2.present?
      end

      # Find a record in d2_list that can be join with d1 based on
      # the values in the fields specified in other_keys_1/2.
      # @param d1 [Hash] a datum
      # @param d2_list [Array] an array of datums that may match with d1
      # @param other_keys1 [Array] an array of arrays (tokens) that will
      #                    be used to fetch the corresponding value in d1
      # @param other_keys2 [Array] an array of arrays (tokens) that will
      #                    be used to fetch the corresponding value in the d2_list
      # @return [Hash] a record if found, nil otherwise.
      def find_matching_record(d1, d2_list, other_tokens1, other_tokens2)
        values1 = other_tokens1.map { |tokens| d1.dig(*tokens) }
        d2_list.find do |d2|
          values1.each_with_index.all? do |value1, idx|
            # does this record match d1 on all the fields in other_key1/2?
            value1 == d2.dig(*(other_tokens2[idx]))
          end
        end
      end
    end
  end
end
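In the software-join path above, each record from the first dependency is optionally prefixed and then merged over its match from the second dependency via `d1.reverse_merge(d2)`, so values from the first dependency win on key conflicts. A standalone sketch with made-up records (not part of the gem) of how one joined hash is assembled:

```ruby
# Hypothetical records; plain Ruby only. d2.merge(d1) gives the same key/value
# result as ActiveSupport's d1.reverse_merge(d2).
d1 = { 'id' => 1, 'user_id' => 42, 'total' => 9.99 }  # record from dependency 1
d2 = { 'user_id' => 42, 'name' => 'Jane' }            # matching record from dependency 2

prefix2 = 'user_'
d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }]    # prefixing, as JoinNode does

joined = d2.merge(d1)
# => {"user_user_id"=>42, "user_name"=>"Jane", "id"=>1, "user_id"=>42, "total"=>9.99}
```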
data/lib/dataflow/nodes/map_node.rb
@@ -0,0 +1,50 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Performs a map operation on 2 dependencies.
    class MapNode < ComputeNode
      ensure_data_node_exists
      ensure_dependencies exactly: 2

      private

      def compute_batch(records:)
        map(records: records, mapping_node: dependencies.second)
      end

      def map(records:, mapping_node:)
        mapping_table = mapping_node.all

        records.each do |record|
          mapping_table.each { |mapping| map_record(record, mapping) }
        end

        records
      end

      def map_record(record, mapping)
        original_key = mapping['key']
        original_value = record_value(record: record, key: original_key)
        mapped_key = mapping['mapped_key']
        mapped_value = nil

        if mapping['map'].present?
          # re-map either the key/value with a lambda(key,value)
          result = eval(mapping['map']).call(original_key, original_value)
          mapped_key = result.keys[0]
          mapped_value = result.values[0]
        elsif mapping['values'].is_a? Hash
          # or from a hash-table that directly translates values
          mapped_value = mapping['values'][original_value]
          mapped_value ||= mapping['default']
        elsif mapping['values'].present?
          # or map the current value with a lambda(value)
          mapped_value = eval(mapping['values']).call(original_value)
        end

        mapped_key ||= original_key
        record[mapped_key] = mapped_value || original_value
      end
    end
  end
end
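The mapping records read by `map_record` come from the second dependency. Based on the branches above, a mapping can remap via an eval'd key/value lambda, a lookup hash, or a value lambda. A hypothetical sketch of the three shapes (the field names and values are invented; only the quoted hash keys are the ones the code actually reads):

```ruby
# 1. 'values' as a lookup hash, with an optional 'default' fallback:
{ 'key' => 'status', 'mapped_key' => 'status_label',
  'values' => { 0 => 'inactive', 1 => 'active' }, 'default' => 'unknown' }

# 2. 'values' as a stringified lambda(value), eval'd at compute time:
{ 'key' => 'price_cents', 'mapped_key' => 'price',
  'values' => '->(value) { value.to_f / 100 }' }

# 3. 'map' as a stringified lambda(key, value) returning a one-entry hash whose
#    key and value become the mapped key and value:
{ 'key' => 'name', 'map' => '->(key, value) { { "#{key}_upcased" => value.to_s.upcase } }' }
```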
data/lib/dataflow/nodes/merge_node.rb
@@ -0,0 +1,33 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Performs a merge operation on 2 dependencies.
    class MergeNode < ComputeNode
      field :merge_key, type: String, default: ''
      field :merge_values, type: Array, default: []

      ensure_data_node_exists
      ensure_dependencies exactly: 2

      private

      def compute_impl
        process_parallel(node: dependencies.first) do |records|
          merge_records(records: records, index: 0)
        end

        process_parallel(node: dependencies.second) do |records|
          merge_records(records: records, index: 1)
        end
      end

      def merge_records(records:, index:)
        records.each do |record|
          # add a merge key with the corresponding value if necessary
          record[merge_key] = merge_values[index] if merge_key.present?
        end
        records
      end
    end
  end
end
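The optional `merge_key`/`merge_values` pair simply tags each record with which dependency it came from before both are written to the same dataset. A hypothetical sketch with made-up values:

```ruby
# merge_key = 'source', merge_values = ['orders_2016', 'orders_2017'] (invented names)
from_dep0 = { 'id' => 1 }
from_dep0['source'] = 'orders_2016'   # records from dependencies.first get merge_values[0]

from_dep1 = { 'id' => 2 }
from_dep1['source'] = 'orders_2017'   # records from dependencies.second get merge_values[1]
```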
data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb
@@ -0,0 +1,27 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    module Mixin
      # Add an internal updated_at timestamp to the records.
      module AddInternalTimestamp
        def self.included(base)
          base.class_eval do
            field :use_internal_timestamp, type: Boolean, default: true
            field :internal_timestamp_key, type: String, default: '_mojaco_updated_at'
          end
        end

        # Add an internal updated_at timestamp to the records
        def add_internal_timestamp(records:)
          return unless use_internal_timestamp
          return unless internal_timestamp_key.present?

          updated_at = Time.now
          records.each do |record|
            record[internal_timestamp_key] = updated_at
          end
        end
      end # module AddInternalTimestamp
    end # module Mixin
  end # module Nodes
end # module Dataflow
data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb
@@ -0,0 +1,63 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    module Mixin
      # Support tranversing the record and rename fields that contain a dot '.'.
      module RenameDottedFields
        # Add a mixin-specific field to the node
        def self.included(base)
          base.class_eval do
            field :rename_dotted_fields_in, type: Array
          end
        end

        # Rename the specified dotted fields
        def rename_dotted_fields(records:)
          return if rename_dotted_fields_in.blank?

          traverse_whole_record = rename_dotted_fields_in.include?('.')

          records.each do |record|
            if traverse_whole_record
              traverse_and_rename_dotted_fields(record)
            else
              rename_dotted_fields_in.each do |field|
                value = record[field]
                if value.is_a?(Array)
                  traverse_and_rename_dotted_fields_in_array(value)
                elsif value.is_a?(Hash)
                  traverse_and_rename_dotted_fields(value)
                end
              end
            end
          end
        end

        # Traverse a hash and look for the fields to rename
        def traverse_and_rename_dotted_fields(hash)
          return if hash.blank?

          hash.keys.each do |k|
            value = hash[k]
            if value.is_a?(Array)
              traverse_and_rename_dotted_fields_in_array(value)
            elsif value.is_a?(Hash)
              traverse_and_rename_dotted_fields(value)
            end

            next unless k.include?('.')
            hash[k.tr('.', '_')] = value
            hash.delete(k)
          end
        end

        # Looks for hashs in the array that may require a transformation
        def traverse_and_rename_dotted_fields_in_array(array)
          array.each do |v|
            traverse_and_rename_dotted_fields(v) if v.is_a?(Hash)
          end
        end
      end # module RenameDottedFields
    end # module Mixin
  end # module Nodes
end # module Dataflow
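This mixin rewrites keys containing a dot to use underscores, which matters for storage backends such as MongoDB where dotted field names are problematic. A hypothetical before/after, assuming `rename_dotted_fields_in` is set to `['.']` so the whole record (including nested hashes and arrays) is traversed:

```ruby
# Made-up record; shows the before/after shape only.
record = { 'user.name' => 'Jane', 'events' => [{ 'geo.lat' => 48.85 }] }
# After rename_dotted_fields(records: [record]):
#   { 'events' => [{ 'geo_lat' => 48.85 }], 'user_name' => 'Jane' }
```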
data/lib/dataflow/nodes/select_keys_node.rb
@@ -0,0 +1,39 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Performs a select operation on its dependency.
    class SelectKeysNode < ComputeNode
      field :keys, type: Array, required_for_computing: true

      ensure_data_node_exists
      ensure_dependencies exactly: 1

      def export(connection_opts: { db_backend: :csv }, keys: nil)
        super(connection_opts: connection_opts, keys: keys || self.keys)
      end

      private

      def compute_batch(records:)
        k = keys
        k = k.map(&:to_sym) if dependencies.first.use_symbols?
        select_keys(records: records, keys: k)
      end

      def select_keys(records:, keys:)
        records.map do |base_record|
          new_record = {}
          keys.each do |key|
            value = record_value(record: base_record, key: key)
            next unless value.present?

            add_value_to_record(record: new_record, key: key, value: value)
          end

          next unless new_record.present?
          new_record
        end.compact
      end
    end
  end
end
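Given the `next unless value.present?` and `next unless new_record.present?` guards above, keys with missing values are simply dropped and records containing none of the selected keys are removed entirely. A hypothetical sketch with `keys = ['id', 'email']`:

```ruby
# Made-up input records; the comments show what select_keys would produce for each.
records = [
  { 'id' => 1, 'email' => 'a@example.com', 'tmp' => 'x' }, # => { 'id' => 1, 'email' => 'a@example.com' }
  { 'id' => 2, 'tmp' => 'y' },                             # => { 'id' => 2 } ('email' missing, skipped)
  { 'tmp' => 'z' }                                         # => dropped (no selected key present)
]
```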
data/lib/dataflow/nodes/snapshot_node.rb
@@ -0,0 +1,77 @@
# frozen_string_literal: true
module Dataflow
  # Represents a node that captures changes over time.
  module Nodes
    # TODO: extend the unique node?
    class SnapshotNode < DataNode
      include Mixin::RenameDottedFields
      include Mixin::AddInternalTimestamp

      field :index_key, type: String, required_for_computing: true
      field :updated_at_key, type: String, required_for_computing: true

      validates_presence_of :index_key
      validates_presence_of :updated_at_key

      def set_defaults
        super

        self.indexes ||= []
        # get rid of keys/string confusion
        self.indexes = JSON.parse(self.indexes.to_json)

        # add keys for the index, updated_at and unique keys
        self.indexes += [{ 'key' => index_key }] if index_key
        self.indexes += [{ 'key' => updated_at_key }] if updated_at_key
        self.indexes += [{ 'key' => [index_key, updated_at_key], 'unique' => true }] if index_key && updated_at_key
        self.indexes.uniq!

        self.updated_at ||= Time.now
      end

      def add(records:)
        # TODO: create a chain of behavior "before add"
        rename_dotted_fields(records: records)
        add_internal_timestamp(records: records)

        records.delete_if do |record|
          convert_update_at_key(record)
          is_record_redundant?(record: record)
        end.compact
        super(records: records)
      end

      private

      # If this record already exists, and only the updated_at
      # key changed, but the rest of the content is the same,
      # we will consider it to be redundant
      def is_record_redundant?(record:)
        id = record[index_key]
        previous_record = db_adapter.find(where: { index_key => id },
                                          sort: { updated_at_key => -1 })
        return false if previous_record.blank?

        has_same_content = previous_record.keys == record.keys
        has_same_content &&= previous_record.keys.all? do |k|
          # we allow the updated_at key to change, or the mojaco time stamp
          next true if k == updated_at_key || k == internal_timestamp_key
          # but most importantly, the rest of the content should be the same
          record[k] == previous_record[k]
        end

        has_same_content
      end

      def convert_update_at_key(record)
        return if record[updated_at_key].is_a?(Time)

        # try to parse as a string
        record[updated_at_key] = Time.parse(record[updated_at_key])
      rescue TypeError
        # try to parse as a timestamp
        record[updated_at_key] = Time.at(record[updated_at_key])
      end
    end
  end
end
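A new snapshot is only stored when something other than the `updated_at` key (or the internal timestamp) changed compared to the latest stored version of the same index key. A hypothetical sketch with `index_key = 'order_id'` and `updated_at_key = 'updated_at'`:

```ruby
require 'time'

# Made-up records; only illustrates the redundancy rule above.
latest   = { 'order_id' => 1, 'status' => 'paid',     'updated_at' => Time.parse('2017-01-01') }
incoming = { 'order_id' => 1, 'status' => 'paid',     'updated_at' => Time.parse('2017-02-01') }
# Only 'updated_at' differs from the latest stored record, so is_record_redundant?
# returns true and the incoming record is skipped.

changed  = { 'order_id' => 1, 'status' => 'refunded', 'updated_at' => Time.parse('2017-02-01') }
# 'status' changed as well, so this one is kept as a new snapshot.
```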
data/lib/dataflow/nodes/sql_query_node.rb
@@ -0,0 +1,50 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Transforms the dependency's dataset to a SQL-compatible one.
    class SqlQueryNode < ComputeNode
      ensure_data_node_exists
      ensure_dependencies min: 0 # dependencies are not necessarily needed
      field :query, type: String, required_for_computing: true

      def valid_for_computation?
        unless (data_node&.db_backend.to_s =~ /sql/).present?
          errors.add(:db_backend, 'Must have a SQL based backend.')
        end

        begin
          computed_query
        rescue StandardError => e
          errors.add(:query, "Specified query has errors: #{e.message}")
        end

        super
      end

      def computed_query
        # 1. replace the current write dataset's name
        q = query.gsub('<node>', write_dataset_name)

        # 2. replace the dependencies' (read) dataset names
        q.gsub(/<[0-9]+>/) do |match|
          # [1..-2] will remove the 'less than' < and 'greater than' >
          dep_index = match[1..-2].to_i
          raise "Specified depependency #{match} does not exist. There are only #{dependencies.count} dependencies." if dep_index >= dependencies.count
          dependencies[dep_index].read_dataset_name
        end
      end

      def execute_query
        data_node.send(:db_adapter).client[computed_query].to_a
      end

      private

      # Overrides the base implementation.
      # This node will leave all the work to the DB.
      def compute_impl
        execute_query
      end
    end
  end
end
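`computed_query` defines a small placeholder syntax: `<node>` is replaced by this node's write dataset name, and `<0>`, `<1>`, and so on by the read dataset names of the corresponding dependencies. A hypothetical query string (the table and column names are invented):

```ruby
query = <<~SQL
  INSERT INTO <node> (id, total)
  SELECT o.id, SUM(l.amount)
  FROM <0> o
  INNER JOIN <1> l ON l.order_id = o.id
  GROUP BY o.id
SQL
```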
data/lib/dataflow/nodes/transformation/to_time_node.rb
@@ -0,0 +1,41 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    module Transformation
      # Transforms the given keys' values to Time.
      class ToTimeNode < ComputeNode
        field :keys, type: Array, required_for_computing: true, default: []

        ensure_data_node_exists
        ensure_dependencies exactly: 1

        def valid_for_computation?
          # It does not make sense to use this node without any keys specified.
          if (keys || []).count.zero?
            errors.add(:keys, "#{self.class} keys must contain at least 1 value")
          end

          super
        end

        def compute_batch(records:)
          key_tokens = keys.map do |key|
            record_dig_tokens(key: key, use_sym: dependencies.first.use_symbols?)
          end

          records.each do |record|
            key_tokens.each_with_index do |tokens, index|
              value = record.dig(*tokens)
              next unless value.present?

              value = value.to_time
              add_value_to_record(record: record, key: keys[index], value: value)
            end
          end

          records
        end
      end
    end
  end
end
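The node simply walks the configured keys and converts any present value with `to_time` (for strings this method comes from ActiveSupport, which the gem already relies on through Mongoid). A hypothetical before/after with `keys = ['created_at']`:

```ruby
# Made-up record; shows only the shape of the conversion.
record = { 'id' => 1, 'created_at' => '2017-03-06 10:00:00 UTC' }
# After compute_batch(records: [record]), 'created_at' holds a Time instance
# instead of the original String.
```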
data/lib/dataflow/nodes/upsert_node.rb
@@ -0,0 +1,68 @@
# frozen_string_literal: true
module Dataflow
  # Represents a node with a unique index and upsert behavior:
  # If there is any existing that that match on that index,
  # it gets replaced. If not, it simply gets added.
  module Nodes
    class UpsertNode < DataNode
      include Mixin::RenameDottedFields
      include Mixin::AddInternalTimestamp

      before_save :transform_index_key

      field :index_key, required_for_computing: true
      validates_presence_of :index_key

      def set_defaults
        super

        self.indexes ||= []
        # get rid of keys/string confusion
        self.indexes = JSON.parse(self.indexes.to_json)

        # if there is no index_key, take the first unique index
        if index_key.blank?
          first_unique_index = self.indexes.find { |x| x['unique'] }
          self.index_key = (first_unique_index || {})['key']
        end

        # add keys for the unique index keys
        if index_key.present?
          auto_generated_indexes = [{ 'key' => index_key, 'unique' => true }]

          if index_key.is_a? Array
            # generated non-unique indexes for each key in a compound index
            auto_generated_indexes += index_key.map { |idx| { 'key' => idx } }
          end
          self.indexes += auto_generated_indexes
          self.indexes.uniq!
        end

        self.updated_at ||= Time.now
      end

      def add(records:)
        return if records.blank?

        # TODO: create a chain of behavior "before add"
        rename_dotted_fields(records: records)
        add_internal_timestamp(records: records)

        db_adapter.save(records: records, replace_by: index_key)
        self.updated_at = Time.now
        save!
      end

      private

      def transform_index_key
        return unless index_key.is_a?(String)

        # try to split the comma separated string
        keys = index_key.split(',')
        # if there was no comma, leave as-is
        self.index_key = keys if keys.count > 1
      end
    end
  end
end
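`transform_index_key` turns a comma-separated `index_key` string into a compound key before saving, and `db_adapter.save(replace_by: index_key)` then replaces any stored record that matches on that key. A standalone sketch of the normalization step, with a made-up key:

```ruby
# Plain Ruby; mirrors the logic of transform_index_key above.
index_key = 'user_id,date'
keys = index_key.split(',')
index_key = keys if keys.count > 1
# index_key is now ['user_id', 'date']; on add, records matching on both
# fields replace the previously stored ones.
```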
data/lib/dataflow/properties_mixin.rb
@@ -0,0 +1,35 @@
# frozen_string_literal: true
module Dataflow
  module PropertiesMixin
    extend ActiveSupport::Concern

    module ClassMethods
      # Override the mongoid `field` method to produce a list of
      # properties for each node.
      def field(name, opts = {})
        add_property(name, opts)
        # make sure we pass mongoid-only keys to the superclass
        opts.delete(:editable)
        opts.delete(:required_for_computing)
        opts.delete(:values)
        super
      end

      def add_property(name, opts)
        # skip properties that start by underscore
        return if name =~ /^_/
        @properties ||= {}
        @properties[name] ||= {}
        @properties[name].merge!(opts)
      end

      def properties
        @properties ||= {}
        @properties.merge(superclass.properties)
      rescue NoMethodError => e
        # handle cases where we're already on top of the hierarchy.
        @properties
      end
    end
  end
end
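The overridden `field` macro records every declared field, including the extra `editable`, `required_for_computing` and `values` options, in `properties`, then strips those non-Mongoid keys before delegating to Mongoid's own `field`. A hypothetical usage sketch, assuming the gem and Mongoid are loaded and that `ComputeNode` includes this mixin (which the field options used throughout the node classes above suggest; the class name is invented):

```ruby
require 'dataflow-rb'

# Hypothetical node class; only illustrates the extended field macro.
class ExampleNode < Dataflow::Nodes::ComputeNode
  field :join_type, type: String, required_for_computing: true, values: %w(inner left)
end

ExampleNode.properties[:join_type]
# => roughly { type: String, required_for_computing: true, values: ["inner", "left"] }
```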