dataflow-rb 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0

data/lib/dataflow/nodes/join_node.rb
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    # Performs a join operation on 2 dependencies.
+    class JoinNode < ComputeNode
+      VALID_TYPES = %w(inner left).freeze
+      field :join_type, type: String, required_for_computing: true, values: VALID_TYPES, default: VALID_TYPES[0]
+      field :key1, type: String, required_for_computing: true
+      field :key2, type: String, required_for_computing: true
+      # Support joining on multiple keys by setting them in the other keys.
+      # other_keys1 and other_keys2 must match in length.
+      field :other_keys1, type: Array, default: []
+      field :other_keys2, type: Array, default: []
+      field :prefix1, type: String, default: ''
+      field :prefix2, type: String, default: ''
+
+      ensure_data_node_exists
+      ensure_dependencies exactly: 2
+
+      def valid_for_computation?
+        # We need an equal number of keys as they will be matched with each other.
+        if other_keys1.count != other_keys2.count
+          errors.add(:other_keys2, "#{self.class} other_keys2 must match other_keys1's length")
+        end
+
+        super
+      end
+
+      def required_schema
+        return {} unless dependencies.count == 2
+
+        # merge both dependencies' schemas
+        sch = dependencies.first.schema || {}
+        sch.merge(dependencies.second.schema || {})
+      end
+
+      def compute_impl
+        all_same_postgresql = db_backend == :postgresql
+        all_same_postgresql &&= dependencies[1..-1].all? do |dep|
+          dep.db_backend == :postgresql && dep.db_name == db_name
+        end
+
+        if all_same_postgresql
+          # use SQL join
+          execute_sql_join
+          self.updated_at = Time.now
+        else
+          # use software join
+          super
+        end
+      end
+
+      private
+
+      def execute_sql_join
+        fields = required_schema.keys
+        select_keys = dependencies[0].schema.keys.map { |x| "d1.#{x}" } + (dependencies[1].schema.keys - dependencies[0].schema.keys).map { |x| "d2.#{x}" }
+        query = "INSERT INTO #{write_dataset_name} (#{fields.join(',')})
+                 SELECT #{select_keys.join(', ')}
+                 FROM #{dependencies[0].read_dataset_name} as d1
+                 INNER JOIN #{dependencies[1].read_dataset_name} as d2
+                 ON d1.#{key1} = d2.#{key2}"
+        p query
+        db_adapter.client[query].to_a
+      end
+
+      def compute_batch(records:)
+        join(n1_records: records)
+      end
+
+      def join(n1_records:)
+        tokens_key1 = record_dig_tokens(key: key1, use_sym: dependencies.first.use_symbols?)
+        tokens_key2 = record_dig_tokens(key: key2, use_sym: dependencies.second.use_symbols?)
+        other_tokens_key1 = (other_keys1 || []).map do |key|
+          record_dig_tokens(key: key, use_sym: dependencies.second.use_symbols?)
+        end
+        other_tokens_key2 = (other_keys2 || []).map do |key|
+          record_dig_tokens(key: key, use_sym: dependencies.second.use_symbols?)
+        end
+
+        # fetch the necessary records from node2
+        node2 = dependencies.second
+        n2_ids = n1_records.map { |x| x.dig(*tokens_key1) }.compact.uniq
+        n2_records = node2.all(where: { key2 => n2_ids })
+
+        # preload and map dataset2 by the key we want to look up
+        mapped_data2 = {}
+        if has_multiple_keys?
+          n2_records.each do |datum2|
+            lookup_value = datum2.dig(*tokens_key2)
+            mapped_data2[lookup_value] ||= []
+            mapped_data2[lookup_value] << datum2
+          end
+        else
+          n2_records.each do |datum2|
+            lookup_value = datum2.dig(*tokens_key2)
+            mapped_data2[lookup_value] = datum2
+          end
+        end
+
+        # for each datum in dataset1, find the corresponding datum in dataset2
+        n1_records.map do |d1|
+          join_value = d1.dig(*tokens_key1)
+          next if join_value.nil?
+
+          d2 = mapped_data2[join_value]
+          if has_multiple_keys? && !d2.nil?
+            # in this case, it will be an array,
+            # so we need to further search for the correct datum
+            d2 = find_matching_record(d1, d2, other_tokens_key1, other_tokens_key2)
+          end
+
+          # if there is no d2, only continue based on the type of join we want.
+          next if d2.blank? && join_type == 'inner'
+
+          # there might be the case that nothing was found after all
+          d2 ||= {}
+
+          # prefix if needed
+          d1 = Hash[d1.map { |k, v| ["#{prefix1}#{k}", v] }] if prefix1.present?
+          d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }] if prefix2.present?
+
+          d1.reverse_merge(d2)
+        end.compact
+      end
+
+      def has_multiple_keys?
+        other_keys1.present? && other_keys2.present?
+      end
+
+      # Find a record in d2_list that can be joined with d1 based on
+      # the values of the fields specified in other_keys1/2.
+      # @param d1 [Hash] a datum
+      # @param d2_list [Array] an array of records that may match with d1
+      # @param other_tokens1 [Array] an array of arrays (tokens) that will
+      #        be used to fetch the corresponding value in d1
+      # @param other_tokens2 [Array] an array of arrays (tokens) that will
+      #        be used to fetch the corresponding value in the d2_list
+      # @return [Hash] a record if found, nil otherwise.
+      def find_matching_record(d1, d2_list, other_tokens1, other_tokens2)
+        values1 = other_tokens1.map { |tokens| d1.dig(*tokens) }
+        d2_list.find do |d2|
+          values1.each_with_index.all? do |value1, idx|
+            # does this record match d1 on all the fields in other_keys1/2?
+            value1 == d2.dig(*(other_tokens2[idx]))
+          end
+        end
+      end
+    end
+  end
+end
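Illustration (not from the package source): the sketch below shows how a JoinNode might be wired up, using only the fields declared above (join_type, key1/key2, prefix1/prefix2). The construction and lookup calls (find_by, create, dependency_ids, recompute) and all names are assumptions for illustration. When every dependency is a PostgreSQL-backed node in the same database, compute_impl delegates to an SQL INSERT ... SELECT; otherwise the in-memory join in #join is used.

    # Hypothetical wiring; only the JoinNode field names come from the class above.
    users  = Dataflow::Nodes::DataNode.find_by(name: 'users')   # assumed lookup API
    orders = Dataflow::Nodes::DataNode.find_by(name: 'orders')

    join = Dataflow::Nodes::JoinNode.create(
      name: 'users_with_orders',
      dependency_ids: [users.id, orders.id],  # exactly 2 dependencies are enforced
      join_type: 'left',                      # 'inner' (default) or 'left'
      key1: 'id',                             # join key in the first dependency
      key2: 'user_id',                        # matching key in the second dependency
      prefix2: 'order_'                       # prefix applied to the second dataset's keys
    )
    join.recompute                            # assumed ComputeNode entry point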

data/lib/dataflow/nodes/map_node.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    # Performs a map operation on 2 dependencies.
+    class MapNode < ComputeNode
+      ensure_data_node_exists
+      ensure_dependencies exactly: 2
+
+      private
+
+      def compute_batch(records:)
+        map(records: records, mapping_node: dependencies.second)
+      end
+
+      def map(records:, mapping_node:)
+        mapping_table = mapping_node.all
+
+        records.each do |record|
+          mapping_table.each { |mapping| map_record(record, mapping) }
+        end
+
+        records
+      end
+
+      def map_record(record, mapping)
+        original_key = mapping['key']
+        original_value = record_value(record: record, key: original_key)
+        mapped_key = mapping['mapped_key']
+        mapped_value = nil
+
+        if mapping['map'].present?
+          # re-map either the key/value with a lambda(key, value)
+          result = eval(mapping['map']).call(original_key, original_value)
+          mapped_key = result.keys[0]
+          mapped_value = result.values[0]
+        elsif mapping['values'].is_a? Hash
+          # or from a hash-table that directly translates values
+          mapped_value = mapping['values'][original_value]
+          mapped_value ||= mapping['default']
+        elsif mapping['values'].present?
+          # or map the current value with a lambda(value)
+          mapped_value = eval(mapping['values']).call(original_value)
+        end
+
+        mapped_key ||= original_key
+        record[mapped_key] = mapped_value || original_value
+      end
+    end
+  end
+end
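For reference, map_record above reads each record of the mapping node (the second dependency) as a plain hash. The three supported shapes, inferred directly from the branches of map_record (keys and values below are invented):

    # 1. translate values through a lookup table, with an optional default
    { 'key' => 'country', 'values' => { 'UK' => 'United Kingdom' }, 'default' => 'Unknown' }

    # 2. rewrite key and value with a lambda(key, value) given as a string;
    #    the lambda must return a one-entry hash { new_key => new_value }
    { 'key' => 'price', 'map' => "->(k, v) { { 'price_cents' => (v.to_f * 100).to_i } }" }

    # 3. keep the key but transform the value with a lambda(value)
    { 'key' => 'name', 'values' => '->(v) { v.to_s.upcase }' }

Note that the lambda forms go through eval, so mapping tables should only come from trusted sources.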

data/lib/dataflow/nodes/merge_node.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    # Performs a merge operation on 2 dependencies.
+    class MergeNode < ComputeNode
+      field :merge_key, type: String, default: ''
+      field :merge_values, type: Array, default: []
+
+      ensure_data_node_exists
+      ensure_dependencies exactly: 2
+
+      private
+
+      def compute_impl
+        process_parallel(node: dependencies.first) do |records|
+          merge_records(records: records, index: 0)
+        end
+
+        process_parallel(node: dependencies.second) do |records|
+          merge_records(records: records, index: 1)
+        end
+      end
+
+      def merge_records(records:, index:)
+        records.each do |record|
+          # add a merge key with the corresponding value if necessary
+          record[merge_key] = merge_values[index] if merge_key.present?
+        end
+        records
+      end
+    end
+  end
+end
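A quick illustration of merge_records, assuming a node configured with merge_key: 'source' and merge_values: ['a', 'b'] (hypothetical values); the merge key records which dependency each row came from:

    merge_records(records: [{ 'id' => 1 }], index: 0)  # => [{ 'id' => 1, 'source' => 'a' }]
    merge_records(records: [{ 'id' => 2 }], index: 1)  # => [{ 'id' => 2, 'source' => 'b' }]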

data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    module Mixin
+      # Add an internal updated_at timestamp to the records.
+      module AddInternalTimestamp
+        def self.included(base)
+          base.class_eval do
+            field :use_internal_timestamp, type: Boolean, default: true
+            field :internal_timestamp_key, type: String, default: '_mojaco_updated_at'
+          end
+        end
+
+        # Add an internal updated_at timestamp to the records
+        def add_internal_timestamp(records:)
+          return unless use_internal_timestamp
+          return unless internal_timestamp_key.present?
+
+          updated_at = Time.now
+          records.each do |record|
+            record[internal_timestamp_key] = updated_at
+          end
+        end
+      end # module AddInternalTimestamp
+    end # module Mixin
+  end # module Nodes
+end # module Dataflow

data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    module Mixin
+      # Supports traversing the record and renaming fields that contain a dot '.'.
+      module RenameDottedFields
+        # Add a mixin-specific field to the node
+        def self.included(base)
+          base.class_eval do
+            field :rename_dotted_fields_in, type: Array
+          end
+        end
+
+        # Rename the specified dotted fields
+        def rename_dotted_fields(records:)
+          return if rename_dotted_fields_in.blank?
+
+          traverse_whole_record = rename_dotted_fields_in.include?('.')
+
+          records.each do |record|
+            if traverse_whole_record
+              traverse_and_rename_dotted_fields(record)
+            else
+              rename_dotted_fields_in.each do |field|
+                value = record[field]
+                if value.is_a?(Array)
+                  traverse_and_rename_dotted_fields_in_array(value)
+                elsif value.is_a?(Hash)
+                  traverse_and_rename_dotted_fields(value)
+                end
+              end
+            end
+          end
+        end
+
+        # Traverse a hash and look for the fields to rename
+        def traverse_and_rename_dotted_fields(hash)
+          return if hash.blank?
+
+          hash.keys.each do |k|
+            value = hash[k]
+            if value.is_a?(Array)
+              traverse_and_rename_dotted_fields_in_array(value)
+            elsif value.is_a?(Hash)
+              traverse_and_rename_dotted_fields(value)
+            end
+
+            next unless k.include?('.')
+            hash[k.tr('.', '_')] = value
+            hash.delete(k)
+          end
+        end
+
+        # Looks for hashes in the array that may require a transformation
+        def traverse_and_rename_dotted_fields_in_array(array)
+          array.each do |v|
+            traverse_and_rename_dotted_fields(v) if v.is_a?(Hash)
+          end
+        end
+      end # module RenameDottedFields
+    end # module Mixin
+  end # module Nodes
+end # module Dataflow
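A small illustration of the in-place renaming (sketch; the record content is invented). With rename_dotted_fields_in: ['.'] the whole record is traversed and dots in keys become underscores at every nesting level, presumably so the keys remain storable in MongoDB, which rejects dotted field names:

    record = { 'user.name' => 'Ada', 'meta' => { 'created.at' => '2017-01-01' } }
    rename_dotted_fields(records: [record])
    # record is mutated in place:
    # => { 'meta' => { 'created_at' => '2017-01-01' }, 'user_name' => 'Ada' }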

data/lib/dataflow/nodes/select_keys_node.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    # Performs a select operation on its dependency.
+    class SelectKeysNode < ComputeNode
+      field :keys, type: Array, required_for_computing: true
+
+      ensure_data_node_exists
+      ensure_dependencies exactly: 1
+
+      def export(connection_opts: { db_backend: :csv }, keys: nil)
+        super(connection_opts: connection_opts, keys: keys || self.keys)
+      end
+
+      private
+
+      def compute_batch(records:)
+        k = keys
+        k = k.map(&:to_sym) if dependencies.first.use_symbols?
+        select_keys(records: records, keys: k)
+      end
+
+      def select_keys(records:, keys:)
+        records.map do |base_record|
+          new_record = {}
+          keys.each do |key|
+            value = record_value(record: base_record, key: key)
+            next unless value.present?
+
+            add_value_to_record(record: new_record, key: key, value: value)
+          end
+
+          next unless new_record.present?
+          new_record
+        end.compact
+      end
+    end
+  end
+end
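An illustration of select_keys (sketch; it assumes record_value simply reads the given key for flat, non-dotted keys and that add_value_to_record sets it on the new record): only the configured keys are kept, and records that end up empty are dropped.

    select_keys(records: [{ 'id' => 1, 'email' => 'a@b.co', 'tmp' => 42 }, { 'tmp' => 7 }],
                keys: ['id', 'email'])
    # => [{ 'id' => 1, 'email' => 'a@b.co' }]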

data/lib/dataflow/nodes/snapshot_node.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+module Dataflow
+  # Represents a node that captures changes over time.
+  module Nodes
+    # TODO: extend the unique node?
+    class SnapshotNode < DataNode
+      include Mixin::RenameDottedFields
+      include Mixin::AddInternalTimestamp
+
+      field :index_key, type: String, required_for_computing: true
+      field :updated_at_key, type: String, required_for_computing: true
+
+      validates_presence_of :index_key
+      validates_presence_of :updated_at_key
+
+      def set_defaults
+        super
+
+        self.indexes ||= []
+        # get rid of keys/string confusion
+        self.indexes = JSON.parse(self.indexes.to_json)
+
+        # add keys for the index, updated_at and unique keys
+        self.indexes += [{ 'key' => index_key }] if index_key
+        self.indexes += [{ 'key' => updated_at_key }] if updated_at_key
+        self.indexes += [{ 'key' => [index_key, updated_at_key], 'unique' => true }] if index_key && updated_at_key
+        self.indexes.uniq!
+
+        self.updated_at ||= Time.now
+      end
+
+      def add(records:)
+        # TODO: create a chain of behavior "before add"
+        rename_dotted_fields(records: records)
+        add_internal_timestamp(records: records)
+
+        records.delete_if do |record|
+          convert_update_at_key(record)
+          is_record_redundant?(record: record)
+        end.compact
+        super(records: records)
+      end
+
+      private
+
+      # If this record already exists, and only the updated_at
+      # key changed, but the rest of the content is the same,
+      # we will consider it to be redundant
+      def is_record_redundant?(record:)
+        id = record[index_key]
+        previous_record = db_adapter.find(where: { index_key => id },
+                                          sort: { updated_at_key => -1 })
+        return false if previous_record.blank?
+
+        has_same_content = previous_record.keys == record.keys
+        has_same_content &&= previous_record.keys.all? do |k|
+          # we allow the updated_at key to change, or the mojaco timestamp
+          next true if k == updated_at_key || k == internal_timestamp_key
+          # but most importantly, the rest of the content should be the same
+          record[k] == previous_record[k]
+        end
+
+        has_same_content
+      end
+
+      def convert_update_at_key(record)
+        return if record[updated_at_key].is_a?(Time)
+
+        # try to parse as a string
+        record[updated_at_key] = Time.parse(record[updated_at_key])
+      rescue TypeError
+        # try to parse as a timestamp
+        record[updated_at_key] = Time.at(record[updated_at_key])
+      end
+    end
+  end
+end
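The intent of is_record_redundant?, illustrated (sketch with invented field names; whether a given add is actually skipped also depends on the exact keys the adapter returns): a record whose content matches the latest stored snapshot except for the updated_at key and the internal timestamp is dropped instead of creating a new snapshot row.

    # assuming index_key: 'id' and updated_at_key: 'updated_at'
    node.add(records: [{ 'id' => 1, 'status' => 'open',   'updated_at' => '2017-01-01' }])
    node.add(records: [{ 'id' => 1, 'status' => 'open',   'updated_at' => '2017-01-02' }])  # intended: redundant, skipped
    node.add(records: [{ 'id' => 1, 'status' => 'closed', 'updated_at' => '2017-01-03' }])  # content changed: new snapshot kept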

data/lib/dataflow/nodes/sql_query_node.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    # Runs a custom SQL query against the node's SQL backend, replacing the <node> and <i> placeholders with dataset names.
+    class SqlQueryNode < ComputeNode
+      ensure_data_node_exists
+      ensure_dependencies min: 0 # dependencies are not necessarily needed
+      field :query, type: String, required_for_computing: true
+
+      def valid_for_computation?
+        unless (data_node&.db_backend.to_s =~ /sql/).present?
+          errors.add(:db_backend, 'Must have a SQL based backend.')
+        end
+
+        begin
+          computed_query
+        rescue StandardError => e
+          errors.add(:query, "Specified query has errors: #{e.message}")
+        end
+
+        super
+      end
+
+      def computed_query
+        # 1. replace the current write dataset's name
+        q = query.gsub('<node>', write_dataset_name)
+
+        # 2. replace the dependencies' (read) dataset names
+        q.gsub(/<[0-9]+>/) do |match|
+          # [1..-2] will remove the 'less than' < and 'greater than' >
+          dep_index = match[1..-2].to_i
+          raise "Specified dependency #{match} does not exist. There are only #{dependencies.count} dependencies." if dep_index >= dependencies.count
+          dependencies[dep_index].read_dataset_name
+        end
+      end
+
+      def execute_query
+        data_node.send(:db_adapter).client[computed_query].to_a
+      end
+
+      private
+
+      # Overrides the base implementation.
+      # This node will leave all the work to the DB.
+      def compute_impl
+        execute_query
+      end
+    end
+  end
+end
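For reference, the query field uses <node> for this node's write dataset and <i> for the i-th dependency's read dataset, as implemented in computed_query. The dataset names below are invented:

    query = 'INSERT INTO <node> SELECT user_id, SUM(amount) AS total FROM <0> GROUP BY user_id'
    # with a write dataset named 'totals' and dependency 0 reading from 'orders',
    # computed_query returns:
    #   "INSERT INTO totals SELECT user_id, SUM(amount) AS total FROM orders GROUP BY user_id"
    # referencing <1> here would raise, since only one dependency exists.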

data/lib/dataflow/nodes/transformation/to_time_node.rb
@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+module Dataflow
+  module Nodes
+    module Transformation
+      # Transforms the given keys' values to Time.
+      class ToTimeNode < ComputeNode
+        field :keys, type: Array, required_for_computing: true, default: []
+
+        ensure_data_node_exists
+        ensure_dependencies exactly: 1
+
+        def valid_for_computation?
+          # It does not make sense to use this node without any keys specified.
+          if (keys || []).count.zero?
+            errors.add(:keys, "#{self.class} keys must contain at least 1 value")
+          end
+
+          super
+        end
+
+        def compute_batch(records:)
+          key_tokens = keys.map do |key|
+            record_dig_tokens(key: key, use_sym: dependencies.first.use_symbols?)
+          end
+
+          records.each do |record|
+            key_tokens.each_with_index do |tokens, index|
+              value = record.dig(*tokens)
+              next unless value.present?
+
+              value = value.to_time
+              add_value_to_record(record: record, key: keys[index], value: value)
+            end
+          end
+
+          records
+        end
+      end
+    end
+  end
+end

data/lib/dataflow/nodes/upsert_node.rb
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+module Dataflow
+  # Represents a node with a unique index and upsert behavior:
+  # if there is any existing data that matches on that index,
+  # it gets replaced. If not, it simply gets added.
+  module Nodes
+    class UpsertNode < DataNode
+      include Mixin::RenameDottedFields
+      include Mixin::AddInternalTimestamp
+
+      before_save :transform_index_key
+
+      field :index_key, required_for_computing: true
+      validates_presence_of :index_key
+
+      def set_defaults
+        super
+
+        self.indexes ||= []
+        # get rid of keys/string confusion
+        self.indexes = JSON.parse(self.indexes.to_json)
+
+        # if there is no index_key, take the first unique index
+        if index_key.blank?
+          first_unique_index = self.indexes.find { |x| x['unique'] }
+          self.index_key = (first_unique_index || {})['key']
+        end
+
+        # add keys for the unique index keys
+        if index_key.present?
+          auto_generated_indexes = [{ 'key' => index_key, 'unique' => true }]
+
+          if index_key.is_a? Array
+            # generate non-unique indexes for each key in a compound index
+            auto_generated_indexes += index_key.map { |idx| { 'key' => idx } }
+          end
+          self.indexes += auto_generated_indexes
+          self.indexes.uniq!
+        end
+
+        self.updated_at ||= Time.now
+      end
+
+      def add(records:)
+        return if records.blank?
+
+        # TODO: create a chain of behavior "before add"
+        rename_dotted_fields(records: records)
+        add_internal_timestamp(records: records)
+
+        db_adapter.save(records: records, replace_by: index_key)
+        self.updated_at = Time.now
+        save!
+      end
+
+      private
+
+      def transform_index_key
+        return unless index_key.is_a?(String)
+
+        # try to split the comma-separated string
+        keys = index_key.split(',')
+        # if there was no comma, leave it as-is
+        self.index_key = keys if keys.count > 1
+      end
+    end
+  end
+end
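A sketch of the index_key handling above (node is a hypothetical UpsertNode instance). A comma-separated string becomes a compound key on save, set_defaults derives the indexes from it, and add writes through db_adapter.save(records: ..., replace_by: index_key), so an incoming record replaces any stored record with the same key values instead of duplicating it.

    node.index_key = 'user_id,date'
    node.save!   # before_save splits it: index_key == ['user_id', 'date']
    # indexes auto-generated per set_defaults:
    #   { 'key' => ['user_id', 'date'], 'unique' => true }, { 'key' => 'user_id' }, { 'key' => 'date' }
    node.add(records: [{ 'user_id' => 1, 'date' => '2017-01-01', 'amount' => 10 }])
    node.add(records: [{ 'user_id' => 1, 'date' => '2017-01-01', 'amount' => 12 }])  # replaces the first row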

data/lib/dataflow/properties_mixin.rb
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+module Dataflow
+  module PropertiesMixin
+    extend ActiveSupport::Concern
+
+    module ClassMethods
+      # Override the mongoid `field` method to produce a list of
+      # properties for each node.
+      def field(name, opts = {})
+        add_property(name, opts)
+        # make sure we only pass Mongoid-supported options to the superclass
+        opts.delete(:editable)
+        opts.delete(:required_for_computing)
+        opts.delete(:values)
+        super
+      end
+
+      def add_property(name, opts)
+        # skip properties that start with an underscore
+        return if name =~ /^_/
+        @properties ||= {}
+        @properties[name] ||= {}
+        @properties[name].merge!(opts)
+      end
+
+      def properties
+        @properties ||= {}
+        @properties.merge(superclass.properties)
+      rescue NoMethodError => e
+        # handle cases where we're already at the top of the hierarchy.
+        @properties
+      end
+    end
+  end
+end
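To see the interplay with Mongoid, a hedged sketch (ExampleNode and its field are invented; it assumes the including class also includes Mongoid::Document, which supplies the field method reached via super): the custom options are recorded in properties but stripped before Mongoid sees them.

    class ExampleNode
      include Mongoid::Document
      include Dataflow::PropertiesMixin

      field :index_key, type: String, required_for_computing: true, editable: false
    end

    ExampleNode.properties[:index_key]
    # => { type: String, required_for_computing: true, editable: false }
    # Mongoid itself only received { type: String }; the other options were
    # deleted from opts before `super` was called.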