dataflow-rb 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
data/lib/dataflow/nodes/data_node.rb
@@ -0,0 +1,331 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     # Data nodes are used to build a data computing/transformation graph.
+     # At each step we can save the results to a (temp) table.
+     #
+     # Nodes::DataNode represents one of the data nodes.
+     # It is meant to be treated as an interface and should not be used directly.
+     class DataNode
+       include Mongoid::Document
+       include Dataflow::Node
+       include Dataflow::PropertiesMixin
+       include Dataflow::EventMixin
+       include Dataflow::SchemaMixin
+
+       event :schema_inference_started
+       event :schema_inference_progressed
+       event :schema_inference_finished
+
+       event :export_started
+       event :export_progressed
+       event :export_finished
+
+       # make sure we have only one node per db/table combination
+       index({ db_name: 1, name: 1 }, unique: true)
+
+       # The database name used by this node
+       field :db_name, type: String, editable: false
+
+       # The dataset name used by this node for storage.
+       field :name, type: String
+
+       # The schema of this node
+       field :schema, type: Hash, editable: false
+       field :inferred_schema, type: Hash, editable: false
+       field :inferred_schema_at, type: Time, editable: false
+       # How many samples were used to infer the schema
+       field :inferred_schema_from, type: Integer, editable: false
+
+       # The time when this node was last updated
+       field :updated_at, type: Time, editable: false
+
+       # One of the possible backends this node will use, e.g.: :mongodb, :csv, :mysql
+       field :db_backend, type: Symbol, editable: false, default: :mongodb
+
+       # Represents the time in seconds within which to expect an update on this node
+       field :update_expected_within, type: Integer, default: 0
+
+       # The indexes this node will implement on its dataset.
+       # Indexes should be in the following format:
+       # [
+       #   { key: 'id' },
+       #   { key: 'updated_at' },
+       #   { key: ['id', 'updated_at'], unique: true }
+       # ]
+       field :indexes, type: Array, default: []
+
+       # whether to use double buffering or not
+       field :use_double_buffering, type: Boolean, editable: false, default: false
+
+       # internal use: where to read/write from. Use 1 and 2 for legacy reasons.
+       field :read_dataset_idx, type: Integer, editable: false, default: 1
+       field :write_dataset_idx, type: Integer, editable: false, default: 2
+
+       # Necessary fields:
+       validates_presence_of :db_name
+       validates_presence_of :name
+
+       # Before create: run default initializations
+       before_create :set_defaults
+
+       # Sets the default parameters before creating the object.
+       def set_defaults
+         self.schema = schema || {}
+
+         # Use the schema as the inferred schema if none is provided.
+         # This is useful when there is no need to infer schemas (e.g. in SQL)
+         self.inferred_schema ||= schema
+       end
+
+       # Callback: after creation make sure the underlying dataset matches this node's properties.
+       after_create do
+         handle_dataset_settings_changed
+       end
+
+       # Callback: after save, make sure the underlying dataset is valid if
+       # any dataset-related property changed.
+       after_save do
+         if name_changed? || indexes_changed? || db_backend_changed?
+           handle_dataset_settings_changed
+         end
+       end
+
+       # When the dataset properties change, notify the adapter to handle the new settings.
+       def handle_dataset_settings_changed
+         db_adapter.update_settings(data_node: self)
+
+         # recreate the dataset if there is no data
+         if db_adapter.count.zero?
+           db_adapter.recreate_dataset(dataset: read_dataset_name)
+         end
+
+         db_adapter.create_indexes(dataset: read_dataset_name)
+       end
+
+       # Finds and returns a single record from the dataset, based on the given options.
+       # @param where [Hash] the condition to apply for retrieving the element.
+       #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
+       #        An empty option hash will retrieve any record.
+       # @return [Hash] returns a single record from the dataset.
+       def find(where: {})
+         db_adapter.find(where: where)
+       end
+
+       # Returns all the records from a dataset that match the options.
+       # @param where [Hash] the condition to apply for retrieving the element.
+       #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
+       #        An empty option hash will retrieve any record.
+       # @param fields [Array] Array of strings representing which fields to include.
+       #        e.g.: ['id', 'updated_at'] will only return these two fields.
+       # @param sort [Hash] represents the sorting of the returned dataset.
+       #        e.g. { 'id' => 1, 'updated_at' => -1 } will sort by
+       #        id ASC and by updated_at DESC.
+       # @param limit [Integer] limits the amount of records returned.
+       # @param offset [Integer] starting offset of the records returned.
+       #        Use with limit to implement pagination.
+       # @yield [db_client] When a block is passed, yields the db client on which .each
+       #        can be called to stream the results rather than load everything in memory.
+       #        Other methods can also be called depending on the backend,
+       #        the downside being back-end portability (use at your own risk).
+       def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, &block)
+         db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, &block)
+       end
+
+       # Supports paginating efficiently through the dataset.
+       # @param where [Hash] the condition to apply for retrieving the element.
+       #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
+       #        An empty option hash will retrieve any record.
+       #        IMPORTANT: do not use the system id in the query. It will be overwritten.
+       # @param fields [Array] Array of strings representing which fields to include.
+       #        e.g.: ['id', 'updated_at'] will only return these two fields.
+       # @param limit [Integer] limits the amount of records returned.
+       # @param cursor [String] indicates from which page the results should be returned.
+       # @return [Hash] with 2 fields:
+       #         - data [Array] that contains the fetched records
+       #         - next_cursor [String] a string to pass into the subsequent
+       #           calls to fetch the next page of the data
+       def all_paginated(where: {}, fields: [], cursor: nil)
+         db_adapter.all_paginated(where: where, fields: fields, cursor: cursor)
+       end
+
+       # Returns a list of ordered (ASC) system IDs.
+       # @param batch_size [Integer] how many IDs to select per query.
+       # These can be used to process the dataset in parallel by querying on a sub-section:
+       #   queries = node.ordered_system_id_queries
+       #   Parallel.each(queries) do |query|
+       #     process(node.all(where: query))
+       #   end
+       def ordered_system_id_queries(batch_size:)
+         db_adapter.ordered_system_id_queries(batch_size: batch_size)
+       end
+
+       # Counts how many records match the condition, or all records if no condition is given.
+       # @return [Integer] the record count.
+       def count(where: {})
+         db_adapter.count(where: where)
+       end
+
+       # Adds the given records to the dataset and updates the updated_at time.
+       # @param records [Array] an array of the records to be added.
+       def add(records:)
+         return if records.blank?
+         db_adapter.save(records: records)
+         self.updated_at = Time.now
+         save!
+       end
+
+       # Clears the data that matches the options.
+       def clear(where: {})
+         db_adapter.delete(where: where)
+       end
+
+       # Update this node's schema.
+       def update_schema(sch)
+         self.schema = sch
+         db_adapter.update_settings(data_node: self)
+       end
+
+       # Recreates a dataset.
+       # @param dataset_type [Symbol] select which dataset to recreate.
+       #        Can be :read or :write.
+       def recreate_dataset(dataset_type: :read)
+         # fetch the proper dataset name
+         dataset = send("#{dataset_type}_dataset_name")
+         db_adapter.recreate_dataset(dataset: dataset)
+       end
+
+       # Applies unique indexes on the dataset.
+       # As this will be enforcing constraints, it is best applied
+       # before adding any data.
+       # @param dataset_type [Symbol] select which dataset to recreate.
+       #        Can be :read or :write.
+       def create_unique_indexes(dataset_type: :read)
+         dataset = send("#{dataset_type}_dataset_name")
+         db_adapter.create_indexes(dataset: dataset, type: :unique_only)
+       end
+
+       # Applies non-unique indexes on the dataset.
+       # For performance reasons, these indexes are best applied
+       # after adding data (especially on large import operations).
+       def create_non_unique_indexes(dataset_type: :read)
+         dataset = send("#{dataset_type}_dataset_name")
+         db_adapter.create_indexes(dataset: dataset, type: :non_unique_only)
+       end
+
+       def read_dataset_name
+         return @temporary_read_dataset if @temporary_read_dataset
+
+         if use_double_buffering
+           "#{name}_buffer#{read_dataset_idx}"
+         else
+           name
+         end
+       end
+
+       def write_dataset_name
+         if use_double_buffering
+           "#{name}_buffer#{write_dataset_idx}"
+         else
+           name
+         end
+       end
+
+       # Use to select from which dataset you want to read.
+       # A possible use case is to read from an old dataset name.
+       # @param dataset [String] the dataset name to read from.
+       #        It must be a valid dataset name for the current settings.
+       def read_dataset_name=(dataset)
+         return unless valid_dataset_names.include?(dataset)
+         @temporary_read_dataset = dataset
+         db_adapter.update_settings(data_node: self)
+         dataset
+       end
+
+       def swap_read_write_datasets!
+         raise Dataflow::Errors::InvalidConfigurationError, "#swap_read_write_datasets! called on \"#{name}\" but \"use_double_buffering\" is not activated." unless use_double_buffering
+         tmp = read_dataset_idx
+         self.read_dataset_idx = write_dataset_idx
+         self.write_dataset_idx = tmp
+         db_adapter.update_settings(data_node: self)
+         save!
+       end
+
+       def import(connection_opts: {}, keys: nil)
+         importer = db_adapter(connection_opts)
+         records = importer.all
+         add(records: records)
+       end
+
+       def export(connection_opts: { db_backend: :csv }, keys: nil, where: {})
+         on_export_started(connection_opts: connection_opts, keys: keys)
+         # instantiate and export without saving anything
+         Export::ToCsvNode.new(dependency_ids: [self], query: where.to_json).compute_impl
+         on_export_finished
+       end
+
+       # Retrieves some information about this node and its usage.
+       def info(write_dataset: false)
+         dataset = write_dataset ? write_dataset_name : read_dataset_name
+         usage = db_adapter.usage(dataset: dataset)
+         {
+           name: name,
+           type: self.class.to_s,
+           dataset: dataset,
+           db_backend: db_backend,
+           updated_at: updated_at,
+           record_count: count,
+           indexes: indexes,
+           effective_indexes: usage[:effective_indexes],
+           mem_usage: usage[:memory],
+           storage_usage: usage[:storage]
+         }
+       end
+
+       def use_symbols?
+         (db_backend.to_s =~ /sql/).present?
+       end
+
+       private
+
+       def db_adapter(connection_opts = {})
+         db_backend = connection_opts[:db_backend] || self.db_backend
+
+         opts = connection_opts.deep_dup
+         opts.delete(:db_backend)
+         has_options = opts.present?
+
+         case db_backend.downcase.to_s
+         when 'mongodb'
+           return Adapters::MongoDbAdapter.new(opts) if has_options
+           @mongodb_adapter ||= Adapters::MongoDbAdapter.new(data_node: self)
+           return @mongodb_adapter
+         when 'csv'
+           return Adapters::CsvAdapter.new(opts) if has_options
+           @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
+           return @csv_adapter
+         when 'mysql'
+           opts[:adapter_type] = 'mysql2'
+           return Adapters::SqlAdapter.new(opts) if has_options
+           @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql2')
+           return @mysql_adapter
+         when 'postgresql'
+           opts[:adapter_type] = 'postgresql'
+           return Adapters::SqlAdapter.new(opts) if has_options
+           @postgresql_adapter ||= Adapters::PsqlAdapter.new(data_node: self, adapter_type: 'postgresql')
+           return @postgresql_adapter
+         end
+
+         raise Errors::NotImplementedError, "'#{db_backend}' backend is not implemented."
+       end
+
+       def valid_dataset_names
+         if use_double_buffering
+           ["#{name}_buffer1", "#{name}_buffer2"]
+         else
+           [name]
+         end
+       end
+     end # class DataNode
+   end # module Nodes
+ end # module Dataflow
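
For orientation, here is a minimal usage sketch of the DataNode API added above. The database name, dataset name, and records are illustrative assumptions, not values taken from the gem; the calls themselves (create, add, find, all, count, ordered_system_id_queries) are the ones documented in the diff.

# Illustrative sketch only: db/dataset names and records are made up.
node = Dataflow::Nodes::DataNode.create(
  db_name: 'dataflow_example',   # assumed database name
  name: 'users',                 # dataset name used for storage
  db_backend: :mongodb,          # e.g. :mongodb, :csv, :mysql, :postgresql
  indexes: [
    { key: 'id', unique: true },
    { key: ['id', 'updated_at'] }
  ]
)

node.add(records: [{ 'id' => 1, 'updated_at' => Time.now }])
node.count                                  # => 1
node.find(where: { 'id' => 1 })
node.all(fields: ['id'], sort: { 'id' => 1 }, limit: 10)

# Process the dataset in parallel batches of system ids,
# as suggested by the ordered_system_id_queries documentation:
queries = node.ordered_system_id_queries(batch_size: 1000)
Parallel.each(queries) do |query|
  node.all(where: query) # handle one batch of records here
end
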
data/lib/dataflow/nodes/export/to_csv_node.rb
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Export
+       # Export a dataset to CSV
+       class ToCsvNode < ComputeNode
+         ensure_dependencies exactly: 1
+
+         # A JSON encoded query to pass along.
+         field :query, type: String, default: {}.to_json
+
+         def compute_impl
+           node = dependencies.first
+           where = JSON.parse(query)
+
+           # fetch the schema
+           sch = node.infer_partial_schema(where: where, extended: true)
+
+           # re-order the schema if needed
+           if node.respond_to? :keys
+             sch = node.keys.map { |k| [k, sch[k]] }.to_h if keys.present?
+           end
+
+           # create the dataset
+           csv_adapter = Adapters::CsvAdapter.new(data_node: node)
+           csv_adapter.set_schema(sch)
+           csv_adapter.recreate_dataset
+
+           # export in parallel
+           max_per_process = 1000
+           max_per_process = limit_per_process if limit_per_process < 0
+
+           data_count = [node.count(where: where), 1].max
+           equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
+           count_per_process = [max_per_process, equal_split_per_process].min
+
+           queries = node.ordered_system_id_queries(batch_size: count_per_process)
+
+           parallel_each(queries.each_with_index) do |query, _idx|
+             # TODO: re-enable the on_export_progressed event
+             # progress = (idx / queries.count.to_f * 100).ceil
+             # on_export_progressed(pct_complete: progress)
+
+             batch = node.all(where: query.merge(where))
+             csv_adapter.save(records: batch)
+           end
+
+           # needed by the csv exporter to finalize in a single file
+           csv_adapter.on_save_finished
+         end
+       end
+     end
+   end
+ end
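
To put numbers on the batching above (values assumed): with the default max_per_process of 1000, 10,000 matching records and 4 processors, equal_split_per_process is (10_000 / 4.0).ceil = 2500, so count_per_process becomes [1000, 2500].min = 1000 and the export runs in batches of 1000 records. A minimal instantiation sketch, mirroring how DataNode#export drives this node; the users_node variable and the query are assumptions for illustration.

# Sketch only: 'users_node' and the query are illustrative.
export = Dataflow::Nodes::Export::ToCsvNode.new(
  dependency_ids: [users_node],
  query: { 'updated_at' => { '$gte' => '2017-01-01' } }.to_json
)
export.compute_impl   # exports the matching records to CSV
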
data/lib/dataflow/nodes/filter/drop_while_node.rb
@@ -0,0 +1,117 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Filter
+       # Makes a sequence based on a key (e.g. id), orders it (e.g. by time),
+       # and then applies the same logic as ruby's drop_while.
+       # See: https://ruby-doc.org/core-2.4.0/Array.html#method-i-drop_while
+       class DropWhileNode < ComputeNode
+         VALID_OPS = %w(eq ne le lt ge gt).freeze
+         VALID_MODES = %w(both left right).freeze
+
+         # group by the id key
+         field :id_key, type: String, required_for_computing: true
+         # then sort by the sort_by
+         field :sort_by, type: String, required_for_computing: true
+         field :sort_asc, type: Boolean, required_for_computing: true, default: true
+
+         # then apply a drop_while on { field op value }
+         field :field, type: String, required_for_computing: true
+         field :op, type: String, required_for_computing: true, values: VALID_OPS
+         field :value, required_for_computing: true
+         field :drop_mode, type: String, required_for_computing: true, values: VALID_MODES, default: VALID_MODES[0]
+
+         ensure_data_node_exists
+         ensure_dependencies exactly: 1
+
+         def compute_impl
+           base_node = dependencies.first
+           records_count = base_node.count
+           return if records_count == 0
+
+           ids = base_node.all(fields: [id_key]) do |results|
+             results.distinct(id_key)
+           end
+           count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
+           limit = limit_per_process.to_i
+           count_per_process = [limit, count_per_process].min if limit > 0
+
+           parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
+             # ids.each_slice(count_per_process) do |ids_slice|
+             process_ids(node: base_node, ids: ids_slice)
+           end
+         end
+
+         private
+
+         def process_ids(node:, ids:)
+           records = node.all(where: { id_key => ids })
+           groups = records.group_by { |x| x[id_key] }
+
+           result = groups.flat_map do |_, group|
+             process_group(group)
+           end.compact
+
+           data_node.add(records: result)
+         end
+
+         # sort the record group and then proceed to drop the elements
+         # that satisfy the condition
+         def process_group(record_group)
+           sort_tokens = record_dig_tokens(key: sort_by, use_sym: dependencies.first.use_symbols?)
+           group = record_group.sort_by { |x| x.dig(*sort_tokens) }
+           group = group.reverse unless sort_asc
+           modes = drop_mode == 'both' ? %w(left right) : [drop_mode]
+
+           modes.each do |mode|
+             # if we want to drop on the right,
+             # reverse the array, drop on the left and reverse again
+             group = group.reverse if mode == 'right'
+             group = drop_while(group)
+             group = group.reverse if mode == 'right'
+           end
+
+           group
+         end
+
+         # apply a single drop_while on the group.
+         def drop_while(group)
+           value_tokens = record_dig_tokens(key: field, use_sym: dependencies.first.use_symbols?)
+
+           case op.to_s.downcase
+           when 'eq'
+             group.drop_while { |x| x.dig(*value_tokens) == value }
+           when 'ne'
+             group.drop_while { |x| x.dig(*value_tokens) != value }
+           when 'le'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val <= value
+             end
+           when 'lt'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val < value
+             end
+           when 'ge'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val >= value
+             end
+           when 'gt'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val > value
+             end
+           else
+             raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
+           end
+         end
+       end
+     end
+   end
+ end
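
A small worked example of the drop semantics above, using plain Array#drop_while (keys and values assumed): with field 'status', op 'eq' and value 'inactive', a single 'left' pass drops the leading records that match and stops at the first one that does not.

# Illustrative group, already sorted by sort_by:
group = [
  { 'id' => 1, 'status' => 'inactive' },
  { 'id' => 1, 'status' => 'inactive' },
  { 'id' => 1, 'status' => 'active' },
  { 'id' => 1, 'status' => 'inactive' }
]
group.drop_while { |x| x['status'] == 'inactive' }
# => [{ 'id' => 1, 'status' => 'active' }, { 'id' => 1, 'status' => 'inactive' }]
# With drop_mode 'both', the same pass is also applied from the right,
# which would additionally drop the trailing 'inactive' record.
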
data/lib/dataflow/nodes/filter/newest_node.rb
@@ -0,0 +1,66 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Filter
+       # Select the newest record among records with the same id key.
+       class NewestNode < ComputeNode
+         field :id_key, type: String, required_for_computing: true
+         field :date_key, type: String, required_for_computing: true
+
+         ensure_data_node_exists
+         ensure_dependencies exactly: 1
+
+         private
+
+         def ensure_keys_are_set!
+           raise Errors::InvalidConfigurationError, 'Id key must be set.' if id_key.blank?
+           raise Errors::InvalidConfigurationError, 'Date key must be set.' if date_key.blank?
+         end
+
+         def compute_impl
+           base_node = dependencies.first
+           records_count = base_node.count
+           return if records_count == 0
+
+           ids = base_node.all(fields: [id_key]) do |results|
+             results.distinct(id_key)
+           end
+           count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
+           limit = limit_per_process.to_i
+           count_per_process = [limit, count_per_process].min if limit > 0
+
+           parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
+             # ids.each_slice(count_per_process) do |ids_slice|
+             process_ids(node: base_node, ids: ids_slice)
+           end
+         end
+
+         def process_ids(node:, ids:)
+           metatata = node.all(where: { id_key => ids }, fields: [id_key, date_key])
+           groups = metatata.group_by { |x| x[id_key] }
+           newest_record_metadata = filter_by_newest(groups: groups,
+                                                     date_key: date_key)
+           records = newest_record_metadata.map do |metadata|
+             query = {
+               id_key => metadata[id_key],
+               date_key => metadata[date_key]
+             }
+             node.find(where: query)
+           end.compact
+
+           data_node.add(records: records)
+         end
+
+         def filter_by_newest(groups:, date_key:)
+           groups.map do |_, entries|
+             # sort by date ASC and select the newest
+             entries
+               .sort_by do |x|
+                 x[date_key].is_a?(Time) ? x[date_key] : Timeliness.parse(x[date_key])
+               end.last
+           end
+         end
+       end
+     end
+   end
+ end
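
A configuration sketch for this filter. The id_key/date_key fields and the single dependency come from the code above; the node names, the data_node_id wiring and the recompute call are assumptions about the surrounding ComputeNode interface, which is not part of this diff.

# Sketch only: keep the newest record per 'id', judged by 'updated_at'.
newest = Dataflow::Nodes::Filter::NewestNode.create(
  name: 'users_newest',               # assumed node name
  id_key: 'id',
  date_key: 'updated_at',
  dependency_ids: [users_node],       # the DataNode holding the raw records
  data_node_id: latest_users_node.id  # assumed: DataNode receiving the output
)
newest.recompute                      # assumed ComputeNode entry point
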
data/lib/dataflow/nodes/filter/where_node.rb
@@ -0,0 +1,44 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Filter
+       # Select records that match the condition.
+       class WhereNode < ComputeNode
+         VALID_OPS = %w(eq ne le lt ge gt).freeze
+
+         field :key, type: String, required_for_computing: true
+         field :op, type: String, required_for_computing: true, values: VALID_OPS
+         field :value, required_for_computing: true
+
+         ensure_data_node_exists
+         ensure_dependencies exactly: 1
+
+         private
+
+         def compute_batch(records:)
+           where(records: records)
+         end
+
+         def where(records:)
+           tokens = record_dig_tokens(key: key, use_sym: dependencies.first.use_symbols?)
+           case op.to_s.downcase
+           when 'eq'
+             records.select { |x| x.dig(*tokens) == value }
+           when 'ne'
+             records.select { |x| x.dig(*tokens) != value }
+           when 'le'
+             records.select { |x| x.dig(*tokens) <= value }
+           when 'lt'
+             records.select { |x| x.dig(*tokens) < value }
+           when 'ge'
+             records.select { |x| x.dig(*tokens) >= value }
+           when 'gt'
+             records.select { |x| x.dig(*tokens) > value }
+           else
+             raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
+           end
+         end
+       end
+     end
+   end
+ end
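
Since compute_batch receives the dependency's records batch by batch, each pass reduces to a plain select over the configured key/op/value. A tiny illustration with op 'ge' (records and key assumed):

records = [{ 'score' => 10 }, { 'score' => 3 }, { 'score' => 7 }]
records.select { |x| x['score'] >= 5 }
# => [{ 'score' => 10 }, { 'score' => 7 }]
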