dataflow-rb 0.9.0

Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
data/lib/dataflow/nodes/data_node.rb
@@ -0,0 +1,331 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     # Data nodes are used to build a data computing/transformation graph.
+     # At each step we can save the results to a (temp) table.
+     #
+     # Nodes::DataNode represents one of the data nodes.
+     # It is meant to be treated as an interface and should not be used directly.
+     class DataNode
+       include Mongoid::Document
+       include Dataflow::Node
+       include Dataflow::PropertiesMixin
+       include Dataflow::EventMixin
+       include Dataflow::SchemaMixin
+
+       event :schema_inference_started
+       event :schema_inference_progressed
+       event :schema_inference_finished
+
+       event :export_started
+       event :export_progressed
+       event :export_finished
+
+       # make sure we have only one node per db/table combination
+       index({ db_name: 1, name: 1 }, unique: true)
+
+       # The database name used by this node
+       field :db_name, type: String, editable: false
+
+       # The dataset name used by this node for storage.
+       field :name, type: String
+
+       # The schema of this node
+       field :schema, type: Hash, editable: false
+       field :inferred_schema, type: Hash, editable: false
+       field :inferred_schema_at, type: Time, editable: false
+       # How many samples were used to infer the schema
+       field :inferred_schema_from, type: Integer, editable: false
+
+       # The time when this node was last updated
+       field :updated_at, type: Time, editable: false
+
+       # One of the possible backends this node will use, e.g.: :mongodb, :csv, :mysql
+       field :db_backend, type: Symbol, editable: false, default: :mongodb
+
+       # Represents the time in seconds within which to expect an update on this node
+       field :update_expected_within, type: Integer, default: 0
+
+       # The indexes this node will implement on its dataset.
+       # Indexes should be in the following format:
+       # [
+       #   { key: 'id' },
+       #   { key: 'updated_at' },
+       #   { key: ['id', 'updated_at'], unique: true }
+       # ]
+       field :indexes, type: Array, default: []
+
+       # whether to use double buffering or not
+       field :use_double_buffering, type: Boolean, editable: false, default: false
+
+       # internal use: where to read/write from. Use 1 and 2 for legacy reasons.
+       field :read_dataset_idx, type: Integer, editable: false, default: 1
+       field :write_dataset_idx, type: Integer, editable: false, default: 2
+
+       # Necessary fields:
+       validates_presence_of :db_name
+       validates_presence_of :name
+
+       # Before create: run default initializations
+       before_create :set_defaults
+
+       # Sets the default parameters before creating the object.
+       def set_defaults
+         self.schema = schema || {}
+
+         # Use the schema as the inferred schema if none is provided.
+         # This is useful when there is no need to infer schemas (e.g. in SQL)
+         self.inferred_schema ||= schema
+       end
+
+       # Callback: after creation make sure the underlying dataset matches this node's properties.
+       after_create do
+         handle_dataset_settings_changed
+       end
+
+       # Callback: after save, make sure the underlying dataset is valid if
+       # any dataset-related property changed.
+       after_save do
+         if name_changed? || indexes_changed? || db_backend_changed?
+           handle_dataset_settings_changed
+         end
+       end
+
+       # When the dataset properties change, notify the adapter to handle the new settings.
+       def handle_dataset_settings_changed
+         db_adapter.update_settings(data_node: self)
+
+         # recreate the dataset if there is no data
+         if db_adapter.count.zero?
+           db_adapter.recreate_dataset(dataset: read_dataset_name)
+         end
+
+         db_adapter.create_indexes(dataset: read_dataset_name)
+       end
+
+       # Finds and returns a single record from the dataset, based on the given options.
+       # @param where [Hash] the condition to apply for retrieving the element.
+       #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
+       #        An empty option hash will retrieve any record.
+       # @return [Hash] a single record from the dataset.
+       def find(where: {})
+         db_adapter.find(where: where)
+       end
+
+       # Returns all the records from a dataset that match the options.
+       # @param where [Hash] the condition to apply for retrieving the elements.
+       #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
+       #        An empty option hash will retrieve any record.
+       # @param fields [Array] array of strings representing which fields to include.
+       #        e.g.: ['id', 'updated_at'] will only return these two fields.
+       # @param sort [Hash] represents the sorting of the returned dataset.
+       #        e.g. { 'id' => 1, 'updated_at' => -1 } will sort by
+       #        id ASC and by updated_at DESC.
+       # @param limit [Integer] limits the amount of records returned.
+       # @param offset [Integer] starting offset of the records returned.
+       #        Use with limit to implement pagination.
+       # @yield [db_client] When a block is passed, yields the db client on which .each
+       #        can be called to stream the results rather than load everything in memory.
+       #        Other methods can also be called depending on the backend,
+       #        the downside being back-end portability (use at your own risk).
+       def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, &block)
+         db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, &block)
+       end
+
+       # Supports paginating efficiently through the dataset.
+       # @param where [Hash] the condition to apply for retrieving the elements.
+       #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
+       #        An empty option hash will retrieve any record.
+       #        IMPORTANT: do not use the system id in the query. It will be overwritten.
+       # @param fields [Array] array of strings representing which fields to include.
+       #        e.g.: ['id', 'updated_at'] will only return these two fields.
+       # @param cursor [String] indicates from which page the results should be returned.
+       # @return [Hash] with 2 fields:
+       #         - data [Array] that contains the fetched records
+       #         - next_cursor [String] a string to pass into the subsequent
+       #           calls to fetch the next page of the data
+       def all_paginated(where: {}, fields: [], cursor: nil)
+         db_adapter.all_paginated(where: where, fields: fields, cursor: cursor)
+       end
+
+       # Returns a list of ordered (ASC) system IDs.
+       # @param batch_size [Integer] how many IDs to select per query.
+       # These can be used to process the dataset in parallel by querying on a sub-section:
+       #   queries = node.ordered_system_id_queries(batch_size: 1000)
+       #   Parallel.each(queries) do |query|
+       #     process(node.all(where: query))
+       #   end
+       def ordered_system_id_queries(batch_size:)
+         db_adapter.ordered_system_id_queries(batch_size: batch_size)
+       end
+
+       # Counts how many records match the condition, or all records if no condition is given.
+       # @return [Integer] the record count.
+       def count(where: {})
+         db_adapter.count(where: where)
+       end
+
+       # Adds the given records to the dataset and updates the updated_at time.
+       # @param records [Array] an array of the records to be added.
+       def add(records:)
+         return if records.blank?
+         db_adapter.save(records: records)
+         self.updated_at = Time.now
+         save!
+       end
+
+       # Clears the data that matches the options.
+       def clear(where: {})
+         db_adapter.delete(where: where)
+       end
+
+       # Updates this node's schema.
+       def update_schema(sch)
+         self.schema = sch
+         db_adapter.update_settings(data_node: self)
+       end
+
+       # Recreates a dataset.
+       # @param dataset_type [Symbol] selects which dataset to recreate.
+       #        Can be :read or :write.
+       def recreate_dataset(dataset_type: :read)
+         # fetch the proper dataset name
+         dataset = send("#{dataset_type}_dataset_name")
+         db_adapter.recreate_dataset(dataset: dataset)
+       end
+
+       # Applies unique indexes on the dataset.
+       # As this will be enforcing constraints, it is best applied
+       # before adding any data.
+       # @param dataset_type [Symbol] selects on which dataset to create the indexes.
+       #        Can be :read or :write.
+       def create_unique_indexes(dataset_type: :read)
+         dataset = send("#{dataset_type}_dataset_name")
+         db_adapter.create_indexes(dataset: dataset, type: :unique_only)
+       end
+
+       # Applies non-unique indexes on the dataset.
+       # For performance reasons, these indexes are best applied
+       # after adding data (especially on large import operations).
+       def create_non_unique_indexes(dataset_type: :read)
+         dataset = send("#{dataset_type}_dataset_name")
+         db_adapter.create_indexes(dataset: dataset, type: :non_unique_only)
+       end
+
+       def read_dataset_name
+         return @temporary_read_dataset if @temporary_read_dataset
+
+         if use_double_buffering
+           "#{name}_buffer#{read_dataset_idx}"
+         else
+           name
+         end
+       end
+
+       def write_dataset_name
+         if use_double_buffering
+           "#{name}_buffer#{write_dataset_idx}"
+         else
+           name
+         end
+       end
+
+       # Used to select which dataset to read from.
+       # A possible use case is to read from an old dataset name.
+       # @param dataset [String] the dataset name to read from.
+       #        It must be a valid dataset name for the current settings.
+       def read_dataset_name=(dataset)
+         return unless valid_dataset_names.include?(dataset)
+         @temporary_read_dataset = dataset
+         db_adapter.update_settings(data_node: self)
+         dataset
+       end
+
+       def swap_read_write_datasets!
+         raise Dataflow::Errors::InvalidConfigurationError, "#swap_read_write_datasets! called on \"#{self.name}\" but 'use_double_buffering' is not activated." unless use_double_buffering
+         tmp = read_dataset_idx
+         self.read_dataset_idx = write_dataset_idx
+         self.write_dataset_idx = tmp
+         db_adapter.update_settings(data_node: self)
+         save!
+       end
+
+       def import(connection_opts: {}, keys: nil)
+         importer = db_adapter(connection_opts)
+         records = importer.all
+         add(records: records)
+       end
+
+       def export(connection_opts: { db_backend: :csv }, keys: nil, where: {})
+         on_export_started(connection_opts: connection_opts, keys: keys)
+         # instantiate and export without saving anything
+         Export::ToCsvNode.new(dependency_ids: [self], query: where.to_json).compute_impl
+         on_export_finished
+       end
+
+       # Retrieves some information about this node and its usage.
+       def info(write_dataset: false)
+         dataset = write_dataset ? write_dataset_name : read_dataset_name
+         usage = db_adapter.usage(dataset: dataset)
+         {
+           name: name,
+           type: self.class.to_s,
+           dataset: dataset,
+           db_backend: db_backend,
+           updated_at: updated_at,
+           record_count: count,
+           indexes: indexes,
+           effective_indexes: usage[:effective_indexes],
+           mem_usage: usage[:memory],
+           storage_usage: usage[:storage]
+         }
+       end
+
+       def use_symbols?
+         (db_backend.to_s =~ /sql/).present?
+       end
+
+       private
+
+       def db_adapter(connection_opts = {})
+         db_backend = connection_opts[:db_backend] || self.db_backend
+
+         opts = connection_opts.deep_dup
+         opts.delete(:db_backend)
+         has_options = opts.present?
+
+         case db_backend.downcase.to_s
+         when 'mongodb'
+           return Adapters::MongoDbAdapter.new(opts) if has_options
+           @mongodb_adapter ||= Adapters::MongoDbAdapter.new(data_node: self)
+           return @mongodb_adapter
+         when 'csv'
+           return Adapters::CsvAdapter.new(opts) if has_options
+           @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
+           return @csv_adapter
+         when 'mysql'
+           opts[:adapter_type] = 'mysql2'
+           return Adapters::SqlAdapter.new(opts) if has_options
+           @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql2')
+           return @mysql_adapter
+         when 'postgresql'
+           opts[:adapter_type] = 'postgresql'
+           return Adapters::SqlAdapter.new(opts) if has_options
+           @postgresql_adapter ||= Adapters::PsqlAdapter.new(data_node: self, adapter_type: 'postgresql')
+           return @postgresql_adapter
+         end
+
+         raise Errors::NotImplementedError, "'#{db_backend}' backend is not implemented."
+       end
+
+       def valid_dataset_names
+         if use_double_buffering
+           ["#{name}_buffer1", "#{name}_buffer2"]
+         else
+           [name]
+         end
+       end
+     end # class DataNode
+   end # module Nodes
+ end # module Dataflow
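
For orientation, here is a minimal usage sketch of the DataNode API above. The database, dataset, field names and the `process` callback are illustrative, it assumes a MongoDB backend configured as in lib/config/mongoid.yml, and the string keys used on the paginated result ('data', 'next_cursor') follow the doc comment on #all_paginated but may need adjusting per adapter.

    require 'dataflow-rb'

    # Create a data node backed by MongoDB (the default db_backend).
    node = Dataflow::Nodes::DataNode.create(
      db_name: 'dataflow_example',            # illustrative database name
      name: 'users',                          # dataset (collection) name
      indexes: [{ key: 'id', unique: true }]  # applied to the underlying dataset
    )

    # Write, count, and query records.
    node.add(records: [{ 'id' => 1, 'name' => 'alice' }, { 'id' => 2, 'name' => 'bob' }])
    node.count                                # => 2
    node.find(where: { 'id' => 1 })
    node.all(fields: %w(id name), sort: { 'id' => -1 }, limit: 1)

    # Cursor-based pagination, as documented on #all_paginated.
    page = node.all_paginated(fields: %w(id name))
    loop do
      process(page['data'])                   # 'process' is a placeholder for your own handling
      break if page['next_cursor'].to_s.empty?
      page = node.all_paginated(fields: %w(id name), cursor: page['next_cursor'])
    end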
data/lib/dataflow/nodes/export/to_csv_node.rb
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Export
+       # Export a dataset to CSV
+       class ToCsvNode < ComputeNode
+         ensure_dependencies exactly: 1
+
+         # A JSON encoded query to pass along.
+         field :query, type: String, default: {}.to_json
+
+         def compute_impl
+           node = dependencies.first
+           where = JSON.parse(query)
+
+           # fetch the schema
+           sch = node.infer_partial_schema(where: where, extended: true)
+
+           # re-order the schema if needed
+           if node.respond_to? :keys
+             sch = node.keys.map { |k| [k, sch[k]] }.to_h if keys.present?
+           end
+
+           # create the dataset
+           csv_adapter = Adapters::CsvAdapter.new(data_node: node)
+           csv_adapter.set_schema(sch)
+           csv_adapter.recreate_dataset
+
+           # export in parallel
+           max_per_process = 1000
+           max_per_process = limit_per_process if limit_per_process < 0
+
+           data_count = [node.count(where: where), 1].max
+           equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
+           count_per_process = [max_per_process, equal_split_per_process].min
+
+           queries = node.ordered_system_id_queries(batch_size: count_per_process)
+
+           parallel_each(queries.each_with_index) do |query, _idx|
+             # TODO: re-enable event on_export_progressed
+             # progress = (idx / queries.count.to_f * 100).ceil
+             # on_export_progressed(pct_complete: progress)
+
+             batch = node.all(where: query.merge(where))
+             csv_adapter.save(records: batch)
+           end
+
+           # needed by the csv exporter to finalize in a single file
+           csv_adapter.on_save_finished
+         end
+       end
+     end
+   end
+ end
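
As a quick illustration of how ToCsvNode is driven, a hedged sketch based on the code above; the node name is made up, and the output location and format depend on how Adapters::CsvAdapter is configured.

    node = Dataflow::Nodes::DataNode.find_by(name: 'users')  # illustrative lookup

    # Convenience wrapper defined on DataNode: fires the export events and
    # delegates to Export::ToCsvNode internally.
    node.export(where: { 'id' => 1 })

    # Equivalent explicit form, computed without persisting the export node:
    Dataflow::Nodes::Export::ToCsvNode.new(
      dependency_ids: [node],
      query: { 'id' => 1 }.to_json
    ).compute_impl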
data/lib/dataflow/nodes/filter/drop_while_node.rb
@@ -0,0 +1,117 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Filter
+       # Builds a sequence based on a key (e.g. id), orders it (e.g. by time),
+       # and then applies the same logic as Ruby's drop_while.
+       # See: https://ruby-doc.org/core-2.4.0/Array.html#method-i-drop_while
+       class DropWhileNode < ComputeNode
+         VALID_OPS = %w(eq ne le lt ge gt).freeze
+         VALID_MODES = %w(both left right).freeze
+
+         # group by the id key
+         field :id_key, type: String, required_for_computing: true
+         # then sort by the sort_by
+         field :sort_by, type: String, required_for_computing: true
+         field :sort_asc, type: Boolean, required_for_computing: true, default: true
+
+         # then apply a drop_while on { field op value }
+         field :field, type: String, required_for_computing: true
+         field :op, type: String, required_for_computing: true, values: VALID_OPS
+         field :value, required_for_computing: true
+         field :drop_mode, type: String, required_for_computing: true, values: VALID_MODES, default: VALID_MODES[0]
+
+         ensure_data_node_exists
+         ensure_dependencies exactly: 1
+
+         def compute_impl
+           base_node = dependencies.first
+           records_count = base_node.count
+           return if records_count == 0
+
+           ids = base_node.all(fields: [id_key]) do |results|
+             results.distinct(id_key)
+           end
+           count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
+           limit = limit_per_process.to_i
+           count_per_process = [limit, count_per_process].min if limit > 0
+
+           parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
+             # ids.each_slice(count_per_process) do |ids_slice|
+             process_ids(node: base_node, ids: ids_slice)
+           end
+         end
+
+         private
+
+         def process_ids(node:, ids:)
+           records = node.all(where: { id_key => ids })
+           groups = records.group_by { |x| x[id_key] }
+
+           result = groups.flat_map do |_, group|
+             process_group(group)
+           end.compact
+
+           data_node.add(records: result)
+         end
+
+         # sort the record group and then proceed to drop the elements
+         # that satisfy the condition
+         def process_group(record_group)
+           sort_tokens = record_dig_tokens(key: sort_by, use_sym: dependencies.first.use_symbols?)
+           group = record_group.sort_by { |x| x.dig(*sort_tokens) }
+           group = group.reverse unless sort_asc
+           modes = drop_mode == 'both' ? %w(left right) : [drop_mode]
+
+           modes.each do |mode|
+             # if we want to drop on the right,
+             # reverse the array, drop on the left and reverse again
+             group = group.reverse if mode == 'right'
+             group = drop_while(group)
+             group = group.reverse if mode == 'right'
+           end
+
+           group
+         end
+
+         # apply a single drop_while on the group.
+         def drop_while(group)
+           value_tokens = record_dig_tokens(key: field, use_sym: dependencies.first.use_symbols?)
+
+           case op.to_s.downcase
+           when 'eq'
+             group.drop_while { |x| x.dig(*value_tokens) == value }
+           when 'ne'
+             group.drop_while { |x| x.dig(*value_tokens) != value }
+           when 'le'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val <= value
+             end
+           when 'lt'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val < value
+             end
+           when 'ge'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val >= value
+             end
+           when 'gt'
+             group.drop_while do |x|
+               val = x.dig(*value_tokens)
+               next true if val.nil? # drop nil values
+               val > value
+             end
+           else
+             raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
+           end
+         end
+       end
+     end
+   end
+ end
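
A configuration sketch for DropWhileNode follows. All names are invented, and it assumes the usual ComputeNode wiring (dependency_ids pointing at the input node, data_node_id pointing at the DataNode that receives the output).

    # Per 'sensor_id', order the records by 'recorded_at' and drop the leading and
    # trailing records whose 'value' equals 0 (drop_mode: 'both').
    Dataflow::Nodes::Filter::DropWhileNode.create(
      name: 'trim_idle_readings',
      dependency_ids: [readings_node],        # readings_node: an existing DataNode (assumed)
      data_node_id: trimmed_readings_node.id, # output DataNode (assumed)
      id_key: 'sensor_id',
      sort_by: 'recorded_at',
      sort_asc: true,
      field: 'value',
      op: 'eq',
      value: 0,
      drop_mode: 'both'
    )

Once created, the node is computed like any other ComputeNode and writes its result into the configured data node.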
data/lib/dataflow/nodes/filter/newest_node.rb
@@ -0,0 +1,66 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Filter
+       # Select the newest record among records with the same id key.
+       class NewestNode < ComputeNode
+         field :id_key, type: String, required_for_computing: true
+         field :date_key, type: String, required_for_computing: true
+
+         ensure_data_node_exists
+         ensure_dependencies exactly: 1
+
+         private
+
+         def ensure_keys_are_set!
+           raise Errors::InvalidConfigurationError, 'Id key must be set.' if id_key.blank?
+           raise Errors::InvalidConfigurationError, 'Date key must be set.' if date_key.blank?
+         end
+
+         def compute_impl
+           base_node = dependencies.first
+           records_count = base_node.count
+           return if records_count == 0
+
+           ids = base_node.all(fields: [id_key]) do |results|
+             results.distinct(id_key)
+           end
+           count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
+           limit = limit_per_process.to_i
+           count_per_process = [limit, count_per_process].min if limit > 0
+
+           parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
+             # ids.each_slice(count_per_process) do |ids_slice|
+             process_ids(node: base_node, ids: ids_slice)
+           end
+         end
+
+         def process_ids(node:, ids:)
+           metadata = node.all(where: { id_key => ids }, fields: [id_key, date_key])
+           groups = metadata.group_by { |x| x[id_key] }
+           newest_record_metadata = filter_by_newest(groups: groups,
+                                                     date_key: date_key)
+           records = newest_record_metadata.map do |md|
+             query = {
+               id_key => md[id_key],
+               date_key => md[date_key]
+             }
+             node.find(where: query)
+           end.compact
+
+           data_node.add(records: records)
+         end
+
+         def filter_by_newest(groups:, date_key:)
+           groups.map do |_, entries|
+             # sort by date ASC and select the newest
+             entries
+               .sort_by do |x|
+                 x[date_key].is_a?(Time) ? x[date_key] : Timeliness.parse(x[date_key])
+               end.last
+           end
+         end
+       end
+     end
+   end
+ end
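
In the same spirit, a hedged NewestNode sketch: keep only the most recent record per id, deciding recency by a date field. Names are invented; the input and output nodes are assumed to exist.

    Dataflow::Nodes::Filter::NewestNode.create(
      name: 'latest_orders',
      dependency_ids: [orders_node],          # orders_node: an existing DataNode (assumed)
      data_node_id: latest_orders_node.id,    # output DataNode (assumed)
      id_key: 'order_id',                     # records sharing this key are deduplicated
      date_key: 'updated_at'                  # the newest value of this field wins
    )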
data/lib/dataflow/nodes/filter/where_node.rb
@@ -0,0 +1,44 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Nodes
+     module Filter
+       # Select records that match the condition.
+       class WhereNode < ComputeNode
+         VALID_OPS = %w(eq ne le lt ge gt).freeze
+
+         field :key, type: String, required_for_computing: true
+         field :op, type: String, required_for_computing: true, values: VALID_OPS
+         field :value, required_for_computing: true
+
+         ensure_data_node_exists
+         ensure_dependencies exactly: 1
+
+         private
+
+         def compute_batch(records:)
+           where(records: records)
+         end
+
+         def where(records:)
+           tokens = record_dig_tokens(key: key, use_sym: dependencies.first.use_symbols?)
+           case op.to_s.downcase
+           when 'eq'
+             records.select { |x| x.dig(*tokens) == value }
+           when 'ne'
+             records.select { |x| x.dig(*tokens) != value }
+           when 'le'
+             records.select { |x| x.dig(*tokens) <= value }
+           when 'lt'
+             records.select { |x| x.dig(*tokens) < value }
+           when 'ge'
+             records.select { |x| x.dig(*tokens) >= value }
+           when 'gt'
+             records.select { |x| x.dig(*tokens) > value }
+           else
+             raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
+           end
+         end
+       end
+     end
+   end
+ end
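
Finally, a WhereNode sketch mirroring the op semantics above; again the node and field names are illustrative, and the input/output nodes are assumed to exist.

    # Keep only records whose 'status' equals 'active'.
    Dataflow::Nodes::Filter::WhereNode.create(
      name: 'active_users',
      dependency_ids: [users_node],           # users_node: an existing DataNode (assumed)
      data_node_id: active_users_node.id,     # output DataNode (assumed)
      key: 'status',
      op: 'eq',
      value: 'active'
    )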