dataflow-rb 0.9.0
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0
data/lib/dataflow/nodes/data_node.rb (new file, +331 lines):

# frozen_string_literal: true
module Dataflow
  module Nodes
    # Data nodes are used to build a data computing/transformation graph.
    # At each step we can save the results to a (temp) table.
    #
    # Nodes::DataNode represents one of the data nodes.
    # It is meant to be treated as an interface and should not be used directly.
    class DataNode
      include Mongoid::Document
      include Dataflow::Node
      include Dataflow::PropertiesMixin
      include Dataflow::EventMixin
      include Dataflow::SchemaMixin

      event :schema_inference_started
      event :schema_inference_progressed
      event :schema_inference_finished

      event :export_started
      event :export_progressed
      event :export_finished

      # make sure we have only one node per db/table combination
      index({ db_name: 1, name: 1 }, unique: true)

      # The database name used by this node
      field :db_name, type: String, editable: false

      # The dataset name used by this node for storage.
      field :name, type: String

      # The schema of this node
      field :schema, type: Hash, editable: false
      field :inferred_schema, type: Hash, editable: false
      field :inferred_schema_at, type: Time, editable: false
      # How many samples were used to infer the schema
      field :inferred_schema_from, type: Integer, editable: false

      # The time when this node was last updated
      field :updated_at, type: Time, editable: false

      # One of the possible backends this node will use, e.g.: :mongodb, :csv, :mysql
      field :db_backend, type: Symbol, editable: false, default: :mongodb

      # Represents the time in seconds within which to expect an update on this node
      field :update_expected_within, type: Integer, default: 0

      # The indexes this node will implement on its dataset.
      # Indexes should be in the following format:
      # [
      #   { key: 'id' },
      #   { key: 'updated_at' },
      #   { key: ['id', 'updated_at'], unique: true }
      # ]
      field :indexes, type: Array, default: []

      # whether to use double buffering or not
      field :use_double_buffering, type: Boolean, editable: false, default: false

      # internal use: where to read/write from. Uses 1 and 2 for legacy reasons.
      field :read_dataset_idx, type: Integer, editable: false, default: 1
      field :write_dataset_idx, type: Integer, editable: false, default: 2

      # Necessary fields:
      validates_presence_of :db_name
      validates_presence_of :name

      # Before create: run default initializations
      before_create :set_defaults

      # Sets the default parameters before creating the object.
      def set_defaults
        self.schema = schema || {}

        # Use the schema as the inferred schema if none is provided.
        # This is useful when there is no need to infer schemas (e.g. in SQL).
        self.inferred_schema ||= schema
      end

      # Callback: after creation, make sure the underlying dataset matches this node's properties.
      after_create do
        handle_dataset_settings_changed
      end

      # Callback: after save, make sure the underlying dataset is valid if
      # any dataset-related property changed.
      after_save do
        if name_changed? || indexes_changed? || db_backend_changed?
          handle_dataset_settings_changed
        end
      end

      # When the dataset properties change, notify the adapter to handle the new settings.
      def handle_dataset_settings_changed
        db_adapter.update_settings(data_node: self)

        # recreate the dataset if there is no data
        if db_adapter.count.zero?
          db_adapter.recreate_dataset(dataset: read_dataset_name)
        end

        db_adapter.create_indexes(dataset: read_dataset_name)
      end

      # Finds and returns a single record from the dataset, based on the given options.
      # @param where [Hash] the condition to apply for retrieving the element.
      #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
      #        An empty option hash will retrieve any record.
      # @return [Hash] a single record from the dataset.
      def find(where: {})
        db_adapter.find(where: where)
      end

      # Returns all the records from a dataset that match the options.
      # @param where [Hash] the condition to apply for retrieving the elements.
      #        e.g.: { 'id' => 1 } will fetch records with the id 1.
      #        An empty option hash will retrieve any record.
      # @param fields [Array] array of strings representing which fields to include.
      #        e.g.: ['id', 'updated_at'] will only return these two fields.
      # @param sort [Hash] represents the sorting of the returned dataset.
      #        e.g.: { 'id' => 1, 'updated_at' => -1 } will sort by
      #        id ASC and by updated_at DESC.
      # @param limit [Integer] limits the number of records returned.
      # @param offset [Integer] starting offset of the records returned.
      #        Use with limit to implement pagination.
      # @yield [db_client] When a block is passed, yields the db client on which .each
      #        can be called to stream the results rather than load everything in memory.
      #        Other methods can also be called depending on the backend,
      #        the downside being back-end portability (use at your own risk).
      def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, &block)
        db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, &block)
      end

      # Supports paginating efficiently through the dataset.
      # @param where [Hash] the condition to apply for retrieving the elements.
      #        e.g.: { 'id' => 1 } will fetch records with the id 1.
      #        An empty option hash will retrieve any record.
      #        IMPORTANT: do not use the system id in the query. It will be overwritten.
      # @param fields [Array] array of strings representing which fields to include.
      #        e.g.: ['id', 'updated_at'] will only return these two fields.
      # @param cursor [String] indicates from which page the results should be returned.
      # @return [Hash] with 2 fields:
      #         - data [Array] that contains the fetched records
      #         - next_cursor [String] a string to pass into the subsequent
      #           calls to fetch the next page of the data
      def all_paginated(where: {}, fields: [], cursor: nil)
        db_adapter.all_paginated(where: where, fields: fields, cursor: cursor)
      end

      # Returns a list of ordered (ASC) system IDs.
      # @param batch_size [Integer] how many IDs to select per query.
      # These can be used to process the dataset in parallel by querying on a sub-section:
      #   queries = node.ordered_system_id_queries(batch_size: 1000)
      #   Parallel.each(queries) do |query|
      #     process(node.all(where: query))
      #   end
      def ordered_system_id_queries(batch_size:)
        db_adapter.ordered_system_id_queries(batch_size: batch_size)
      end

      # Counts how many records match the condition, or all records if no condition is given.
      # @return [Integer] the record count.
      def count(where: {})
        db_adapter.count(where: where)
      end

      # Adds the given records to the dataset and updates the updated_at time.
      # @param records [Array] an array of the records to be added.
      def add(records:)
        return if records.blank?
        db_adapter.save(records: records)
        self.updated_at = Time.now
        save!
      end

      # Clears the data that matches the options.
      def clear(where: {})
        db_adapter.delete(where: where)
      end

      # Updates this node's schema.
      def update_schema(sch)
        self.schema = sch
        db_adapter.update_settings(data_node: self)
      end

      # Recreates a dataset.
      # @param dataset_type [Symbol] selects which dataset to recreate.
      #        Can be :read or :write.
      def recreate_dataset(dataset_type: :read)
        # fetch the proper dataset name
        dataset = send("#{dataset_type}_dataset_name")
        db_adapter.recreate_dataset(dataset: dataset)
      end

      # Applies unique indexes on the dataset.
      # As this will be enforcing constraints, it is best applied
      # before adding any data.
      # @param dataset_type [Symbol] selects which dataset to index.
      #        Can be :read or :write.
      def create_unique_indexes(dataset_type: :read)
        dataset = send("#{dataset_type}_dataset_name")
        db_adapter.create_indexes(dataset: dataset, type: :unique_only)
      end

      # Applies non-unique indexes on the dataset.
      # For performance reasons, these indexes are best applied
      # after adding data (especially on large import operations).
      def create_non_unique_indexes(dataset_type: :read)
        dataset = send("#{dataset_type}_dataset_name")
        db_adapter.create_indexes(dataset: dataset, type: :non_unique_only)
      end

      def read_dataset_name
        return @temporary_read_dataset if @temporary_read_dataset

        if use_double_buffering
          "#{name}_buffer#{read_dataset_idx}"
        else
          name
        end
      end

      def write_dataset_name
        if use_double_buffering
          "#{name}_buffer#{write_dataset_idx}"
        else
          name
        end
      end

      # Use to select from which dataset you want to read.
      # A possible use case is to read from an old dataset name.
      # @param dataset [String] the dataset name to read from.
      #        It must be a valid dataset name for the current settings.
      def read_dataset_name=(dataset)
        return unless valid_dataset_names.include?(dataset)
        @temporary_read_dataset = dataset
        db_adapter.update_settings(data_node: self)
        dataset
      end

      def swap_read_write_datasets!
        raise Dataflow::Errors::InvalidConfigurationError, "#swap_read_write_datasets! called on \"#{name}\" but \"use_double_buffering\" is not activated." unless use_double_buffering
        tmp = read_dataset_idx
        self.read_dataset_idx = write_dataset_idx
        self.write_dataset_idx = tmp
        db_adapter.update_settings(data_node: self)
        save!
      end

      def import(connection_opts: {}, keys: nil)
        importer = db_adapter(connection_opts)
        records = importer.all
        add(records: records)
      end

      def export(connection_opts: { db_backend: :csv }, keys: nil, where: {})
        on_export_started(connection_opts: connection_opts, keys: keys)
        # instantiate and export without saving anything
        Export::ToCsvNode.new(dependency_ids: [self], query: where.to_json).compute_impl
        on_export_finished
      end

      # Retrieves some information about this node and its usage.
      def info(write_dataset: false)
        dataset = write_dataset ? write_dataset_name : read_dataset_name
        usage = db_adapter.usage(dataset: dataset)
        {
          name: name,
          type: self.class.to_s,
          dataset: dataset,
          db_backend: db_backend,
          updated_at: updated_at,
          record_count: count,
          indexes: indexes,
          effective_indexes: usage[:effective_indexes],
          mem_usage: usage[:memory],
          storage_usage: usage[:storage]
        }
      end

      def use_symbols?
        (db_backend.to_s =~ /sql/).present?
      end

      private

      def db_adapter(connection_opts = {})
        db_backend = connection_opts[:db_backend] || self.db_backend

        opts = connection_opts.deep_dup
        opts.delete(:db_backend)
        has_options = opts.present?

        case db_backend.downcase.to_s
        when 'mongodb'
          return Adapters::MongoDbAdapter.new(opts) if has_options
          @mongodb_adapter ||= Adapters::MongoDbAdapter.new(data_node: self)
          return @mongodb_adapter
        when 'csv'
          return Adapters::CsvAdapter.new(opts) if has_options
          @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
          return @csv_adapter
        when 'mysql'
          opts[:adapter_type] = 'mysql2'
          return Adapters::SqlAdapter.new(opts) if has_options
          @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql2')
          return @mysql_adapter
        when 'postgresql'
          opts[:adapter_type] = 'postgresql'
          return Adapters::SqlAdapter.new(opts) if has_options
          @postgresql_adapter ||= Adapters::PsqlAdapter.new(data_node: self, adapter_type: 'postgresql')
          return @postgresql_adapter
        end

        raise Errors::NotImplementedError, "'#{db_backend}' backend is not implemented."
      end

      def valid_dataset_names
        if use_double_buffering
          ["#{name}_buffer1", "#{name}_buffer2"]
        else
          [name]
        end
      end
    end # class DataNode
  end # module Nodes
end # module Dataflow
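A minimal usage sketch for DataNode based on the API above, assuming Mongoid is configured (see lib/config/mongoid.yml) and a MongoDB instance is reachable; the database name, dataset name, and records are illustrative, not taken from the gem's documentation:

# Hypothetical setup: the db/dataset names and records are made up.
node = Dataflow::Nodes::DataNode.create(
  db_name: 'dataflow_example',
  name: 'users',
  indexes: [{ key: 'id', unique: true }]
)

node.add(records: [{ 'id' => 1, 'name' => 'alice' }])
node.count                       # => 1
node.find(where: { 'id' => 1 })  # => the stored record hash

# With use_double_buffering: true, writes target write_dataset_name
# ("users_buffer2" with the default indexes) while reads keep hitting
# read_dataset_name ("users_buffer1"); swap_read_write_datasets! then
# promotes the freshly written buffer to readers.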
data/lib/dataflow/nodes/export/to_csv_node.rb (new file, +54 lines):

# frozen_string_literal: true
module Dataflow
  module Nodes
    module Export
      # Export a dataset to CSV
      class ToCsvNode < ComputeNode
        ensure_dependencies exactly: 1

        # A JSON-encoded query to pass along.
        field :query, type: String, default: {}.to_json

        def compute_impl
          node = dependencies.first
          where = JSON.parse(query)

          # fetch the schema
          sch = node.infer_partial_schema(where: where, extended: true)

          # re-order the schema if needed
          if node.respond_to? :keys
            sch = node.keys.map { |k| [k, sch[k]] }.to_h if keys.present?
          end

          # create the dataset
          csv_adapter = Adapters::CsvAdapter.new(data_node: node)
          csv_adapter.set_schema(sch)
          csv_adapter.recreate_dataset

          # export in parallel
          max_per_process = 1000
          max_per_process = limit_per_process if limit_per_process > 0

          data_count = [node.count(where: where), 1].max
          equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
          count_per_process = [max_per_process, equal_split_per_process].min

          queries = node.ordered_system_id_queries(batch_size: count_per_process)

          parallel_each(queries.each_with_index) do |query, _idx|
            # TODO: re-enable the on_export_progressed event
            # progress = (idx / queries.count.to_f * 100).ceil
            # on_export_progressed(pct_complete: progress)

            batch = node.all(where: query.merge(where))
            csv_adapter.save(records: batch)
          end

          # needed by the csv exporter to finalize in a single file
          csv_adapter.on_save_finished
        end
      end
    end
  end
end
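This node is the same entry point DataNode#export uses internally. A hedged sketch of driving it directly (the query hash is illustrative, and node stands for a source DataNode such as the one created in the earlier sketch):

where = { 'updated_at' => { '$gte' => '2017-01-01' } }  # assumed Mongo-style query
Dataflow::Nodes::Export::ToCsvNode.new(
  dependency_ids: [node],   # node: a source DataNode
  query: where.to_json
).compute_impl

# Batch sizing from compute_impl: with 10_000 matching records, 4 processors
# and no limit_per_process set, equal_split_per_process = (10_000 / 4.0).ceil
# = 2_500, so count_per_process = [1_000, 2_500].min = 1_000 ids per query.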
data/lib/dataflow/nodes/filter/drop_while_node.rb (new file, +117 lines):

# frozen_string_literal: true
module Dataflow
  module Nodes
    module Filter
      # Builds a sequence per key (e.g. id), orders it (e.g. by time),
      # and then applies the same logic as Ruby's drop_while.
      # See: https://ruby-doc.org/core-2.4.0/Array.html#method-i-drop_while
      class DropWhileNode < ComputeNode
        VALID_OPS = %w(eq ne le lt ge gt).freeze
        VALID_MODES = %w(both left right).freeze

        # group by the id key
        field :id_key, type: String, required_for_computing: true
        # then sort by the sort_by key
        field :sort_by, type: String, required_for_computing: true
        field :sort_asc, type: Boolean, required_for_computing: true, default: true

        # then apply a drop_while on { field op value }
        field :field, type: String, required_for_computing: true
        field :op, type: String, required_for_computing: true, values: VALID_OPS
        field :value, required_for_computing: true
        field :drop_mode, type: String, required_for_computing: true, values: VALID_MODES, default: VALID_MODES[0]

        ensure_data_node_exists
        ensure_dependencies exactly: 1

        def compute_impl
          base_node = dependencies.first
          records_count = base_node.count
          return if records_count == 0

          ids = base_node.all(fields: [id_key]) do |results|
            results.distinct(id_key)
          end
          count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
          limit = limit_per_process.to_i
          count_per_process = [limit, count_per_process].min if limit > 0

          parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
            # serial alternative: ids.each_slice(count_per_process) do |ids_slice|
            process_ids(node: base_node, ids: ids_slice)
          end
        end

        private

        def process_ids(node:, ids:)
          records = node.all(where: { id_key => ids })
          groups = records.group_by { |x| x[id_key] }

          result = groups.flat_map do |_, group|
            process_group(group)
          end.compact

          data_node.add(records: result)
        end

        # sort the record group and then proceed to drop the elements
        # that satisfy the condition
        def process_group(record_group)
          sort_tokens = record_dig_tokens(key: sort_by, use_sym: dependencies.first.use_symbols?)
          group = record_group.sort_by { |x| x.dig(*sort_tokens) }
          group = group.reverse unless sort_asc
          modes = drop_mode == 'both' ? %w(left right) : [drop_mode]

          modes.each do |mode|
            # if we want to drop on the right,
            # reverse the array, drop on the left and reverse again
            group = group.reverse if mode == 'right'
            group = drop_while(group)
            group = group.reverse if mode == 'right'
          end

          group
        end

        # apply a single drop_while on the group.
        def drop_while(group)
          value_tokens = record_dig_tokens(key: field, use_sym: dependencies.first.use_symbols?)

          case op.to_s.downcase
          when 'eq'
            group.drop_while { |x| x.dig(*value_tokens) == value }
          when 'ne'
            group.drop_while { |x| x.dig(*value_tokens) != value }
          when 'le'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val <= value
            end
          when 'lt'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val < value
            end
          when 'ge'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val >= value
            end
          when 'gt'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val > value
            end
          else
            raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
          end
        end
      end
    end
  end
end
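A standalone illustration of the drop semantics in plain Ruby (not gem code):

group = [1, 2, 5, 3, 2].map { |v| { 'val' => v } }  # already sorted by time

# drop_mode 'left', op 'lt', value 3: drop from the left while val < 3.
group.drop_while { |x| x['val'] < 3 }
# => [{ 'val' => 5 }, { 'val' => 3 }, { 'val' => 2 }]
# The trailing 2 survives: drop_while stops at the first record that fails
# the predicate (here 5), exactly like Array#drop_while.

# drop_mode 'right' is the same operation on the reversed array, so it trims
# matching records from the tail; 'both' trims both ends in turn.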
data/lib/dataflow/nodes/filter/newest_node.rb (new file, +66 lines):

# frozen_string_literal: true
module Dataflow
  module Nodes
    module Filter
      # Select the newest record among records with the same id key.
      class NewestNode < ComputeNode
        field :id_key, type: String, required_for_computing: true
        field :date_key, type: String, required_for_computing: true

        ensure_data_node_exists
        ensure_dependencies exactly: 1

        private

        def ensure_keys_are_set!
          raise Errors::InvalidConfigurationError, 'Id key must be set.' if id_key.blank?
          raise Errors::InvalidConfigurationError, 'Date key must be set.' if date_key.blank?
        end

        def compute_impl
          base_node = dependencies.first
          records_count = base_node.count
          return if records_count == 0

          ids = base_node.all(fields: [id_key]) do |results|
            results.distinct(id_key)
          end
          count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
          limit = limit_per_process.to_i
          count_per_process = [limit, count_per_process].min if limit > 0

          parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
            # serial alternative: ids.each_slice(count_per_process) do |ids_slice|
            process_ids(node: base_node, ids: ids_slice)
          end
        end

        def process_ids(node:, ids:)
          metadata = node.all(where: { id_key => ids }, fields: [id_key, date_key])
          groups = metadata.group_by { |x| x[id_key] }
          newest_record_metadata = filter_by_newest(groups: groups,
                                                    date_key: date_key)
          records = newest_record_metadata.map do |meta|
            query = {
              id_key => meta[id_key],
              date_key => meta[date_key]
            }
            node.find(where: query)
          end.compact

          data_node.add(records: records)
        end

        def filter_by_newest(groups:, date_key:)
          groups.map do |_, entries|
            # sort by date ASC and select the newest
            entries
              .sort_by do |x|
                x[date_key].is_a?(Time) ? x[date_key] : Timeliness.parse(x[date_key])
              end.last
          end
        end
      end
    end
  end
end
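A plain-Ruby illustration of the filter_by_newest selection (not gem code; max_by is equivalent to the sort_by-then-last used above):

rows = [
  { 'id' => 1, 'updated_at' => Time.new(2017, 1, 1) },
  { 'id' => 1, 'updated_at' => Time.new(2017, 3, 1) },
  { 'id' => 2, 'updated_at' => Time.new(2017, 2, 1) }
]
newest = rows.group_by { |x| x['id'] }
             .map { |_, entries| entries.max_by { |x| x['updated_at'] } }
# => the 2017-03-01 row for id 1 and the single row for id 2.
# The node then re-fetches each full record by { id_key, date_key }, so only
# the projected metadata travels through the grouping step.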
data/lib/dataflow/nodes/filter/where_node.rb (new file, +44 lines):

# frozen_string_literal: true
module Dataflow
  module Nodes
    module Filter
      # Select records that match the condition.
      class WhereNode < ComputeNode
        VALID_OPS = %w(eq ne le lt ge gt).freeze

        field :key, type: String, required_for_computing: true
        field :op, type: String, required_for_computing: true, values: VALID_OPS
        field :value, required_for_computing: true

        ensure_data_node_exists
        ensure_dependencies exactly: 1

        private

        def compute_batch(records:)
          where(records: records)
        end

        def where(records:)
          tokens = record_dig_tokens(key: key, use_sym: dependencies.first.use_symbols?)
          case op.to_s.downcase
          when 'eq'
            records.select { |x| x.dig(*tokens) == value }
          when 'ne'
            records.select { |x| x.dig(*tokens) != value }
          when 'le'
            records.select { |x| x.dig(*tokens) <= value }
          when 'lt'
            records.select { |x| x.dig(*tokens) < value }
          when 'ge'
            records.select { |x| x.dig(*tokens) >= value }
          when 'gt'
            records.select { |x| x.dig(*tokens) > value }
          else
            raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
          end
        end
      end
    end
  end
end
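An illustration of the dig-based matching in plain Ruby. record_dig_tokens is defined elsewhere in the gem; splitting a dotted key into dig arguments is an assumption here, not confirmed by this listing:

record = { 'user' => { 'age' => 42 } }
tokens = 'user.age'.split('.')  # assumed tokenization => ['user', 'age']
record.dig(*tokens)             # => 42
# With key: 'user.age', op: 'gt', value: 18, this record is selected.
# Note that the ordered comparisons (le/lt/ge/gt) do not guard against nil:
# a record missing the key raises NoMethodError, unlike DropWhileNode,
# which explicitly drops nil values.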