dataoperations-aggregate 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/dataoperations-aggregate.rb +277 -0
  3. metadata +57 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: a2bb76e93dd60923113cd9ca110d3c45c5809702adf9d62b28cf1e8411536184
+   data.tar.gz: da54bea512ceb72b4da917d7199365114d05708977f89e5d0700cb3b8c580475
+ SHA512:
+   metadata.gz: 8def074117e9f6077673652ae8c5b8641c4b663097f94cf373b5aa3acf5272ed4709c2409669c9a9f620ec9bb00971248237b2e69e21068e40227d36993aa8bf
+   data.tar.gz: 3f5860fe505cd9a3e188018eede7bd66fe91e7aea97dd333c69079b954b4fee84e5160ee9b4541f8382ac52ba99a20226bbf32add9b57ab02b717e4be5eecece
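
To cross-check a downloaded copy against these digests, a minimal Ruby sketch like the following works. It assumes the gem archive has been unpacked (a .gem is a tar containing metadata.gz and data.tar.gz) and that both files sit in the current directory; the file names come from the checksums file above.

  require 'digest'

  # Print the SHA256 of each artifact; compare against checksums.yaml.
  %w[metadata.gz data.tar.gz].each do |artifact|
    puts "#{artifact}: #{Digest::SHA256.file(artifact).hexdigest}"
  end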
data/lib/dataoperations-aggregate.rb ADDED
@@ -0,0 +1,277 @@
+ require 'date'
+ require 'logger'
+ require 'time'
+ require 'descriptive_statistics'
+ module DataOperations
+   class Aggregate
+     DEFAULT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%L%:z'
+     DEFAULT_TIME_FIELD = 'timestamp'
+     DEFAULT_OUTPUT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%L%z'
+     DEFAULT_INTERVALS = [10]
+     DEFAULT_FLUSH_INTERVAL = 5
+     # Grace period (seconds) before a streaming window is flushed.
+     # NOTE: referenced below but never defined in the released file; 10 is an assumed default.
+     DEFAULT_KEEP_INTERVAL = 10
+     DEFAULT_PROCESSING_MODE = :batch
+     DEFAULT_FIELD_NO_DATA_VALUE = 'no_data'
+     DEFAULT_AGGREGATIONS = ['sum','min','max','mean','median','variance','standard_deviation']
+     VALID_AGGREGATIONS = ['sum','min','max','mean','median','variance','standard_deviation']
+     DEFAULT_HASH_TIME_FORMAT = '%Y-%m-%dT%H'
+     DEFAULT_INTERVAL_SECONDS = 3600
+ 
+     def initialize(aggregator: {},
+                    time_format: DEFAULT_TIME_FORMAT,
+                    time_field: DEFAULT_TIME_FIELD,
+                    output_time_format: DEFAULT_OUTPUT_TIME_FORMAT,
+                    intervals: DEFAULT_INTERVALS,
+                    flush_interval: DEFAULT_FLUSH_INTERVAL,
+                    keep_interval: DEFAULT_KEEP_INTERVAL,
+                    field_no_data_value: DEFAULT_FIELD_NO_DATA_VALUE,
+                    processing_mode: DEFAULT_PROCESSING_MODE,
+                    log: Logger.new(STDOUT),
+                    aggregation_names:,
+                    group_field_names:,
+                    aggregate_field_names:)
+       @aggregator = aggregator
+       @time_format = time_format
+       @time_field = time_field
+       @output_time_format = output_time_format
+       @intervals = intervals.uniq.sort
+       @flush_interval = flush_interval
+       @keep_interval = keep_interval
+       @field_no_data_value = field_no_data_value
+       @processing_mode = processing_mode
+ 
+       unless aggregation_names.is_a?(Array)
+         raise 'Configuration error: aggregation_names must be specified and be an Array'
+       end
+       unless group_field_names.is_a?(Array)
+         raise 'Configuration error: group_field_names must be specified and be an Array'
+       end
+       unless aggregate_field_names.is_a?(Array)
+         raise 'Configuration error: aggregate_field_names must be specified and be an Array'
+       end
+ 
+       @log = log
+ 
+       @hash_time_format = DEFAULT_HASH_TIME_FORMAT
+       @interval_seconds = DEFAULT_INTERVAL_SECONDS
+ 
+       @aggregation_names = aggregation_names
+       @group_field_names = group_field_names
+       @aggregate_field_names = aggregate_field_names
+ 
+       @aggregation_names.each {|operation|
+         unless VALID_AGGREGATIONS.include?(operation)
+           raise "aggregations must be any combination of #{VALID_AGGREGATIONS.join(',')}"
+         end
+       }
+       # Coarser intervals are derived from the first one, so each must be a multiple of it.
+       @intervals.each {|interval|
+         unless (interval % @intervals[0]).zero?
+           raise "interval: #{interval} must be a multiple of the first interval: #{@intervals[0]}"
+         end
+       }
+ 
+       # TODO:
+       # - Duplicate intervals - Done
+       # - Sort intervals - Done
+       # - Validate aggregation_names, group_field_names, aggregate_field_names
+     end
+ 
+     def log_level(log_level)
+       @log.level = log_level
+     end
+ 
+     def add_events(record)
+       # Fall back to the current time when the time field is missing.
+       # (DateTime.strptime raises on malformed values rather than returning nil.)
+       timestamp = nil
+       if !record.has_key?(@time_field) || !(timestamp = DateTime.strptime(record[@time_field], @time_format).to_time.to_i)
+         timestamp = DateTime.now.to_time.to_i
+       end
+ 
+       # Bucket the event into the finest-grained interval window.
+       current_interval_seconds = (timestamp / @intervals[0]) * @intervals[0]
+       aggregator_hash_key = current_interval_seconds
+ 
+       # Build the group key from the group fields, e.g. "host:web-1_service:api".
+       hash_group_key = nil
+       @group_field_names.each {|field_name|
+         if !hash_group_key.nil?
+           hash_group_key = "#{hash_group_key}_#{field_name}:#{record[field_name]}"
+         else
+           hash_group_key = "#{field_name}:#{record[field_name]}"
+         end
+       }
+ 
+       aggregator_item = {}
+       if @aggregator.has_key?(hash_group_key)
+         aggregator_item = @aggregator[hash_group_key]
+       else
+         group_detail = {}
+         aggregate_detail = {}
+         interval_detail = {}
+         @group_field_names.each {|field_name|
+           if record.has_key?(field_name)
+             group_detail[field_name] = record[field_name]
+           else
+             group_detail[field_name] = @field_no_data_value
+           end
+         }
+ 
+         # Add empty per-interval data
+         @intervals.each {|interval|
+           interval_detail[interval.to_s] = {}
+         }
+ 
+         aggregator_item["group_fields"] = group_detail
+         aggregator_item["aggregate_fields"] = aggregate_detail
+         aggregator_item["intervals"] = interval_detail
+ 
+         @aggregator[hash_group_key] = aggregator_item
+       end
+ 
+       if !aggregator_item["aggregate_fields"].has_key?(aggregator_hash_key)
+         hash_aggregator = {}
+         hash_aggregator[:time_started] = Time.now.to_i
+         hash_aggregator["processed"] = 1
+         aggregator_item["aggregate_fields"][aggregator_hash_key] = hash_aggregator
+       else
+         aggregator_item["aggregate_fields"][aggregator_hash_key]["processed"] += 1
+       end
+ 
+       # Collect the numeric value of each aggregate field (non-numeric values count as 0).
+       @aggregate_field_names.each {|field_name|
+         aggregate_values = []
+         if aggregator_item["aggregate_fields"][aggregator_hash_key].has_key?(field_name)
+           aggregate_values = aggregator_item["aggregate_fields"][aggregator_hash_key][field_name]
+         end
+         if record[field_name].is_a?(Integer) || record[field_name].is_a?(Float)
+           aggregate_values << record[field_name]
+         else
+           aggregate_values << 0
+         end
+         aggregator_item["aggregate_fields"][aggregator_hash_key][field_name] = aggregate_values
+       }
+     end
+ 
+     # Expose the raw (not yet flushed) aggregation state.
+     def aggregate_data
+       @aggregator
+     end
+ 
+     def aggregate_events
+       aggregate_data = {}
+ 
+       current_time = Time.now.to_i
+       @aggregator.each {|group_item_key, group_item_value|
+         group_item_value["aggregate_fields"].each {|aggregator_item_key, aggregator_item_value|
+           # In :batch mode aggregate immediately (limit_time = 0); otherwise wait for
+           # late events to arrive (stream processing, e.g. Fluentd).
+           limit_time = @processing_mode == :batch ? 0 : aggregator_item_value[:time_started] + @intervals[0] + @keep_interval
+ 
+           # Flush this window once its ingest-time limit has passed.
+           if current_time >= limit_time
+             aggregator_data = {}
+             aggregator_data[@time_field] = Time.at(aggregator_item_key).strftime(@output_time_format)
+             aggregator_data.merge!(group_item_value["group_fields"])
+ 
+             aggregator_data["time"] = aggregator_item_key
+             aggregator_data["processed"] = aggregator_item_value["processed"]
+             # NOTE: @aggregator_name is never assigned in this version, so aggregator_id is nil.
+             aggregator_data["aggregator_id"] = @aggregator_name
+ 
+             # Add an entry to the accumulative hash of each coarser interval
+             group_item_value['intervals'].keys[1..-1].each {|interval_secs|
+               interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
+ 
+               if (interval_aggregator_item_value = group_item_value['intervals'][interval_secs][interval_aggregator_item_key])
+                 interval_aggregator_item_value[:time_started] = aggregator_item_value[:time_started] if interval_aggregator_item_value[:time_started] < aggregator_item_value[:time_started]
+                 interval_aggregator_item_value["processed"] += aggregator_item_value["processed"]
+               else
+                 interval_aggregator_item_value = {}
+                 interval_aggregator_item_value[:time_started] = aggregator_item_value[:time_started]
+                 interval_aggregator_item_value["aggregate_fields"] = {}
+                 interval_aggregator_item_value["processed"] = aggregator_item_value["processed"]
+                 group_item_value['intervals'][interval_secs][interval_aggregator_item_key] = interval_aggregator_item_value
+               end
+             }
+ 
+             aggregator_item_value.each {|aggregate_field_key, aggregate_field_value|
+               # Create field metadata for subsequent (coarser-interval) aggregations
+               group_item_value['intervals'].keys[1..-1].each {|interval_secs|
+                 interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
+                 interval_aggregator_item_value = group_item_value['intervals'][interval_secs][interval_aggregator_item_key]
+                 if !interval_aggregator_item_value["aggregate_fields"].has_key?(aggregate_field_key) && aggregate_field_value.is_a?(Array)
+                   interval_aggregator_item_value["aggregate_fields"][aggregate_field_key] = {}
+                   @aggregation_names.each {|operation|
+                     interval_aggregator_item_value["aggregate_fields"][aggregate_field_key][operation] = []
+                   }
+                 end
+               }
+ 
+               # Aggregate data (only Array values are field samples; "processed" and :time_started are skipped)
+               if aggregate_field_value.is_a?(Array)
+                 @aggregation_names.each {|operation|
+                   data = aggregate_field_value.method(operation).call
+                   aggregator_data["#{aggregate_field_key}_#{operation}"] = data
+ 
+                   # Add the aggregated value to each coarser interval
+                   group_item_value['intervals'].keys[1..-1].each {|interval_secs|
+                     interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
+                     interval_aggregator_item_value = group_item_value['intervals'][interval_secs][interval_aggregator_item_key]
+                     interval_aggregator_item_value["aggregate_fields"][aggregate_field_key][operation] << data
+                   }
+                 }
+               end
+             }
+ 
+             group_item_value["aggregate_fields"].delete(aggregator_item_key)
+             aggregate_data[group_item_value['intervals'].keys[0]] = [] if aggregate_data[group_item_value['intervals'].keys[0]].nil?
+             aggregate_data[group_item_value['intervals'].keys[0]] << aggregator_data
+           end
+         }
+ 
+         # Calculate subsequent (coarser-interval) aggregations
+         group_item_value["intervals"].keys[1..-1].each {|s_interval|
+           group_item_value["intervals"][s_interval].each {|aggregator_item_key, aggregator_item_value|
+             interval = s_interval.to_i
+             # In :batch mode aggregate immediately; otherwise wait for late events to arrive.
+             limit_time = @processing_mode == :batch ? 0 : aggregator_item_value[:time_started] + interval + @keep_interval
+ 
+             if current_time >= limit_time
+               aggregator_data = {}
+               aggregator_data[@time_field] = Time.at(aggregator_item_key).strftime(@output_time_format)
+               aggregator_data.merge!(group_item_value["group_fields"])
+ 
+               aggregator_data["time"] = aggregator_item_key
+               aggregator_data["processed"] = aggregator_item_value["processed"]
+               aggregator_data["aggregator_id"] = @aggregator_name
+               aggregator_item_value["aggregate_fields"].each {|field_name, field_data|
+                 field_data.each {|operation, vector|
+                   case operation
+                   when 'max', 'min', 'mean', 'median'
+                     data = vector.method(operation).call
+                   else
+                     # NOTE: for sum/variance/standard_deviation the released code
+                     # takes the median of the per-window results.
+                     data = vector.median
+                   end
+                   aggregator_data["#{field_name}_#{operation}"] = data
+                 }
+               }
+               group_item_value["intervals"][s_interval].delete(aggregator_item_key)
+               aggregate_data[s_interval] = [] if aggregate_data[s_interval].nil?
+               aggregate_data[s_interval] << aggregator_data
+             end
+           }
+         }
+       }
+ 
+       aggregate_data unless aggregate_data.empty?
+     #rescue Exception => e
+     #  $log.error e
+     end
+   end
+ end
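
A minimal usage sketch of the class above in :batch mode (field names and values are illustrative; it assumes the gem and its descriptive_statistics dependency are installed):

  require 'dataoperations-aggregate'

  aggregate = DataOperations::Aggregate.new(
    aggregation_names: ['sum', 'max'],    # any subset of VALID_AGGREGATIONS
    group_field_names: ['host'],          # events are grouped by these fields
    aggregate_field_names: ['latency'],   # numeric fields to aggregate
    intervals: [10],                      # window size in seconds
    processing_mode: :batch               # aggregate immediately on flush
  )

  # Timestamps must match DEFAULT_TIME_FORMAT ('%Y-%m-%dT%H:%M:%S.%L%:z').
  now = Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%:z')
  aggregate.add_events({ 'timestamp' => now, 'host' => 'web-1', 'latency' => 42 })
  aggregate.add_events({ 'timestamp' => now, 'host' => 'web-1', 'latency' => 58 })

  # In :batch mode every open window is flushed; the result is keyed by interval
  # ('10' => [...]) and each entry carries latency_sum / latency_max.
  p aggregate.aggregate_events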
metadata ADDED
@@ -0,0 +1,57 @@
+ --- !ruby/object:Gem::Specification
+ name: dataoperations-aggregate
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Victor Guillen
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2020-04-12 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: descriptive_statistics
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Aggregate data over time
+ email: vguillen_public@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/dataoperations-aggregate.rb
+ homepage: https://github.com/superguillen/dataoperations-aggregate
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.0.3
+ signing_key:
+ specification_version: 4
+ summary: Aggregate data
+ test_files: []
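
Per this spec, the gem's only runtime dependency is descriptive_statistics (>= 0) and the license is MIT. Pulling the version above into a Bundler project is a one-line Gemfile entry (sketch; adjust the pin as needed):

  # Gemfile
  gem 'dataoperations-aggregate', '0.0.1'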