dataoperations-aggregate 0.0.1

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/dataoperations-aggregate.rb +277 -0
  3. metadata +57 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: a2bb76e93dd60923113cd9ca110d3c45c5809702adf9d62b28cf1e8411536184
+   data.tar.gz: da54bea512ceb72b4da917d7199365114d05708977f89e5d0700cb3b8c580475
+ SHA512:
+   metadata.gz: 8def074117e9f6077673652ae8c5b8641c4b663097f94cf373b5aa3acf5272ed4709c2409669c9a9f620ec9bb00971248237b2e69e21068e40227d36993aa8bf
+   data.tar.gz: 3f5860fe505cd9a3e188018eede7bd66fe91e7aea97dd333c69079b954b4fee84e5160ee9b4541f8382ac52ba99a20226bbf32add9b57ab02b717e4be5eecece
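
These digests cover the two compressed members of the .gem package. A quick way to recheck them in Ruby, assuming the package has been unpacked (a .gem file is a plain tar archive) and metadata.gz / data.tar.gz sit in the current directory:

  require 'digest'

  # Compare the output against the SHA256 entries above.
  puts Digest::SHA256.file('metadata.gz').hexdigest
  puts Digest::SHA256.file('data.tar.gz').hexdigest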
data/lib/dataoperations-aggregate.rb ADDED
@@ -0,0 +1,277 @@
+ require 'logger'
+ require 'time'
+ require 'date'
+ require 'descriptive_statistics'
+ 
+ module DataOperations
+   class Aggregate
+     DEFAULT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%L%:z'
+     DEFAULT_TIME_FIELD = 'timestamp'
+     DEFAULT_OUTPUT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%L%z'
+     DEFAULT_INTERVALS = [10]
+     DEFAULT_FLUSH_INTERVAL = 5
+     # Assumed default (10): this constant is referenced below but was never
+     # defined in the published source, so the keyword default raised NameError.
+     DEFAULT_KEEP_INTERVAL = 10
+     DEFAULT_PROCESSING_MODE = :batch
+     DEFAULT_FIELD_NO_DATA_VALUE = 'no_data'
+     DEFAULT_AGGREGATIONS = ['sum','min','max','mean','median','variance','standard_deviation']
+     VALID_AGGREGATIONS = ['sum','min','max','mean','median','variance','standard_deviation']
+     DEFAULT_HASH_TIME_FORMAT = '%Y-%m-%dT%H'
+     DEFAULT_INTERVAL_SECONDS = 3600
+ 
+     def initialize(aggregator: {},
+                    time_format: DEFAULT_TIME_FORMAT,
+                    time_field: DEFAULT_TIME_FIELD,
+                    output_time_format: DEFAULT_OUTPUT_TIME_FORMAT,
+                    intervals: DEFAULT_INTERVALS,
+                    flush_interval: DEFAULT_FLUSH_INTERVAL,
+                    keep_interval: DEFAULT_KEEP_INTERVAL,
+                    field_no_data_value: DEFAULT_FIELD_NO_DATA_VALUE,
+                    processing_mode: DEFAULT_PROCESSING_MODE,
+                    log: Logger.new(STDOUT),
+                    aggregation_names:,
+                    group_field_names:,
+                    aggregate_field_names:)
+       @aggregator = aggregator
+       @time_format = time_format
+       @time_field = time_field
+       @output_time_format = output_time_format
+       @intervals = intervals.uniq.sort
+       @flush_interval = flush_interval
+       @keep_interval = keep_interval
+       @field_no_data_value = field_no_data_value
+       @processing_mode = processing_mode
+ 
+       unless aggregation_names.is_a?(Array)
+         raise 'Configuration error: aggregation_names must be specified and must be an Array'
+       end
+       unless group_field_names.is_a?(Array)
+         raise 'Configuration error: group_field_names must be specified and must be an Array'
+       end
+       unless aggregate_field_names.is_a?(Array)
+         raise 'Configuration error: aggregate_field_names must be specified and must be an Array'
+       end
+ 
+       @log = log
+ 
+       @hash_time_format = DEFAULT_HASH_TIME_FORMAT
+       @interval_seconds = DEFAULT_INTERVAL_SECONDS
+ 
+       @aggregation_names = aggregation_names
+       @group_field_names = group_field_names
+       @aggregate_field_names = aggregate_field_names
+ 
+       @aggregation_names.each do |operation|
+         unless VALID_AGGREGATIONS.include?(operation)
+           raise "aggregations must be any combination of #{VALID_AGGREGATIONS.join(',')}"
+         end
+       end
+       # Coarser intervals are derived from the first one, so each must be a multiple of it.
+       @intervals.each do |interval|
+         unless (interval % @intervals[0]).zero?
+           raise "interval: #{interval} must be a multiple of the first interval: #{@intervals[0]}"
+         end
+       end
+ 
+       # TODO:
+       # - Duplicate intervals - Done
+       # - Sort intervals - Done
+       # - Validate aggregation_names, group_field_names, aggregate_field_names
+     end
+ 
+     def log_level(log_level)
+       @log.level = log_level
+     end
+ 
+     def add_events(record)
+       # Bucket the event by its timestamp, falling back to the current time when the
+       # field is missing or unparseable (the original raised on unparseable values).
+       timestamp = begin
+         record.has_key?(@time_field) ? DateTime.strptime(record[@time_field], @time_format).to_time.to_i : nil
+       rescue ArgumentError
+         nil
+       end
+       timestamp ||= DateTime.now.to_time.to_i
+ 
+       current_interval_seconds = (timestamp / @intervals[0]) * @intervals[0]
+       aggregator_hash_key = current_interval_seconds
+ 
+       # Compose the group key from the group fields, e.g. "host:web-1_service:api"
+       hash_group_key = nil
+       @group_field_names.each do |field_name|
+         if !hash_group_key.nil?
+           hash_group_key = "#{hash_group_key}_#{field_name}:#{record[field_name]}"
+         else
+           hash_group_key = "#{field_name}:#{record[field_name]}"
+         end
+       end
+ 
+       aggregator_item = {}
+       if @aggregator.has_key?(hash_group_key)
+         aggregator_item = @aggregator[hash_group_key]
+       else
+         group_detail = {}
+         aggregate_detail = {}
+         interval_detail = {}
+         @group_field_names.each do |field_name|
+           if record.has_key?(field_name)
+             group_detail[field_name] = record[field_name]
+           else
+             group_detail[field_name] = @field_no_data_value
+           end
+         end
+ 
+         # Add empty interval data for each configured interval
+         @intervals.each do |interval|
+           interval_detail[interval.to_s] = {}
+         end
+ 
+         aggregator_item["group_fields"] = group_detail
+         aggregator_item["aggregate_fields"] = aggregate_detail
+         aggregator_item["intervals"] = interval_detail
+ 
+         @aggregator[hash_group_key] = aggregator_item
+       end
+ 
+       if !aggregator_item["aggregate_fields"].has_key?(aggregator_hash_key)
+         hash_aggregator = {}
+         hash_aggregator[:time_started] = Time.now.to_i
+         hash_aggregator["processed"] = 1
+         aggregator_item["aggregate_fields"][aggregator_hash_key] = hash_aggregator
+       else
+         aggregator_item["aggregate_fields"][aggregator_hash_key]["processed"] += 1
+       end
+ 
+       @aggregate_field_names.each do |field_name|
+         aggregate_values = []
+         if aggregator_item["aggregate_fields"][aggregator_hash_key].has_key?(field_name)
+           aggregate_values = aggregator_item["aggregate_fields"][aggregator_hash_key][field_name]
+         end
+         # Non-numeric values are recorded as 0 so the vector length still matches "processed"
+         if record[field_name].is_a?(Integer) || record[field_name].is_a?(Float)
+           aggregate_values << record[field_name]
+         else
+           aggregate_values << 0
+         end
+         aggregator_item["aggregate_fields"][aggregator_hash_key][field_name] = aggregate_values
+       end
+     end
+ 
+     # Raw access to the internal accumulator
+     def aggregate_data
+       @aggregator
+     end
+ 
+     def aggregate_events
+       aggregate_data = {}
+ 
+       #@log.debug @aggregator
+       #@aggregator_mutex.synchronize do
+       current_time = Time.now.to_i
+       @aggregator.each do |group_item_key, group_item_value|
+         group_item_value["aggregate_fields"].each do |aggregator_item_key, aggregator_item_value|
+           # If processing mode is :batch, aggregate immediately; otherwise wait for events
+           # to arrive (stream processing, e.g. fluentd)
+           limit_time = @processing_mode == :batch ? 0 : aggregator_item_value[:time_started] + @intervals[0] + @keep_interval
+ 
+           # Is this data ready to aggregate (based on ingest time)? If @processing_mode is :batch, limit_time is 0
+           if current_time >= limit_time
+             aggregator_data = {}
+             aggregator_data[@time_field] = Time.at(aggregator_item_key).strftime(@output_time_format)
+             aggregator_data.merge!(group_item_value["group_fields"])
+ 
+             aggregator_data["time"] = aggregator_item_key
+             aggregator_data["processed"] = aggregator_item_value["processed"]
+             # NOTE: @aggregator_name is never assigned in this version, so "aggregator_id" is always nil
+             aggregator_data["aggregator_id"] = @aggregator_name
+ 
+             # Add an entry to the accumulative aggregation hash of each coarser interval
+             group_item_value["intervals"].keys[1..-1].each do |interval_secs|
+               interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
+               #@log.debug "interval_aggregator_item_key: #{interval_aggregator_item_key}"
+ 
+               if (interval_aggregator_item_value = group_item_value["intervals"][interval_secs][interval_aggregator_item_key])
+                 interval_aggregator_item_value[:time_started] = aggregator_item_value[:time_started] if interval_aggregator_item_value[:time_started] < aggregator_item_value[:time_started]
+                 interval_aggregator_item_value["processed"] += aggregator_item_value["processed"]
+                 #@log.debug interval_aggregator_item_value
+               else
+                 interval_aggregator_item_value = {}
+                 interval_aggregator_item_value[:time_started] = aggregator_item_value[:time_started]
+                 interval_aggregator_item_value["aggregate_fields"] = {}
+                 interval_aggregator_item_value["processed"] = aggregator_item_value["processed"]
+                 group_item_value["intervals"][interval_secs][interval_aggregator_item_key] = interval_aggregator_item_value
+                 #@log.debug interval_aggregator_item_value
+               end
+             end
+ 
+             aggregator_item_value.each do |aggregate_field_key, aggregate_field_value|
+               # Create field metadata for subsequent (coarser) aggregations
+               group_item_value["intervals"].keys[1..-1].each do |interval_secs|
+                 interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
+                 interval_aggregator_item_value = group_item_value["intervals"][interval_secs][interval_aggregator_item_key]
+                 #@log.debug interval_aggregator_item_value
+                 if !interval_aggregator_item_value["aggregate_fields"].has_key?(aggregate_field_key) && aggregate_field_value.is_a?(Array)
+                   interval_aggregator_item_value["aggregate_fields"][aggregate_field_key] = {}
+                   @aggregation_names.each do |operation|
+                     interval_aggregator_item_value["aggregate_fields"][aggregate_field_key][operation] = []
+                   end
+                 end
+               end
+ 
+               # Aggregate data (only Array entries are field vectors; :time_started and "processed" are skipped)
+               if aggregate_field_value.is_a?(Array)
+                 @aggregation_names.each do |operation|
+                   data = aggregate_field_value.method(operation).call
+                   aggregator_data["#{aggregate_field_key}_#{operation}"] = data
+ 
+                   # Add the aggregated value to each coarser interval's vector
+                   group_item_value["intervals"].keys[1..-1].each do |interval_secs|
+                     interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
+                     interval_aggregator_item_value = group_item_value["intervals"][interval_secs][interval_aggregator_item_key]
+                     interval_aggregator_item_value["aggregate_fields"][aggregate_field_key][operation] << data
+                   end
+                 end
+               end
+             end
+ 
+             group_item_value["aggregate_fields"].delete(aggregator_item_key)
+             base_interval = group_item_value["intervals"].keys[0]
+             aggregate_data[base_interval] = [] if aggregate_data[base_interval].nil?
+             aggregate_data[base_interval] << aggregator_data
+           end
+         end
+ 
+         # Calculate subsequent (coarser-interval) aggregations
+         group_item_value["intervals"].keys[1..-1].each do |s_interval|
+           group_item_value["intervals"][s_interval].each do |aggregator_item_key, aggregator_item_value|
+             interval = s_interval.to_i
+             # If processing mode is :batch, aggregate immediately; otherwise wait for events
+             # to arrive (stream processing, e.g. fluentd)
+             limit_time = @processing_mode == :batch ? 0 : aggregator_item_value[:time_started] + interval + @keep_interval
+ 
+             #@log.debug "processing_mode:#{@processing_mode} limit_time:#{limit_time}"
+ 
+             if current_time >= limit_time
+               aggregator_data = {}
+               aggregator_data[@time_field] = Time.at(aggregator_item_key).strftime(@output_time_format)
+               aggregator_data.merge!(group_item_value["group_fields"])
+ 
+               aggregator_data["time"] = aggregator_item_key
+               aggregator_data["processed"] = aggregator_item_value["processed"]
+               aggregator_data["aggregator_id"] = @aggregator_name
+               aggregator_item_value["aggregate_fields"].each do |field_name, field_data|
+                 field_data.each do |operation, vector|
+                   # Re-aggregate the finer-grained results: max/min compose exactly; mean and
+                   # median are taken over the per-bucket values, and the median is used as a
+                   # fallback for the remaining operations
+                   case operation
+                   when 'max', 'min', 'mean', 'median'
+                     data = vector.method(operation).call
+                   else
+                     data = vector.median
+                   end
+                   aggregator_data["#{field_name}_#{operation}"] = data
+                 end
+               end
+               #@log.debug aggregator_item_value
+               #@log.debug aggregator_data
+               group_item_value["intervals"][s_interval].delete(aggregator_item_key)
+               aggregate_data[s_interval] = [] if aggregate_data[s_interval].nil?
+               aggregate_data[s_interval] << aggregator_data
+             end
+           end
+         end
+       end
+ 
+       #@log.debug aggregate_data
+       aggregate_data unless aggregate_data.empty?
+     #rescue Exception => e
+     #  $log.error e
+     end
+   end
+ end
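
For orientation, a minimal usage sketch of the class above (the field names and values here are illustrative, not part of the gem):

  require 'dataoperations-aggregate'

  aggregator = DataOperations::Aggregate.new(
    aggregation_names: ['sum', 'max', 'mean'],
    group_field_names: ['host', 'service'],
    aggregate_field_names: ['duration'],
    intervals: [10, 60],        # 60 must be a multiple of the first interval, 10
    processing_mode: :batch     # :batch flushes everything on aggregate_events
  )

  # Feed events; a record without 'timestamp' is bucketed at the current time.
  aggregator.add_events('timestamp' => Time.now.strftime('%Y-%m-%dT%H:%M:%S.%L%:z'),
                        'host' => 'web-1', 'service' => 'api', 'duration' => 12.5)
  aggregator.add_events('host' => 'web-1', 'service' => 'api', 'duration' => 7.5)

  # Returns a Hash keyed by interval ('10', '60'), each holding rows such as
  # {"timestamp"=>..., "host"=>"web-1", "service"=>"api", "processed"=>2, "duration_sum"=>20.0, ...}
  p aggregator.aggregate_events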
metadata ADDED
@@ -0,0 +1,57 @@
+ --- !ruby/object:Gem::Specification
+ name: dataoperations-aggregate
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Victor Guillen
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2020-04-12 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: descriptive_statistics
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Aggregate data over time
+ email: vguillen_public@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/dataoperations-aggregate.rb
+ homepage: https://github.com/superguillen/dataoperations-aggregate
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.0.3
+ signing_key:
+ specification_version: 4
+ summary: Aggregate data
+ test_files: []
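
The metadata above corresponds to a gemspec along these lines (reconstructed from the fields shown; the gem's actual .gemspec file is not included in this release):

  Gem::Specification.new do |s|
    s.name        = 'dataoperations-aggregate'
    s.version     = '0.0.1'
    s.authors     = ['Victor Guillen']
    s.email       = 'vguillen_public@gmail.com'
    s.summary     = 'Aggregate data'
    s.description = 'Aggregate data over time'
    s.homepage    = 'https://github.com/superguillen/dataoperations-aggregate'
    s.license     = 'MIT'
    s.files       = ['lib/dataoperations-aggregate.rb']
    s.add_runtime_dependency 'descriptive_statistics'
  end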