dataoperations-aggregate 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/dataoperations-aggregate.rb +277 -0
- metadata +57 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: a2bb76e93dd60923113cd9ca110d3c45c5809702adf9d62b28cf1e8411536184
  data.tar.gz: da54bea512ceb72b4da917d7199365114d05708977f89e5d0700cb3b8c580475
SHA512:
  metadata.gz: 8def074117e9f6077673652ae8c5b8641c4b663097f94cf373b5aa3acf5272ed4709c2409669c9a9f620ec9bb00971248237b2e69e21068e40227d36993aa8bf
  data.tar.gz: 3f5860fe505cd9a3e188018eede7bd66fe91e7aea97dd333c69079b954b4fee84e5160ee9b4541f8382ac52ba99a20226bbf32add9b57ab02b717e4be5eecece
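These are the SHA-256 and SHA-512 hexdigests of the metadata.gz and data.tar.gz entries inside the published .gem archive. Below is a minimal verification sketch in Ruby, assuming the archive has been fetched locally (for example with: gem fetch dataoperations-aggregate -v 0.0.1):

require 'digest'
require 'rubygems/package'

# A .gem file is a tar archive; digest its metadata.gz and data.tar.gz
# entries and compare against the SHA256 section of checksums.yaml above.
File.open('dataoperations-aggregate-0.0.1.gem', 'rb') do |gem_file|
  Gem::Package::TarReader.new(gem_file).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
end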
data/lib/dataoperations-aggregate.rb
ADDED
@@ -0,0 +1,277 @@
require 'logger'
require 'date'
require 'time'
require 'descriptive_statistics'

module DataOperations
  class Aggregate
    DEFAULT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%L%:z'
    DEFAULT_TIME_FIELD = 'timestamp'
    DEFAULT_OUTPUT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%L%z'
    DEFAULT_INTERVALS = [10]
    DEFAULT_FLUSH_INTERVAL = 5
    # Assumed value: the released file references DEFAULT_KEEP_INTERVAL in
    # #initialize without defining it, so omitting the keep_interval
    # argument raises NameError.
    DEFAULT_KEEP_INTERVAL = 30
    DEFAULT_PROCESSING_MODE = :batch
    DEFAULT_FIELD_NO_DATA_VALUE = 'no_data'
    DEFAULT_AGGREGATIONS = %w[sum min max mean median variance standard_deviation]
    VALID_AGGREGATIONS = %w[sum min max mean median variance standard_deviation]
    DEFAULT_HASH_TIME_FORMAT = '%Y-%m-%dT%H'
    DEFAULT_INTERVAL_SECONDS = 3600

    def initialize(aggregator: {},
                   time_format: DEFAULT_TIME_FORMAT,
                   time_field: DEFAULT_TIME_FIELD,
                   output_time_format: DEFAULT_OUTPUT_TIME_FORMAT,
                   intervals: DEFAULT_INTERVALS,
                   flush_interval: DEFAULT_FLUSH_INTERVAL,
                   keep_interval: DEFAULT_KEEP_INTERVAL,
                   field_no_data_value: DEFAULT_FIELD_NO_DATA_VALUE,
                   processing_mode: DEFAULT_PROCESSING_MODE,
                   log: Logger.new(STDOUT),
                   aggregation_names:,
                   group_field_names:,
                   aggregate_field_names:)
      @aggregator = aggregator
      @time_format = time_format
      @time_field = time_field
      @output_time_format = output_time_format
      @intervals = intervals.uniq.sort
      @flush_interval = flush_interval
      @keep_interval = keep_interval
      @field_no_data_value = field_no_data_value
      @processing_mode = processing_mode

      unless aggregation_names.is_a?(Array)
        raise 'Configuration error, aggregation_names must be specified and Array'
      end
      unless group_field_names.is_a?(Array)
        raise 'Configuration error, group_field_names must be specified and Array'
      end
      unless aggregate_field_names.is_a?(Array)
        raise 'Configuration error, aggregate_field_names must be specified and Array'
      end

      @log = log

      @hash_time_format = DEFAULT_HASH_TIME_FORMAT
      @interval_seconds = DEFAULT_INTERVAL_SECONDS

      @aggregation_names = aggregation_names
      @group_field_names = group_field_names
      @aggregate_field_names = aggregate_field_names

      @aggregation_names.each do |operation|
        unless VALID_AGGREGATIONS.include?(operation)
          raise "aggregation_names must be any combination of #{VALID_AGGREGATIONS.join(',')}"
        end
      end

      # Larger intervals are rolled up from buckets of the smallest interval,
      # so every interval must be a multiple of the first one.
      @intervals.each do |interval|
        unless (interval % @intervals[0]).zero?
          raise "interval: #{interval} must be multiple of first interval: #{@intervals[0]}"
        end
      end

      # TODO:
      # - Duplicate intervals - Done
      # - Sort intervals - Done
      # - Validate aggregation_names, group_field_names, aggregate_field_names
    end

    def log_level(log_level)
      @log.level = log_level
    end

    # Ingests a single record into the bucket of the smallest interval,
    # keyed by the concatenated group-field values.
    def add_events(record)
      # Use the record's timestamp when present and parseable; otherwise
      # fall back to the current time.
      timestamp = begin
        DateTime.strptime(record[@time_field], @time_format).to_time.to_i if record.has_key?(@time_field)
      rescue ArgumentError, TypeError
        nil
      end
      timestamp ||= DateTime.now.to_time.to_i

      # Truncate the timestamp to the start of the smallest interval.
      current_interval_seconds = (timestamp / @intervals[0]) * @intervals[0]
      aggregator_hash_key = current_interval_seconds

      # Build the grouping key, e.g. "hostname:web01_level:INFO".
      hash_group_key = nil
      @group_field_names.each do |field_name|
        hash_group_key = if hash_group_key.nil?
                           "#{field_name}:#{record[field_name]}"
                         else
                           "#{hash_group_key}_#{field_name}:#{record[field_name]}"
                         end
      end

      aggregator_item = {}
      if @aggregator.has_key?(hash_group_key)
        aggregator_item = @aggregator[hash_group_key]
      else
        group_detail = {}
        aggregate_detail = {}
        interval_detail = {}
        @group_field_names.each do |field_name|
          group_detail[field_name] =
            record.has_key?(field_name) ? record[field_name] : @field_no_data_value
        end

        # Add empty data for each configured interval
        @intervals.each do |interval|
          interval_detail[interval.to_s] = {}
        end

        aggregator_item['group_fields'] = group_detail
        aggregator_item['aggregate_fields'] = aggregate_detail
        aggregator_item['intervals'] = interval_detail

        @aggregator[hash_group_key] = aggregator_item
      end

      if aggregator_item['aggregate_fields'].has_key?(aggregator_hash_key)
        aggregator_item['aggregate_fields'][aggregator_hash_key]['processed'] += 1
      else
        hash_aggregator = {}
        hash_aggregator[:time_started] = Time.now.to_i
        hash_aggregator['processed'] = 1
        aggregator_item['aggregate_fields'][aggregator_hash_key] = hash_aggregator
      end

      # Append each aggregate field's numeric value (0 when missing or
      # non-numeric) to the sample vector of this bucket.
      @aggregate_field_names.each do |field_name|
        aggregate_values = []
        if aggregator_item['aggregate_fields'][aggregator_hash_key].has_key?(field_name)
          aggregate_values = aggregator_item['aggregate_fields'][aggregator_hash_key][field_name]
        end
        if record[field_name].is_a?(Integer) || record[field_name].is_a?(Float)
          aggregate_values << record[field_name]
        else
          aggregate_values << 0
        end
        aggregator_item['aggregate_fields'][aggregator_hash_key][field_name] = aggregate_values
      end
    end

    def aggregate_data
      @aggregator
    end

    # Flushes every bucket whose window has expired and returns a hash of
    # interval (as a string) => array of aggregated records, or nil when
    # nothing is ready.
    def aggregate_events
      aggregate_data = {}

      current_time = Time.now.to_i
      @aggregator.each do |_group_item_key, group_item_value|
        group_item_value['aggregate_fields'].each do |aggregator_item_key, aggregator_item_value|
          # In :batch mode aggregate immediately; otherwise (streaming
          # processing, e.g. fluentd) wait for late events to arrive.
          limit_time =
            @processing_mode == :batch ? 0 : aggregator_item_value[:time_started] + @intervals[0] + @keep_interval

          # Is this bucket ready to aggregate (based on ingest time)?
          next unless current_time >= limit_time

          aggregator_data = {}
          aggregator_data[@time_field] = Time.at(aggregator_item_key).strftime(@output_time_format)
          aggregator_data.merge!(group_item_value['group_fields'])

          aggregator_data['time'] = aggregator_item_key
          aggregator_data['processed'] = aggregator_item_value['processed']
          # @aggregator_name is never assigned in this file, so this is nil.
          aggregator_data['aggregator_id'] = @aggregator_name

          # Create or update the cumulative bucket of each larger interval.
          group_item_value['intervals'].keys[1..-1].each do |interval_secs|
            interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i

            if (interval_aggregator_item_value = group_item_value['intervals'][interval_secs][interval_aggregator_item_key])
              interval_aggregator_item_value[:time_started] = aggregator_item_value[:time_started] if interval_aggregator_item_value[:time_started] < aggregator_item_value[:time_started]
              interval_aggregator_item_value['processed'] += aggregator_item_value['processed']
            else
              interval_aggregator_item_value = {}
              interval_aggregator_item_value[:time_started] = aggregator_item_value[:time_started]
              interval_aggregator_item_value['aggregate_fields'] = {}
              interval_aggregator_item_value['processed'] = aggregator_item_value['processed']
              group_item_value['intervals'][interval_secs][interval_aggregator_item_key] = interval_aggregator_item_value
            end
          end

          aggregator_item_value.each do |aggregate_field_key, aggregate_field_value|
            # Sample vectors are Arrays; the bookkeeping entries
            # (:time_started, 'processed') are skipped below.

            # Create field metadata for the subsequent (larger) aggregations.
            group_item_value['intervals'].keys[1..-1].each do |interval_secs|
              interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
              interval_aggregator_item_value = group_item_value['intervals'][interval_secs][interval_aggregator_item_key]
              if !interval_aggregator_item_value['aggregate_fields'].has_key?(aggregate_field_key) && aggregate_field_value.is_a?(Array)
                interval_aggregator_item_value['aggregate_fields'][aggregate_field_key] = {}
                @aggregation_names.each do |operation|
                  interval_aggregator_item_value['aggregate_fields'][aggregate_field_key][operation] = []
                end
              end
            end

            # Aggregate data
            next unless aggregate_field_value.is_a?(Array)

            @aggregation_names.each do |operation|
              # descriptive_statistics adds sum/min/max/mean/median/variance/
              # standard_deviation to Enumerable.
              data = aggregate_field_value.method(operation).call
              aggregator_data["#{aggregate_field_key}_#{operation}"] = data

              # Push the aggregated value into each larger interval bucket.
              group_item_value['intervals'].keys[1..-1].each do |interval_secs|
                interval_aggregator_item_key = (aggregator_item_key / interval_secs.to_i) * interval_secs.to_i
                interval_aggregator_item_value = group_item_value['intervals'][interval_secs][interval_aggregator_item_key]
                interval_aggregator_item_value['aggregate_fields'][aggregate_field_key][operation] << data
              end
            end
          end

          group_item_value['aggregate_fields'].delete(aggregator_item_key)
          first_interval = group_item_value['intervals'].keys[0]
          aggregate_data[first_interval] = [] if aggregate_data[first_interval].nil?
          aggregate_data[first_interval] << aggregator_data
        end

        # Calculate the subsequent (larger interval) aggregations.
        group_item_value['intervals'].keys[1..-1].each do |s_interval|
          group_item_value['intervals'][s_interval].each do |aggregator_item_key, aggregator_item_value|
            interval = s_interval.to_i
            # In :batch mode aggregate immediately; otherwise wait for late events.
            limit_time =
              @processing_mode == :batch ? 0 : aggregator_item_value[:time_started] + interval + @keep_interval

            next unless current_time >= limit_time

            aggregator_data = {}
            aggregator_data[@time_field] = Time.at(aggregator_item_key).strftime(@output_time_format)
            aggregator_data.merge!(group_item_value['group_fields'])

            aggregator_data['time'] = aggregator_item_key
            aggregator_data['processed'] = aggregator_item_value['processed']
            aggregator_data['aggregator_id'] = @aggregator_name

            aggregator_item_value['aggregate_fields'].each do |field_name, field_data|
              field_data.each do |operation, vector|
                # min/max/mean/median are recomputed over the per-interval
                # results; every other operation falls back to the median of
                # the per-interval results.
                data = case operation
                       when 'max', 'min', 'mean', 'median'
                         vector.method(operation).call
                       else
                         vector.median
                       end
                aggregator_data["#{field_name}_#{operation}"] = data
              end
            end

            group_item_value['intervals'][s_interval].delete(aggregator_item_key)
            aggregate_data[s_interval] = [] if aggregate_data[s_interval].nil?
            aggregate_data[s_interval] << aggregator_data
          end
        end
      end

      aggregate_data unless aggregate_data.empty?
    end
  end
end
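Taken together, add_events buckets each record by its group fields and the smallest interval, and aggregate_events flushes expired buckets. The following is a minimal usage sketch, not part of the released package: the record fields 'hostname' and 'duration' and all values are illustrative, and keep_interval is passed explicitly because the released file defaults it to an undefined DEFAULT_KEEP_INTERVAL constant.

require 'dataoperations-aggregate'

aggregator = DataOperations::Aggregate.new(
  aggregation_names: ['sum', 'max', 'mean'],
  group_field_names: ['hostname'],
  aggregate_field_names: ['duration'],
  intervals: [10, 60],   # 60 must be a multiple of the first interval, 10
  keep_interval: 30,
  processing_mode: :batch
)

aggregator.add_events('timestamp' => '2020-04-12T10:00:01.000+00:00',
                      'hostname' => 'web01', 'duration' => 120)
aggregator.add_events('timestamp' => '2020-04-12T10:00:03.000+00:00',
                      'hostname' => 'web01', 'duration' => 80)

# In :batch mode every bucket flushes on the first call, keyed by interval:
# {"10"=>[{"timestamp"=>"...", "hostname"=>"web01", "processed"=>2,
#          "duration_sum"=>200, "duration_max"=>120, "duration_mean"=>100.0, ...}],
#  "60"=>[...]}
p aggregator.aggregate_events

In streaming mode (any processing_mode other than :batch) a bucket is held until interval + keep_interval seconds after its first event, so repeated calls, e.g. from a flush timer, emit each bucket once its window has safely passed.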
metadata
ADDED
@@ -0,0 +1,57 @@
--- !ruby/object:Gem::Specification
name: dataoperations-aggregate
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Victor Guillen
autorequire:
bindir: bin
cert_chain: []
date: 2020-04-12 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: descriptive_statistics
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
description: Aggregate data over time
email: vguillen_public@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/dataoperations-aggregate.rb
homepage: https://github.com/superguillen/dataoperations-aggregate
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.0.3
signing_key:
specification_version: 4
summary: Aggregate data
test_files: []
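The gemspec declares a single runtime dependency, descriptive_statistics, with no version constraint, so installing the gem resolves whatever version is current. An illustrative Gemfile entry:

# Gemfile
source 'https://rubygems.org'

gem 'dataoperations-aggregate', '0.0.1' # descriptive_statistics is pulled in as a runtime dependency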