geoptima 0.1.3 → 0.1.4

data/bin/csv_chart CHANGED
@@ -10,7 +10,7 @@ require 'geoptima/options'
  require 'fileutils'
  require 'geoptima/daterange'
 
- Geoptima::assert_version("0.1.3")
+ Geoptima::assert_version("0.1.4")
 
  Geoptima::Chart.available? || puts("No charting libraries available") || exit(-1)
 
  $export_dir = '.'
@@ -26,9 +26,7 @@ $files = Geoptima::Options.process_args do |option|
  option.S {$specfile = ARGV.shift}
  option.P {$diversity = ARGV.shift.to_f}
  option.W {$chart_width = ARGV.shift.to_i}
- option.T do
- $time_range = Geoptima::DateRange.from ARGV.shift
- end
+ option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
  end
 
  FileUtils.mkdir_p $export_dir
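The collapsed -T handler is behavior-preserving; only the block style changed. A minimal sketch of what the handler is fed, assuming Geoptima::DateRange.from accepts the same comma-separated "start,end" string that the hand-rolled parser removed from csv_merge (below) used to split and DateTime.parse:

    require 'geoptima/daterange'
    # Hypothetical range string; the format is inferred from the old csv_merge parser.
    range = Geoptima::DateRange.from "2012-01-15,2012-01-20"
    range.include?(DateTime.parse("2012-01-17"))   # => true, if from behaves as assumed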
@@ -148,8 +146,10 @@ class StatsManager
  end
 
  module Geoptima
+
+ # Class for original stats approach of creating a new 'column' from simple combinations of other columns
  class StatSpec
- attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups
+ attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups, :values
  def initialize(header,*fields,&block)
  @header = header
  @fields = fields
@@ -184,13 +184,13 @@ module Geoptima
  key = @group.call(time)
  ghead = "#{header} #{key}"
  @groups[key] = ghead
- stats_manager.add(map(fields),ghead,nil)
+ stats_manager.add(map_fields(fields),ghead,nil)
  end
  rescue ArgumentError
  puts "Error: Unable to process time field[#{time}]: #{$!}"
  end
  end
- stats_manager.add(map(fields),header,index)
+ stats_manager.add(map_fields(fields),header,index)
  end
  def div
  unless @div
@@ -224,18 +224,30 @@ module Geoptima
  val
  end
  end
- def map(values,filter=nil)
+ def prepare_values(values)
+ @values = []
  if @indices
  puts "StatSpec[#{self}]: #{options.inspect}" if($debug)
- vals = @indices.map{|i| values[i]}
- puts "\tVALUES: #{vals.inspect}" if($debug)
- (options[:filter] || {}).each do |field,expected|
+ @values = @indices.map{|i| values[i]}
+ puts "\tVALUES: #{values.inspect}" if($debug)
+ end
+ @values
+ end
+ def vals_for(values,filter={})
+ if @indices
+ prepare_values(values)
+ (options[:filter] || filter).each do |field,expected|
  puts "\t\tChecking if field #{field} is #{expected}" if($debug)
  puts "\t\tLooking for #{field} or #{event}.#{field} in #{@fields.inspect}" if($debug)
  hi = @fields.index(field.to_s) || @fields.index("#{event}.#{field}")
- puts "\t\t#{field} -> #{hi} -> #{hi && vals[hi]}" if($debug)
- return nil unless(hi && vals[hi] && (expected === vals[hi].downcase || vals[hi].downcase === expected.to_s.downcase))
+ puts "\t\t#{field} -> #{hi} -> #{hi && values[hi]}" if($debug)
+ return nil unless(hi && values[hi] && (expected === values[hi].downcase || values[hi].downcase === expected.to_s.downcase))
  end
+ values
+ end
+ end
+ def map_fields(values,filter={})
+ if vals = vals_for(values,filter)
  val = proc.nil? ? vals[0] : proc.call(*vals)
  puts "\tBLOCK MAP: #{vals.inspect} --> #{val.inspect}" if($debug)
  if options[:div]
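The map-to-map_fields split separates three concerns: prepare_values extracts the indexed columns, vals_for applies the optional :filter hash (matching each expected value against the downcased field via ===, so Strings and Regexps both work), and map_fields runs the stats block plus any :div scaling. A sketch of a spec-file entry exercising the :filter path; the event and field names are hypothetical, and it assumes stats accepts a trailing options Hash the way kpi does:

    # Count rows whose (hypothetical) call.status column equals 'drop'.
    stats 'Dropped Calls', 'call.status', :filter => {:status => 'drop'} do |status|
      1
    end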
@@ -266,6 +278,128 @@ module Geoptima
  "#{header}[#{index}]<-#{fields.inspect}(#{indices && indices.join(',')})"
  end
  end
+
+ class Group
+ attr_reader :name, :options, :proc, :is_time, :index
+ def initialize(name,options={},&block)
+ @name = name
+ @options = options
+ @proc = block
+ @is_time = options[:is_time]
+ end
+ def index= (ind)
+ puts "Set group header index=#{ind} for group '#{name}'"
+ @index = ind
+ end
+ def call(time,values)
+ is_time && @proc.call(time) || @proc.call(values[index])
+ end
+ end
+
+ # The KPI class allows for complex statistics called 'Key Performance Indicators'.
+ # These are specified using four functions:
+ # filter: how to choose rows to include in the statistics (default is '!map.nil?')
+ # map: how to convert a row into the internal stats (default is input columns)
+ # aggregate: how to aggregate internal stats to higher levels (eg. daily, default is count)
+ # reduce: how to extract presentable stats from internal stats (eg. avg=total/count, default is internal stats)
+ #
+ # The KPI is defined with a name and set of columns to use, followed by the block
+ # defining the four functions above. For example:
+ #
+ # kpi 'DNS Success', 'dnsLookup.address', 'dnsLookup.error', 'dnsLookup.interface' do |f|
+ # f.filter {|addr,err,int| addr =~/\w/}
+ # f.map {|addr,err,int| err.length==0 ? [1,1] : [1,0]}
+ # f.aggregate {|a,v| a[0]+=v[0];a[1]+=v[1];a}
+ # f.reduce {|a| 100.0*a[1].to_f/a[0].to_f}
+ # end
+ #
+ # Currently this class extends StatSpec for access to the prepare_indices method.
+ # We should consider moving that to a mixin, or deprecating the StatSpec class
+ # entirely, since KPISpec should provide a superset of its features.
+ class KPISpec < StatSpec
+ def initialize(header,*fields,&block)
+ @header = header
+ @fields = fields
+ @event = @fields[0].split(/\./)[0]
+ block.call self unless(block.nil?)
+ if @fields[-1].is_a?(Hash)
+ @options = @fields.pop
+ else
+ @options = {}
+ end
+ @group_procs = []
+ @groups = {}
+ if @options[:group]
+ [@options[:group]].flatten.compact.sort.uniq.each do |group_name|
+ gname = group_name.to_s.intern
+ case gname
+ when :months
+ group_by(gname,true) {|t| t.strftime("%Y-%m")}
+ when :weeks
+ group_by(gname,true) {|t| t.strftime("%Y w%W")}
+ when :days
+ group_by(gname,true) {|t| t.strftime("%Y-%m-%d")}
+ when :hours
+ group_by(gname,true) {|t| t.strftime("%Y-%m-%d %H")}
+ else
+ group_by(gname) {|f| f}
+ end
+ end
+ end
+ puts "Created KPISpec: #{self}"
+ end
+ def group_by(field,is_time=false,&block)
+ @group_procs << Group.new(field,:is_time => is_time,&block)
+ end
+ def filter(&block)
+ @filter_proc = block
+ end
+ def map(&block)
+ @map_proc = block
+ end
+ def aggregate(&block)
+ @aggregate_proc = block
+ end
+ def reduce(&block)
+ @reduce_proc = block
+ end
+ def add(stats_manager,values)
+ prepare_values(values)
+ if @group_procs.length > 0
+ begin
+ time = DateTime.parse(values[stats_manager.time_index])
+ if $time_range.nil? || $time_range.include?(time)
+ key = nil
+ ghead = @group_procs.inject(header) do |ghd,group|
+ key = group.call(time,values)
+ ghd + " #{key}"
+ end
+ @groups[key] = ghead
+ stats_manager.add(map_fields(@values),ghead,nil)
+ end
+ rescue ArgumentError
+ puts "Error: Unable to process time field[#{time}]: #{$!}"
+ end
+ end
+ stats_manager.add(map_fields(@values),header,index)
+ end
+ def map_fields(values,filter=nil)
+ if values
+ if @filter_proc.nil? || @filter_proc.call(*values)
+ val = @map_proc && @map_proc.call(*values) || values[0]
+ puts "\tBLOCK MAP: #{values.inspect} --> #{val.inspect}" if($debug)
+ end
+ val
+ end
+ end
+ def prepare_indices(stats_manager,headers)
+ super(stats_manager,headers)
+ @group_procs.each do |g|
+ g.index = fields.index(g.name)
+ end
+ end
+ end
+
+ # Class for specifications of individual charts
  class ChartSpec
  attr_reader :chart_type, :header, :options
  def initialize(header,options={})
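Group and KPISpec together let a spec file declare a KPI once and have it bucketed per month, week, day or hour. A spec-file sketch assembled from the doc comment above; the dnsLookup fields come straight from that comment, while passing :group => :days as a trailing option is an assumption based on KPISpec popping a final Hash into @options and reading @options[:group]:

    kpi 'DNS Success', 'dnsLookup.address', 'dnsLookup.error', 'dnsLookup.interface',
        :group => :days do |f|
      f.filter    {|addr,err,int| addr =~ /\w/}                  # keep rows with an address
      f.map       {|addr,err,int| err.length==0 ? [1,1] : [1,0]} # [attempts, successes]
      f.aggregate {|a,v| a[0]+=v[0]; a[1]+=v[1]; a}              # sum the pairs per bucket
      f.reduce    {|a| 100.0*a[1].to_f/a[0].to_f}                # success percentage
    end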
@@ -328,14 +462,15 @@ module Geoptima
  g.write("#{$export_dir}/Chart_#{stats_manager.name}_#{header}_#{chart_type}_distribution.png")
  end
  def to_s
- "#{chart_type.upcase}-#{header}"
+ "#{chart_type.to_s.upcase}-#{header}"
  end
  end
  class StatsSpecs
- attr_reader :chart_specs, :stat_specs
+ attr_reader :chart_specs, :stat_specs, :kpi_specs
  def initialize(specfile)
  @chart_specs = []
  @stat_specs = []
+ @kpi_specs = []
  instance_eval(File.open(specfile).read)
  end
  def category_chart(header,options={})
@@ -353,10 +488,16 @@ module Geoptima
  def stats(header,*fields,&block)
  @stat_specs << StatSpec.new(header,*fields,&block)
  end
+ def kpi(header,*fields,&block)
+ @kpi_specs << KPISpec.new(header,*fields,&block)
+ end
  def add_stats(stats_manager,headers)
  stat_specs.each do |stat_spec|
  stat_spec.prepare_indices(stats_manager,headers)
  end
+ kpi_specs.each do |kpi_spec|
+ kpi_spec.prepare_indices(stats_manager,headers)
+ end
  end
  def add_fields(stats_manager,fields)
  puts "Adding fields to #{stat_specs.length} StatSpec's" if($debug)
@@ -364,9 +505,14 @@ module Geoptima
  puts "Adding fields to StatSpec: #{stat_spec}" if($debug)
  stat_spec.add(stats_manager,fields)
  end
+ puts "Adding fields to #{kpi_specs.length} KPISpec's" if($debug)
+ kpi_specs.each do |kpi_spec|
+ puts "Adding fields to KPISpec: #{kpi_spec}" if($debug)
+ kpi_spec.add(stats_manager,fields)
+ end
  end
  def to_s
- "Stats[#{@stat_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
+ "Stats[#{@stat_specs.join(', ')}] AND KPIs[#{@kpi_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
  end
  end
  end
@@ -468,7 +614,11 @@ end
  $stats_managers.each do |name,stats_manager|
  if $specs
  $specs.chart_specs.each do |chart_spec|
- chart_spec.process(stats_manager)
+ begin
+ chart_spec.process(stats_manager)
+ rescue NoMethodError
+ puts "Failed to process chart '#{chart_spec}': #{$!}"
+ end
  end
  end
  if $create_all
data/bin/csv_merge CHANGED
@@ -9,20 +9,24 @@ require 'geoptima/options'
  require 'fileutils'
  require 'geoptima/daterange'
 
- Geoptima::assert_version("0.1.3")
+ Geoptima::assert_version("0.1.4")
 
  $export_dir = '.'
  $export_name = 'merged.csv'
+ $split_by = :days
 
  $files = Geoptima::Options.process_args do |option|
  option.t {$time_split = true}
+ option.m {$low_memory = true}
  option.D {$export_dir = ARGV.shift}
  option.N {$export_name = ARGV.shift}
- option.T do
- $time_range = Geoptima::DateRange.new(*(ARGV.shift.split(/[\,]+/).map do |t|
- DateTime.parse t
- end))
+ option.S do
+ $split_by = case ARGV.shift.downcase.intern
+ when :days ; :days
+ else :days
+ end
  end
+ option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
  end
 
  FileUtils.mkdir_p $export_dir
@@ -30,46 +34,216 @@ FileUtils.mkdir_p $export_dir
  $help = true unless($files.length>0)
  if $help
  puts <<EOHELP
- Usage: csv_chart <-dht> <-N name> <-D dir> <-T range> files...
+ Usage: csv_merge <-dhtm> <-N name> <-D dir> <-T range> <-S split_by> files...
  -d debug mode #{cw $debug}
  -h print this help #{cw $help}
- -t merge and split by time (days) #{cw $time_split}
- -N use specified name for merged dataset: #{$merged_name}
+ -t merge and split by time (#{$split_by}) #{cw $time_split}
+ -m use low memory, temporarily storing to intermediate files #{cw $low_memory}
+ -N use specified name for merged dataset: #{$export_name}
  -D export to specified directory: #{$export_dir}
+ -S time units to split exports by: #{$split_by}
  -T set time-range filter: #{$time_range}
  Files to import: #{$files.join(', ')}
  EOHELP
  exit
  end
 
- class CSVData
- attr_reader :headers
- def initialize(headers)
+ class CSVRecord
+ attr_reader :time, :fields, :day
+ def initialize(fields,time_index=0)
+ @fields = fields
+ @time = DateTime.parse(fields[time_index])
+ @day = @time.strftime("%Y-%m-%d")
+ end
+ def [](index)
+ fields[index]
+ end
+ def <=>(other)
+ time <=> other.time
+ end
+ def within(time_range)
+ time_range.nil? || time_range.include?(time)
  end
  end
 
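CSVRecord wraps one tab-separated row, parsing the timestamp column once so that day-bucketing, sorting and range-filtering stay cheap. A small sketch of its contract; the row values are invented:

    # Column 0 holds the timestamp, matching the default time_index=0.
    record = CSVRecord.new(["2012-01-15T09:30:00", "call", "drop"])
    record.day           # => "2012-01-15"
    record.within(nil)   # => true; a nil range means no filtering
    record[2]            # => "drop"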
- $files.each do |file|
- lines = 0
- headers = nil
- filename = File.basename(file)
- (names = filename.split(/[_\.]/)).pop
- name = $merge_all ? ($merged_name || 'All') : names.join('_')
- $stats_managers[name] ||= StatsManager.new(name)
- puts "About to read file #{file}"
- File.open(file).each do |line|
- lines += 1
- fields=line.chomp.split(/\t/)
- if headers
- puts "Processing line: #{line}" if($debug)
- $stats_managers[name].add_all(fields,headers)
+ class CSVDataset
+ attr_reader :filename, :headers, :day_map, :lines, :count, :record_creation_duration
+ def initialize(filename)
+ @filename = filename
+ @lines = []
+ @day_map = {}
+ @record_creation_duration = 0
+ @count = 0
+ @headers = nil
+ read_file do |fields|
+ add fields
+ end
+ end
+ def read_file
+ lines = 0
+ File.open(filename).each do |line|
+ fields=line.chomp.split(/\t/)
+ if lines > 0
+ puts "Processing line: #{line}" if($debug)
+ yield fields
+ else
+ if fields.length<2
+ puts "Too few headers, rejecting #{filename}"
+ break
+ end
+ @headers ||= fields
+ end
+ lines += 1
+ end
+ @export_headers ||= @headers
+ end
+ def add(fields)
+ start_time = Time.new
+ line = create_line(fields)
+ if line
+ @day_map[line.day] ||= 0
+ @day_map[line.day] += 1
+ @lines << line unless($low_memory)
+ @count += 1
+ @record_creation_duration += Time.new - start_time
+ end
+ line
+ end
+ def create_line(fields)
+ begin
+ line = CSVRecord.new(fields,0)
+ if(line.within($time_range))
+ line
+ else
+ nil
+ end
+ rescue ArgumentError
+ puts "Failed to parse line with timestamp='#{fields[0]}': #{$!}"
+ end
+ end
+ def header_map(eh=nil)
+ if eh
+ @export_headers = eh
+ @header_map = nil
+ end
+ unless @header_map
+ @header_map = []
+ (@export_headers || @headers).each do |head|
+ @header_map << @headers.index(head)
+ end
+ end
+ @header_map
+ end
+ def map_line(line)
+ @header_map.map do |index|
+ index && line[index]
+ end
+ end
+ def days
+ @day_map.keys.sort
+ end
+ def each(eh=nil)
+ header_map(eh)
+ if $low_memory
+ read_file do |fields|
+ line = create_line fields
+ yield line.day,map_line(line) if line
+ end
  else
- headers = fields
- if headers.length<2
- puts "Too few headers, rejecting #{file}"
- break
+ (@lines || []).each do |line|
+ yield line.day,map_line(line)
  end
- $stats_managers[name].set_headers(headers)
  end
  end
+ def <=>(other)
+ self.filename <=> other.filename
+ end
+ def length
+ count
+ end
  end
 
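CSVDataset#each is where the -m flag pays off: with $low_memory set it re-reads and re-parses the file on every pass instead of holding CSVRecord objects, trading CPU for memory. The header_map indirection is what lets files with different columns merge, since a nil index yields a nil cell for any column this file lacks. A minimal sketch, with hypothetical filename and headers:

    ds = CSVDataset.new("geoptima_device1.csv")   # parses rows, builds day_map
    ds.days                                       # => e.g. ["2012-01-15", "2012-01-16"]
    ds.each(%w[timestamp event]) do |day, row|    # remap rows onto a merged header set
      puts "#{day}: #{row.join(',')}"
    end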
+ class CSVDatasets
+ attr_reader :datasets
+ def initialize
+ @datasets = []
+ end
+ def add_file(file)
+ lines = 0
+ dataset = nil
+ filename = File.basename(file)
+ (names = filename.split(/[_\.]/)).pop
+ name = names.join('_')
+ puts "About to read file #{file}"
+ dataset = CSVDataset.new(file)
+ @datasets << dataset if(dataset && dataset.length>0)
+ dataset
+ end
+ def export_days
+ headers = @datasets.map{|d| d.headers}.flatten.uniq
+ days = @datasets.map{|d| d.days}.flatten.sort.uniq
+ day_files = {}
+ day_names = {}
+ count = {}
+ duration = {}
+ days.each do |day|
+ filename = "#{$export_dir}/#{$export_name.gsub(/\.csv$/,'')}_#{day}.csv"
+ puts "Exporting #{filename} for #{day}"
+ day_names[day] = filename
+ day_files[day] = File.open(filename,'w')
+ day_files[day].puts headers.join("\t")
+ count[day] = 0
+ duration[day] = 0
+ end
+ @datasets.sort.each do |dataset|
+ dataset.each(headers) do |day,line|
+ start_time = Time.new
+ day_files[day].puts line.join("\t")
+ duration[day] += Time.new - start_time
+ count[day] += 1
+ end
+ end
+ day_files.each do |day,out|
+ out.close
+ puts "\tExported #{count[day]} records to #{day_names[day]} in #{duration[day]} seconds"
+ end
+ end
+ def export_merged
+ headers = @datasets.map{|d| d.headers}.flatten.sort.uniq
+ filename = "#{$export_dir}/#{$export_name}"
+ File.open(filename,'w') do |out|
+ out.puts headers.join("\t")
+ @datasets.sort.each do |dataset|
+ dataset.each(headers) do |day,line|
+ out.puts line.join("\t")
+ end
+ end
+ end
+ end
+ end
+
+ $datasets = CSVDatasets.new
+
+ $files.each do |file|
+ start_time = Time.new
+ ds = $datasets.add_file(file)
+ duration = Time.new - start_time
+ puts "\tLoaded #{file} in #{duration} seconds"
+ puts "\t#{(100.0 * ds.record_creation_duration.to_f/duration.to_f).to_i}% = #{ds.record_creation_duration}/#{duration} was spent creating records"
+ puts "\tFile contained #{ds.length} events for #{ds.days.length} days:"
+ ds.days.each do |day|
+ puts "\t\t#{day}: \t#{(100.0 * ds.day_map[day].to_f/ds.length.to_f).to_i}%\t#{ds.day_map[day]} records"
+ end
+ end
+
+ start_time = Time.new
+
+ if $time_split
+ $datasets.export_days
+ else
+ $datasets.export_merged
+ end
+
+ duration = Time.new - start_time
+ puts "Exported in #{duration} seconds"
+
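For reference, the -t path writes one file per day using the naming scheme in export_days. A hypothetical run over two days of data:

    #   csv_merge -t -m -S days -D exports -N merged.csv device1.csv device2.csv
    # produces, per the gsub in export_days:
    #   exports/merged_2012-01-15.csv
    #   exports/merged_2012-01-16.csv
    # each beginning with the union of all input headers joined by tabs.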