geoptima 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/csv_chart CHANGED
@@ -10,7 +10,7 @@ require 'geoptima/options'
10
10
  require 'fileutils'
11
11
  require 'geoptima/daterange'
12
12
 
13
- Geoptima::assert_version("0.1.3")
13
+ Geoptima::assert_version("0.1.4")
14
14
  Geoptima::Chart.available? || puts("No charting libraries available") || exit(-1)
15
15
 
16
16
  $export_dir = '.'
@@ -26,9 +26,7 @@ $files = Geoptima::Options.process_args do |option|
26
26
  option.S {$specfile = ARGV.shift}
27
27
  option.P {$diversity = ARGV.shift.to_f}
28
28
  option.W {$chart_width = ARGV.shift.to_i}
29
- option.T do
30
- $time_range = Geoptima::DateRange.from ARGV.shift
31
- end
29
+ option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
32
30
  end
33
31
 
34
32
  FileUtils.mkdir_p $export_dir
@@ -148,8 +146,10 @@ class StatsManager
148
146
  end
149
147
 
150
148
  module Geoptima
149
+
150
+ # Class for original stats approach of creating a new 'column' from simple combinations of other columns
151
151
  class StatSpec
152
- attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups
152
+ attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups, :values
153
153
  def initialize(header,*fields,&block)
154
154
  @header = header
155
155
  @fields = fields
@@ -184,13 +184,13 @@ module Geoptima
184
184
  key = @group.call(time)
185
185
  ghead = "#{header} #{key}"
186
186
  @groups[key] = ghead
187
- stats_manager.add(map(fields),ghead,nil)
187
+ stats_manager.add(map_fields(fields),ghead,nil)
188
188
  end
189
189
  rescue ArgumentError
190
190
  puts "Error: Unable to process time field[#{time}]: #{$!}"
191
191
  end
192
192
  end
193
- stats_manager.add(map(fields),header,index)
193
+ stats_manager.add(map_fields(fields),header,index)
194
194
  end
195
195
  def div
196
196
  unless @div
@@ -224,18 +224,30 @@ module Geoptima
224
224
  val
225
225
  end
226
226
  end
227
- def map(values,filter=nil)
227
+ def prepare_values(values)
228
+ @values = []
228
229
  if @indices
229
230
  puts "StatSpec[#{self}]: #{options.inspect}" if($debug)
230
- vals = @indices.map{|i| values[i]}
231
- puts "\tVALUES: #{vals.inspect}" if($debug)
232
- (options[:filter] || {}).each do |field,expected|
231
+ @values = @indices.map{|i| values[i]}
232
+ puts "\tVALUES: #{values.inspect}" if($debug)
233
+ end
234
+ @values
235
+ end
236
+ def vals_for(values,filter={})
237
+ if @indices
238
+ prepare_values(values)
239
+ (options[:filter] || filter).each do |field,expected|
233
240
  puts "\t\tChecking if field #{field} is #{expected}" if($debug)
234
241
  puts "\t\tLooking for #{field} or #{event}.#{field} in #{@fields.inspect}" if($debug)
235
242
  hi = @fields.index(field.to_s) || @fields.index("#{event}.#{field}")
236
- puts "\t\t#{field} -> #{hi} -> #{hi && vals[hi]}" if($debug)
237
- return nil unless(hi && vals[hi] && (expected === vals[hi].downcase || vals[hi].downcase === expected.to_s.downcase))
243
+ puts "\t\t#{field} -> #{hi} -> #{hi && values[hi]}" if($debug)
244
+ return nil unless(hi && values[hi] && (expected === values[hi].downcase || values[hi].downcase === expected.to_s.downcase))
238
245
  end
246
+ values
247
+ end
248
+ end
249
+ def map_fields(values,filter={})
250
+ if vals = vals_for(values,filter)
239
251
  val = proc.nil? ? vals[0] : proc.call(*vals)
240
252
  puts "\tBLOCK MAP: #{vals.inspect} --> #{val.inspect}" if($debug)
241
253
  if options[:div]
@@ -266,6 +278,128 @@ module Geoptima
266
278
  "#{header}[#{index}]<-#{fields.inspect}(#{indices && indices.join(',')})"
267
279
  end
268
280
  end
281
+
282
+ class Group
283
+ attr_reader :name, :options, :proc, :is_time, :index
284
+ def initialize(name,options={},&block)
285
+ @name = name
286
+ @options = options
287
+ @proc = block
288
+ @is_time = options[:is_time]
289
+ end
290
+ def index= (ind)
291
+ puts "Set group header index=#{ind} for group '#{name}'"
292
+ @index = ind
293
+ end
294
+ def call(time,values)
295
+ is_time && @proc.call(time) || @proc.call(values[index])
296
+ end
297
+ end
298
+
299
+ # The KPI class allows for complex statistics called 'Key Performance Indicators'.
300
+ # These are specified using four functions:
301
+ # filter: how to choose rows to include in the statistics (default is '!map.nil?')
302
+ # map: how to convert a row into the internal stats (default is input columns)
303
+ # aggregate: how to aggregate internal stats to higher levels (eg. daily, default is count)
304
+ # reduce: how to extract presentable stats from internal stats (eg. avg=total/count, default is internal stats)
305
+ #
306
+ # The KPI is defined with a name and set of columns to use, followed by the block
307
+ # defining the four functions above. For example:
308
+ #
309
+ # kpi 'DNS Success', 'dnsLookup.address', 'dnsLookup.error', 'dnsLookup.interface' do |f|
310
+ # f.filter {|addr,err,int| addr =~/\w/}
311
+ # f.map {|addr,err,int| err.length==0 ? [1,1] : [1,0]}
312
+ # f.aggregate {|a,v| a[0]+=v[0];a[1]+=v[1];a}
313
+ # f.reduce {|a| 100.0*a[1].to_f/a[0].to_f}
314
+ # end
315
+ #
316
+ # Currently this class extends StatSpec for access to the prepare_indices method.
317
+ # We should consider moving that to a mixin, or deprecating the StatSpec class
318
+ # entirely since KPISpec should provide a superset of features.
319
+ class KPISpec < StatSpec
320
+ def initialize(header,*fields,&block)
321
+ @header = header
322
+ @fields = fields
323
+ @event = @fields[0].split(/\./)[0]
324
+ block.call self unless(block.nil?)
325
+ if @fields[-1].is_a?(Hash)
326
+ @options = @fields.pop
327
+ else
328
+ @options = {}
329
+ end
330
+ @group_procs = []
331
+ @groups = {}
332
+ if @options[:group]
333
+ [@options[:group]].flatten.compact.sort.uniq.each do |group_name|
334
+ gname = group_name.to_s.intern
335
+ case gname
336
+ when :months
337
+ group_by(gname,true) {|t| t.strftime("%Y-%m")}
338
+ when :weeks
339
+ group_by(gname,true) {|t| t.strftime("%Y w%W")}
340
+ when :days
341
+ group_by(gname,true) {|t| t.strftime("%Y-%m-%d")}
342
+ when :hours
343
+ group_by(gname,true) {|t| t.strftime("%Y-%m-%d %H")}
344
+ else
345
+ group_by(gname) {|f| f}
346
+ end
347
+ end
348
+ end
349
+ puts "Created StatSpec: #{self}"
350
+ end
351
+ def group_by(field,is_time=false,&block)
352
+ @group_procs = Group.new(field,:is_time => is_time,&block)
353
+ end
354
+ def filter(&block)
355
+ @filter_proc = block
356
+ end
357
+ def map(&block)
358
+ @map_proc = block
359
+ end
360
+ def aggregate(&block)
361
+ @aggregate_proc = block
362
+ end
363
+ def reduce(&block)
364
+ @reduce_proc = block
365
+ end
366
+ def add(stats_manager,values)
367
+ prepare_values(values)
368
+ if @group_procs.length > 0
369
+ begin
370
+ time = DateTime.parse(values[stats_manager.time_index])
371
+ if $time_range.nil? || $time_range.include?(time)
372
+ key = @group_procs.inject(header) do |ghead,group|
373
+ key = @group.call(time,values)
374
+ ghead += " #{key}"
375
+ end
376
+ @groups[key] = ghead
377
+ stats_manager.add(map_fields(fields),ghead,nil)
378
+ end
379
+ rescue ArgumentError
380
+ puts "Error: Unable to process time field[#{time}]: #{$!}"
381
+ end
382
+ end
383
+ stats_manager.add(map_fields(fields),header,index)
384
+ end
385
+ def map_fields(values,filter=nil)
386
+ if values
387
+ if @filter_proc.nil? || @filter_proc.call(*values)
388
+ val = @map_proc && @map_proc.call(*values) || values[0]
389
+ puts "\tBLOCK MAP: #{values.inspect} --> #{values.inspect}" if($debug)
390
+ end
391
+ val
392
+ end
393
+ end
394
+ def prepare_indices(stats_manager,headers)
395
+ super(stats_manager,headers)
396
+ @group_procs.each do |g|
397
+ g.index = fields.index(g.name)
398
+ end
399
+ end
400
+ end
401
+
402
+ # Class for specifications of individual charts
269
403
  class ChartSpec
270
404
  attr_reader :chart_type, :header, :options
271
405
  def initialize(header,options={})
@@ -328,14 +462,15 @@ module Geoptima
328
462
  g.write("#{$export_dir}/Chart_#{stats_manager.name}_#{header}_#{chart_type}_distribution.png")
329
463
  end
330
464
  def to_s
331
- "#{chart_type.upcase}-#{header}"
465
+ "#{chart_type.to_s.upcase}-#{header}"
332
466
  end
333
467
  end
334
468
  class StatsSpecs
335
- attr_reader :chart_specs, :stat_specs
469
+ attr_reader :chart_specs, :stat_specs, :kpi_specs
336
470
  def initialize(specfile)
337
471
  @chart_specs = []
338
472
  @stat_specs = []
473
+ @kpi_specs = []
339
474
  instance_eval(File.open(specfile).read)
340
475
  end
341
476
  def category_chart(header,options={})
@@ -353,10 +488,16 @@ module Geoptima
353
488
  def stats(header,*fields,&block)
354
489
  @stat_specs << StatSpec.new(header,*fields,&block)
355
490
  end
491
+ def kpi(header,*fields,&block)
492
+ @kpi_specs << KPISpec.new(header,*fields,&block)
493
+ end
356
494
  def add_stats(stats_manager,headers)
357
495
  stat_specs.each do |stat_spec|
358
496
  stat_spec.prepare_indices(stats_manager,headers)
359
497
  end
498
+ kpi_specs.each do |kpi_spec|
499
+ kpi_spec.prepare_indices(stats_manager,headers)
500
+ end
360
501
  end
361
502
  def add_fields(stats_manager,fields)
362
503
  puts "Adding fields to #{stat_specs.length} StatSpec's" if($debug)
@@ -364,9 +505,14 @@ module Geoptima
364
505
  puts "Adding fields to StatSpec: #{stat_spec}" if($debug)
365
506
  stat_spec.add(stats_manager,fields)
366
507
  end
508
+ puts "Adding fields to #{kpi_specs.length} KPISpec's" if($debug)
509
+ kpi_specs.each do |kpi_spec|
510
+ puts "Adding fields to KPISpec: #{kpi_spec}" if($debug)
511
+ kpi_spec.add(stats_manager,fields)
512
+ end
367
513
  end
368
514
  def to_s
369
- "Stats[#{@stat_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
515
+ "Stats[#{@stat_specs.join(', ')}] AND KPIs[#{@kpi_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
370
516
  end
371
517
  end
372
518
  end
@@ -468,7 +614,11 @@ end
468
614
  $stats_managers.each do |name,stats_manager|
469
615
  if $specs
470
616
  $specs.chart_specs.each do |chart_spec|
471
- chart_spec.process(stats_manager)
617
+ begin
618
+ chart_spec.process(stats_manager)
619
+ rescue NoMethodError
620
+ puts "Failed to process chart '#{chart_spec}': #{$!}"
621
+ end
472
622
  end
473
623
  end
474
624
  if $create_all
data/bin/csv_merge CHANGED
@@ -9,20 +9,24 @@ require 'geoptima/options'
9
9
  require 'fileutils'
10
10
  require 'geoptima/daterange'
11
11
 
12
- Geoptima::assert_version("0.1.3")
12
+ Geoptima::assert_version("0.1.4")
13
13
 
14
14
  $export_dir = '.'
15
15
  $export_name = 'merged.csv'
16
+ $split_by = :days
16
17
 
17
18
  $files = Geoptima::Options.process_args do |option|
18
19
  option.t {$time_split = true}
20
+ option.m {$low_memory = true}
19
21
  option.D {$export_dir = ARGV.shift}
20
22
  option.N {$export_name = ARGV.shift}
21
- option.T do
22
- $time_range = Geoptima::DateRange.new(*(ARGV.shift.split(/[\,]+/).map do |t|
23
- DateTime.parse t
24
- end))
23
+ option.S do
24
+ $split_by = case ARGV.shift.downcase.intern
25
+ when :days ; :days
26
+ else :days
27
+ end
25
28
  end
29
+ option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
26
30
  end
27
31
 
28
32
  FileUtils.mkdir_p $export_dir
@@ -30,46 +34,216 @@ FileUtils.mkdir_p $export_dir
30
34
  $help = true unless($files.length>0)
31
35
  if $help
32
36
  puts <<EOHELP
33
- Usage: csv_chart <-dht> <-N name> <-D dir> <-T range> files...
37
+ Usage: csv_merge <-dhtm> <-N name> <-D dir> <-T range> <-S split_by> files...
34
38
  -d debug mode #{cw $debug}
35
39
  -h print this help #{cw $help}
36
- -t merge and split by time (days) #{cw $time_split}
37
- -N use specified name for merged dataset: #{$merged_name}
40
+ -t merge and split by time (#{$split_by}) #{cw $time_split}
41
+ -m use low memory, temporarily storing to intermediate files #{cw $low_memory}
42
+ -N use specified name for merged dataset: #{$export_name}
38
43
  -D export to specified directory: #{$export_dir}
44
+ -S time units to split exports by: #{$split_by}
39
45
  -T set time-range filter: #{$time_range}
40
46
  Files to import: #{$files.join(', ')}
41
47
  EOHELP
42
48
  exit
43
49
  end
44
50
 
45
- class CSVData
46
- attr_reader :headers
47
- def initialize(headers)
51
+ class CSVRecord
52
+ attr_reader :time, :fields, :day
53
+ def initialize(fields,time_index=0)
54
+ @fields = fields
55
+ @time = DateTime.parse(fields[time_index])
56
+ @day = @time.strftime("%Y-%m-%d")
57
+ end
58
+ def [](index)
59
+ fields[index]
60
+ end
61
+ def <=>(other)
62
+ time <=> other
63
+ end
64
+ def within(time_range)
65
+ time_range.nil? || time_range.include?(time)
48
66
  end
49
67
  end
50
68
 
51
- $files.each do |file|
52
- lines = 0
53
- headers = nil
54
- filename = File.basename(file)
55
- (names = filename.split(/[_\.]/)).pop
56
- name = $merge_all ? ($merged_name || 'All') : names.join('_')
57
- $stats_managers[name] ||= StatsManager.new(name)
58
- puts "About to read file #{file}"
59
- File.open(file).each do |line|
60
- lines += 1
61
- fields=line.chomp.split(/\t/)
62
- if headers
63
- puts "Processing line: #{line}" if($debug)
64
- $stats_managers[name].add_all(fields,headers)
69
+ class CSVDataset
70
+ attr_reader :filename, :headers, :day_map, :lines, :count, :record_creation_duration
71
+ def initialize(filename)
72
+ @filename = filename
73
+ @lines = []
74
+ @day_map = {}
75
+ @record_creation_duration = 0
76
+ @count = 0
77
+ @headers = nil
78
+ read_file do |fields|
79
+ add fields
80
+ end
81
+ end
82
+ def read_file
83
+ lines = 0
84
+ File.open(filename).each do |line|
85
+ fields=line.chomp.split(/\t/)
86
+ if lines > 0
87
+ puts "Processing line: #{line}" if($debug)
88
+ yield fields
89
+ else
90
+ if fields.length<2
91
+ puts "Too few headers, rejecting #{file}"
92
+ break
93
+ end
94
+ @headers ||= fields
95
+ end
96
+ lines += 1
97
+ end
98
+ @export_headers ||= @headers
99
+ end
100
+ def add(fields)
101
+ start_time = Time.new
102
+ line = create_line(fields)
103
+ if line
104
+ @day_map[line.day] ||= 0
105
+ @day_map[line.day] += 1
106
+ @lines << line unless($low_memory)
107
+ @count += 1
108
+ @record_creation_duration += Time.new - start_time
109
+ end
110
+ line
111
+ end
112
+ def create_line(fields)
113
+ begin
114
+ line = CSVRecord.new(fields,0)
115
+ if(line.within($time_range))
116
+ line
117
+ else
118
+ nil
119
+ end
120
+ rescue ArgumentError
121
+ puts "Failed to parse line with timestamp='#{fields[0]}': #{$!}"
122
+ end
123
+ end
124
+ def header_map(eh=nil)
125
+ if eh
126
+ @export_headers = eh
127
+ @header_map = nil
128
+ end
129
+ unless @header_map
130
+ @header_map = []
131
+ (@export_headers || @headers).each do |head|
132
+ @header_map << @headers.index(head)
133
+ end
134
+ end
135
+ @header_map
136
+ end
137
+ def map_line(line)
138
+ @header_map.map do |index|
139
+ index && line[index]
140
+ end
141
+ end
142
+ def days
143
+ @day_map.keys.sort
144
+ end
145
+ def each(eh=nil)
146
+ header_map(eh)
147
+ if $low_memory
148
+ read_file do |fields|
149
+ line = create_line fields
150
+ yield line.day,map_line(line)
151
+ end
65
152
  else
66
- headers = fields
67
- if headers.length<2
68
- puts "Too few headers, rejecting #{file}"
69
- break
153
+ (@lines || []).each do |line|
154
+ yield line.day,map_line(line)
70
155
  end
71
- $stats_managers[name].set_headers(headers)
72
156
  end
73
157
  end
158
+ def <=>(other)
159
+ self.filename <=> other.filename
160
+ end
161
+ def length
162
+ count
163
+ end
74
164
  end
75
165
 
166
+ class CSVDatasets
167
+ attr_reader :datasets
168
+ def initialize
169
+ @datasets = []
170
+ end
171
+ def add_file(file)
172
+ lines = 0
173
+ dataset = nil
174
+ filename = File.basename(file)
175
+ (names = filename.split(/[_\.]/)).pop
176
+ name = names.join('_')
177
+ puts "About to read file #{file}"
178
+ dataset = CSVDataset.new(file)
179
+ @datasets << dataset if(dataset && dataset.length>0)
180
+ dataset
181
+ end
182
+ def export_days
183
+ headers = @datasets.map{|d| d.headers}.flatten.uniq
184
+ days = @datasets.map{|d| d.days}.flatten.sort.uniq
185
+ day_files = {}
186
+ day_names = {}
187
+ count = {}
188
+ duration = {}
189
+ days.each do |day|
190
+ filename = "#{$export_dir}/#{$export_name.gsub(/\.csv$/,'')}_#{day}.csv"
191
+ puts "Exporting #{filename} for #{day}"
192
+ day_names[day] = filename
193
+ day_files[day] = File.open(filename,'w')
194
+ day_files[day].puts headers.join("\t")
195
+ count[day] = 0
196
+ duration[day] = 0
197
+ end
198
+ @datasets.sort.each do |dataset|
199
+ dataset.each(headers) do |day,line|
200
+ start_time = Time.new
201
+ day_files[day].puts line.join("\t")
202
+ duration[day] += Time.new - start_time
203
+ count[day] += 1
204
+ end
205
+ end
206
+ day_files.each do |day,out|
207
+ out.close
208
+ puts "\tExported #{count[day]} records to #{day_names[day]} in #{duration[day]} seconds"
209
+ end
210
+ end
211
+ def export_merged
212
+ headers = @datasets.map{|d| d.headers}.flatten.sort.uniq
213
+ filename = "#{$export_dir}/#{$export_name}"
214
+ File.open(filename,'w') do |out|
215
+ out.puts headers.join("\t")
216
+ @datasets.sort.each(headers) do |dataset|
217
+ dataset.each do |day,line|
218
+ out.puts line.join("\t")
219
+ end
220
+ end
221
+ end
222
+ end
223
+ end
224
+
225
+ $datasets = CSVDatasets.new
226
+
227
+ $files.each do |file|
228
+ start_time = Time.new
229
+ ds = $datasets.add_file(file)
230
+ duration = Time.new - start_time
231
+ puts "\tLoaded #{file} in #{duration} seconds"
232
+ puts "\t#{(100.0 * ds.record_creation_duration.to_f/duration.to_f).to_i}% = #{ds.record_creation_duration}/#{duration} was spent creating records"
233
+ puts "\tFile contained #{ds.length} events for #{ds.days.length} days:"
234
+ ds.days.each do |day|
235
+ puts "\t\t#{day}: \t#{(100.0 * ds.day_map[day].to_f/ds.length.to_f).to_i}%\t#{ds.day_map[day]} records"
236
+ end
237
+ end
238
+
239
+ start_time = Time.new
240
+
241
+ if $time_split
242
+ $datasets.export_days
243
+ else
244
+ $datasets.export_merged
245
+ end
246
+
247
+ duration = Time.new - start_time
248
+ puts "Exported in #{duration} seconds"
249
+