geoptima 0.1.3 → 0.1.4

data/bin/csv_chart CHANGED
@@ -10,7 +10,7 @@ require 'geoptima/options'
  require 'fileutils'
  require 'geoptima/daterange'
 
- Geoptima::assert_version("0.1.3")
+ Geoptima::assert_version("0.1.4")
 
  Geoptima::Chart.available? || puts("No charting libraries available") || exit(-1)
 
  $export_dir = '.'
@@ -26,9 +26,7 @@ $files = Geoptima::Options.process_args do |option|
  option.S {$specfile = ARGV.shift}
  option.P {$diversity = ARGV.shift.to_f}
  option.W {$chart_width = ARGV.shift.to_i}
- option.T do
- $time_range = Geoptima::DateRange.from ARGV.shift
- end
+ option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
  end
 
  FileUtils.mkdir_p $export_dir
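The collapsed -T handler is behavior-preserving; only the block style changed. A minimal sketch of what the handler is fed, assuming Geoptima::DateRange.from accepts the same comma-separated "start,end" string that the hand-rolled parser removed from csv_merge (below) used to split and DateTime.parse:

    require 'geoptima/daterange'
    # Hypothetical range string; the format is inferred from the old csv_merge parser.
    range = Geoptima::DateRange.from "2012-01-15,2012-01-20"
    range.include?(DateTime.parse("2012-01-17"))   # => true, if from behaves as assumed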
@@ -148,8 +146,10 @@ class StatsManager
  end
 
  module Geoptima
+
+ # Class for original stats approach of creating a new 'column' from simple combinations of other columns
  class StatSpec
- attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups
+ attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups, :values
  def initialize(header,*fields,&block)
  @header = header
  @fields = fields
@@ -184,13 +184,13 @@ module Geoptima
  key = @group.call(time)
  ghead = "#{header} #{key}"
  @groups[key] = ghead
- stats_manager.add(map(fields),ghead,nil)
+ stats_manager.add(map_fields(fields),ghead,nil)
  end
  rescue ArgumentError
  puts "Error: Unable to process time field[#{time}]: #{$!}"
  end
  end
- stats_manager.add(map(fields),header,index)
+ stats_manager.add(map_fields(fields),header,index)
  end
  def div
  unless @div
@@ -224,18 +224,30 @@ module Geoptima
  val
  end
  end
- def map(values,filter=nil)
+ def prepare_values(values)
+ @values = []
  if @indices
  puts "StatSpec[#{self}]: #{options.inspect}" if($debug)
- vals = @indices.map{|i| values[i]}
- puts "\tVALUES: #{vals.inspect}" if($debug)
- (options[:filter] || {}).each do |field,expected|
+ @values = @indices.map{|i| values[i]}
+ puts "\tVALUES: #{values.inspect}" if($debug)
+ end
+ @values
+ end
+ def vals_for(values,filter={})
+ if @indices
+ prepare_values(values)
+ (options[:filter] || filter).each do |field,expected|
  puts "\t\tChecking if field #{field} is #{expected}" if($debug)
  puts "\t\tLooking for #{field} or #{event}.#{field} in #{@fields.inspect}" if($debug)
  hi = @fields.index(field.to_s) || @fields.index("#{event}.#{field}")
- puts "\t\t#{field} -> #{hi} -> #{hi && vals[hi]}" if($debug)
- return nil unless(hi && vals[hi] && (expected === vals[hi].downcase || vals[hi].downcase === expected.to_s.downcase))
+ puts "\t\t#{field} -> #{hi} -> #{hi && values[hi]}" if($debug)
+ return nil unless(hi && values[hi] && (expected === values[hi].downcase || values[hi].downcase === expected.to_s.downcase))
  end
+ values
+ end
+ end
+ def map_fields(values,filter={})
+ if vals = vals_for(values,filter)
  val = proc.nil? ? vals[0] : proc.call(*vals)
  puts "\tBLOCK MAP: #{vals.inspect} --> #{val.inspect}" if($debug)
  if options[:div]
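The map-to-map_fields split separates three concerns: prepare_values extracts the indexed columns, vals_for applies the optional :filter hash (matching each expected value against the downcased field via ===, so Strings and Regexps both work), and map_fields runs the stats block plus any :div scaling. A sketch of a spec-file entry exercising the :filter path; the event and field names are hypothetical, and it assumes stats accepts a trailing options Hash the way kpi does:

    # Count rows whose (hypothetical) call.status column equals 'drop'.
    stats 'Dropped Calls', 'call.status', :filter => {:status => 'drop'} do |status|
      1
    end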
@@ -266,6 +278,128 @@ module Geoptima
  "#{header}[#{index}]<-#{fields.inspect}(#{indices && indices.join(',')})"
  end
  end
+
+ class Group
+ attr_reader :name, :options, :proc, :is_time, :index
+ def initialize(name,options={},&block)
+ @name = name
+ @options = options
+ @proc = block
+ @is_time = options[:is_time]
+ end
+ def index= (ind)
+ puts "Set group header index=#{ind} for group '#{name}'"
+ @index = ind
+ end
+ def call(time,values)
+ is_time && @proc.call(time) || @proc.call(values[index])
+ end
+ end
+
+ # The KPI class allows for complex statistics called 'Key Performance Indicators'.
+ # These are specified using four functions:
+ # filter: how to choose rows to include in the statistics (default is '!map.nil?')
+ # map: how to convert a row into the internal stats (default is input columns)
+ # aggregate: how to aggregate internal stats to higher levels (eg. daily, default is count)
+ # reduce: how to extract presentable stats from internal stats (eg. avg=total/count, default is internal stats)
+ #
+ # The KPI is defined with a name and set of columns to use, followed by the block
+ # defining the four functions above. For example:
+ #
+ # kpi 'DNS Success', 'dnsLookup.address', 'dnsLookup.error', 'dnsLookup.interface' do |f|
+ # f.filter {|addr,err,int| addr =~/\w/}
+ # f.map {|addr,err,int| err.length==0 ? [1,1] : [1,0]}
+ # f.aggregate {|a,v| a[0]+=v[0];a[1]+=v[1];a}
+ # f.reduce {|a| 100.0*a[1].to_f/a[0].to_f}
+ # end
+ #
+ # Currently this class extends StatSpec for access to the prepare_indices method.
+ # We should consider moving that to a mixin, or deprecating the StatSpec class
+ # entirely, since KPISpec should provide a superset of its features.
+ class KPISpec < StatSpec
+ def initialize(header,*fields,&block)
+ @header = header
+ @fields = fields
+ @event = @fields[0].split(/\./)[0]
+ block.call self unless(block.nil?)
+ if @fields[-1].is_a?(Hash)
+ @options = @fields.pop
+ else
+ @options = {}
+ end
+ @group_procs = []
+ @groups = {}
+ if @options[:group]
+ [@options[:group]].flatten.compact.sort.uniq.each do |group_name|
+ gname = group_name.to_s.intern
+ case gname
+ when :months
+ group_by(gname,true) {|t| t.strftime("%Y-%m")}
+ when :weeks
+ group_by(gname,true) {|t| t.strftime("%Y w%W")}
+ when :days
+ group_by(gname,true) {|t| t.strftime("%Y-%m-%d")}
+ when :hours
+ group_by(gname,true) {|t| t.strftime("%Y-%m-%d %H")}
+ else
+ group_by(gname) {|f| f}
+ end
+ end
+ end
+ puts "Created KPISpec: #{self}"
+ end
+ def group_by(field,is_time=false,&block)
+ @group_procs << Group.new(field,:is_time => is_time,&block)
+ end
+ def filter(&block)
+ @filter_proc = block
+ end
+ def map(&block)
+ @map_proc = block
+ end
+ def aggregate(&block)
+ @aggregate_proc = block
+ end
+ def reduce(&block)
+ @reduce_proc = block
+ end
+ def add(stats_manager,values)
+ prepare_values(values)
+ if @group_procs.length > 0
+ begin
+ time = DateTime.parse(values[stats_manager.time_index])
+ if $time_range.nil? || $time_range.include?(time)
+ key = nil
+ ghead = @group_procs.inject(header) do |ghd,group|
+ key = group.call(time,values)
+ ghd + " #{key}"
+ end
+ @groups[key] = ghead
+ stats_manager.add(map_fields(@values),ghead,nil)
+ end
+ rescue ArgumentError
+ puts "Error: Unable to process time field[#{time}]: #{$!}"
+ end
+ end
+ stats_manager.add(map_fields(@values),header,index)
+ end
+ def map_fields(values,filter=nil)
+ if values
+ if @filter_proc.nil? || @filter_proc.call(*values)
+ val = @map_proc && @map_proc.call(*values) || values[0]
+ puts "\tBLOCK MAP: #{values.inspect} --> #{val.inspect}" if($debug)
+ end
+ val
+ end
+ end
+ def prepare_indices(stats_manager,headers)
+ super(stats_manager,headers)
+ @group_procs.each do |g|
+ g.index = fields.index(g.name)
+ end
+ end
+ end
+
+ # Class for specifications of individual charts
  class ChartSpec
  attr_reader :chart_type, :header, :options
  def initialize(header,options={})
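Group and KPISpec together let a spec file declare a KPI once and have it bucketed per month, week, day or hour. A spec-file sketch assembled from the doc comment above; the dnsLookup fields come straight from that comment, while passing :group => :days as a trailing option is an assumption based on KPISpec popping a final Hash into @options and reading @options[:group]:

    kpi 'DNS Success', 'dnsLookup.address', 'dnsLookup.error', 'dnsLookup.interface',
        :group => :days do |f|
      f.filter    {|addr,err,int| addr =~ /\w/}                  # keep rows with an address
      f.map       {|addr,err,int| err.length==0 ? [1,1] : [1,0]} # [attempts, successes]
      f.aggregate {|a,v| a[0]+=v[0]; a[1]+=v[1]; a}              # sum the pairs per bucket
      f.reduce    {|a| 100.0*a[1].to_f/a[0].to_f}                # success percentage
    end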
@@ -328,14 +462,15 @@ module Geoptima
  g.write("#{$export_dir}/Chart_#{stats_manager.name}_#{header}_#{chart_type}_distribution.png")
  end
  def to_s
- "#{chart_type.upcase}-#{header}"
+ "#{chart_type.to_s.upcase}-#{header}"
  end
  end
  class StatsSpecs
- attr_reader :chart_specs, :stat_specs
+ attr_reader :chart_specs, :stat_specs, :kpi_specs
  def initialize(specfile)
  @chart_specs = []
  @stat_specs = []
+ @kpi_specs = []
  instance_eval(File.open(specfile).read)
  end
  def category_chart(header,options={})
@@ -353,10 +488,16 @@ module Geoptima
  def stats(header,*fields,&block)
  @stat_specs << StatSpec.new(header,*fields,&block)
  end
+ def kpi(header,*fields,&block)
+ @kpi_specs << KPISpec.new(header,*fields,&block)
+ end
  def add_stats(stats_manager,headers)
  stat_specs.each do |stat_spec|
  stat_spec.prepare_indices(stats_manager,headers)
  end
+ kpi_specs.each do |kpi_spec|
+ kpi_spec.prepare_indices(stats_manager,headers)
+ end
  end
  def add_fields(stats_manager,fields)
  puts "Adding fields to #{stat_specs.length} StatSpec's" if($debug)
@@ -364,9 +505,14 @@ module Geoptima
  puts "Adding fields to StatSpec: #{stat_spec}" if($debug)
  stat_spec.add(stats_manager,fields)
  end
+ puts "Adding fields to #{kpi_specs.length} KPISpec's" if($debug)
+ kpi_specs.each do |kpi_spec|
+ puts "Adding fields to KPISpec: #{kpi_spec}" if($debug)
+ kpi_spec.add(stats_manager,fields)
+ end
  end
  def to_s
- "Stats[#{@stat_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
+ "Stats[#{@stat_specs.join(', ')}] AND KPIs[#{@kpi_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
  end
  end
  end
@@ -468,7 +614,11 @@ end
  $stats_managers.each do |name,stats_manager|
  if $specs
  $specs.chart_specs.each do |chart_spec|
- chart_spec.process(stats_manager)
+ begin
+ chart_spec.process(stats_manager)
+ rescue NoMethodError
+ puts "Failed to process chart '#{chart_spec}': #{$!}"
+ end
  end
  end
  if $create_all
data/bin/csv_merge CHANGED
@@ -9,20 +9,24 @@ require 'geoptima/options'
  require 'fileutils'
  require 'geoptima/daterange'
 
- Geoptima::assert_version("0.1.3")
+ Geoptima::assert_version("0.1.4")
 
  $export_dir = '.'
  $export_name = 'merged.csv'
+ $split_by = :days
 
  $files = Geoptima::Options.process_args do |option|
  option.t {$time_split = true}
+ option.m {$low_memory = true}
  option.D {$export_dir = ARGV.shift}
  option.N {$export_name = ARGV.shift}
- option.T do
- $time_range = Geoptima::DateRange.new(*(ARGV.shift.split(/[\,]+/).map do |t|
- DateTime.parse t
- end))
+ option.S do
+ $split_by = case ARGV.shift.downcase.intern
+ when :days ; :days
+ else :days
+ end
  end
+ option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
  end
 
  FileUtils.mkdir_p $export_dir
@@ -30,46 +34,216 @@ FileUtils.mkdir_p $export_dir
  $help = true unless($files.length>0)
  if $help
  puts <<EOHELP
- Usage: csv_chart <-dht> <-N name> <-D dir> <-T range> files...
+ Usage: csv_merge <-dhtm> <-N name> <-D dir> <-T range> <-S split_by> files...
  -d debug mode #{cw $debug}
  -h print this help #{cw $help}
- -t merge and split by time (days) #{cw $time_split}
- -N use specified name for merged dataset: #{$merged_name}
+ -t merge and split by time (#{$split_by}) #{cw $time_split}
+ -m use low memory, temporarily storing to intermediate files #{cw $low_memory}
+ -N use specified name for merged dataset: #{$export_name}
  -D export to specified directory: #{$export_dir}
+ -S time units to split exports by: #{$split_by}
  -T set time-range filter: #{$time_range}
  Files to import: #{$files.join(', ')}
  EOHELP
  exit
  end
 
- class CSVData
- attr_reader :headers
- def initialize(headers)
+ class CSVRecord
+ attr_reader :time, :fields, :day
+ def initialize(fields,time_index=0)
+ @fields = fields
+ @time = DateTime.parse(fields[time_index])
+ @day = @time.strftime("%Y-%m-%d")
+ end
+ def [](index)
+ fields[index]
+ end
+ def <=>(other)
+ time <=> other.time
+ end
+ def within(time_range)
+ time_range.nil? || time_range.include?(time)
  end
  end
 
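CSVRecord wraps one tab-separated row, parsing the timestamp column once so that day-bucketing, sorting and range-filtering stay cheap. A small sketch of its contract; the row values are invented:

    # Column 0 holds the timestamp, matching the default time_index=0.
    record = CSVRecord.new(["2012-01-15T09:30:00", "call", "drop"])
    record.day           # => "2012-01-15"
    record.within(nil)   # => true; a nil range means no filtering
    record[2]            # => "drop"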
- $files.each do |file|
- lines = 0
- headers = nil
- filename = File.basename(file)
- (names = filename.split(/[_\.]/)).pop
- name = $merge_all ? ($merged_name || 'All') : names.join('_')
- $stats_managers[name] ||= StatsManager.new(name)
- puts "About to read file #{file}"
- File.open(file).each do |line|
- lines += 1
- fields=line.chomp.split(/\t/)
- if headers
- puts "Processing line: #{line}" if($debug)
- $stats_managers[name].add_all(fields,headers)
+ class CSVDataset
+ attr_reader :filename, :headers, :day_map, :lines, :count, :record_creation_duration
+ def initialize(filename)
+ @filename = filename
+ @lines = []
+ @day_map = {}
+ @record_creation_duration = 0
+ @count = 0
+ @headers = nil
+ read_file do |fields|
+ add fields
+ end
+ end
+ def read_file
+ lines = 0
+ File.open(filename).each do |line|
+ fields=line.chomp.split(/\t/)
+ if lines > 0
+ puts "Processing line: #{line}" if($debug)
+ yield fields
+ else
+ if fields.length<2
+ puts "Too few headers, rejecting #{filename}"
+ break
+ end
+ @headers ||= fields
+ end
+ lines += 1
+ end
+ @export_headers ||= @headers
+ end
+ def add(fields)
+ start_time = Time.new
+ line = create_line(fields)
+ if line
+ @day_map[line.day] ||= 0
+ @day_map[line.day] += 1
+ @lines << line unless($low_memory)
+ @count += 1
+ @record_creation_duration += Time.new - start_time
+ end
+ line
+ end
+ def create_line(fields)
+ begin
+ line = CSVRecord.new(fields,0)
+ if(line.within($time_range))
+ line
+ else
+ nil
+ end
+ rescue ArgumentError
+ puts "Failed to parse line with timestamp='#{fields[0]}': #{$!}"
+ end
+ end
+ def header_map(eh=nil)
+ if eh
+ @export_headers = eh
+ @header_map = nil
+ end
+ unless @header_map
+ @header_map = []
+ (@export_headers || @headers).each do |head|
+ @header_map << @headers.index(head)
+ end
+ end
+ @header_map
+ end
+ def map_line(line)
+ @header_map.map do |index|
+ index && line[index]
+ end
+ end
+ def days
+ @day_map.keys.sort
+ end
+ def each(eh=nil)
+ header_map(eh)
+ if $low_memory
+ read_file do |fields|
+ line = create_line fields
+ yield line.day,map_line(line) if line
+ end
  else
- headers = fields
- if headers.length<2
- puts "Too few headers, rejecting #{file}"
- break
+ (@lines || []).each do |line|
+ yield line.day,map_line(line)
  end
- $stats_managers[name].set_headers(headers)
  end
  end
+ def <=>(other)
+ self.filename <=> other.filename
+ end
+ def length
+ count
+ end
  end
 
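CSVDataset#each is where the -m flag pays off: with $low_memory set it re-reads and re-parses the file on every pass instead of holding CSVRecord objects, trading CPU for memory. The header_map indirection is what lets files with different columns merge, since a nil index yields a nil cell for any column this file lacks. A minimal sketch, with hypothetical filename and headers:

    ds = CSVDataset.new("geoptima_device1.csv")   # parses rows, builds day_map
    ds.days                                       # => e.g. ["2012-01-15", "2012-01-16"]
    ds.each(%w[timestamp event]) do |day, row|    # remap rows onto a merged header set
      puts "#{day}: #{row.join(',')}"
    end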
+ class CSVDatasets
+ attr_reader :datasets
+ def initialize
+ @datasets = []
+ end
+ def add_file(file)
+ lines = 0
+ dataset = nil
+ filename = File.basename(file)
+ (names = filename.split(/[_\.]/)).pop
+ name = names.join('_')
+ puts "About to read file #{file}"
+ dataset = CSVDataset.new(file)
+ @datasets << dataset if(dataset && dataset.length>0)
+ dataset
+ end
+ def export_days
+ headers = @datasets.map{|d| d.headers}.flatten.uniq
+ days = @datasets.map{|d| d.days}.flatten.sort.uniq
+ day_files = {}
+ day_names = {}
+ count = {}
+ duration = {}
+ days.each do |day|
+ filename = "#{$export_dir}/#{$export_name.gsub(/\.csv$/,'')}_#{day}.csv"
+ puts "Exporting #{filename} for #{day}"
+ day_names[day] = filename
+ day_files[day] = File.open(filename,'w')
+ day_files[day].puts headers.join("\t")
+ count[day] = 0
+ duration[day] = 0
+ end
+ @datasets.sort.each do |dataset|
+ dataset.each(headers) do |day,line|
+ start_time = Time.new
+ day_files[day].puts line.join("\t")
+ duration[day] += Time.new - start_time
+ count[day] += 1
+ end
+ end
+ day_files.each do |day,out|
+ out.close
+ puts "\tExported #{count[day]} records to #{day_names[day]} in #{duration[day]} seconds"
+ end
+ end
+ def export_merged
+ headers = @datasets.map{|d| d.headers}.flatten.sort.uniq
+ filename = "#{$export_dir}/#{$export_name}"
+ File.open(filename,'w') do |out|
+ out.puts headers.join("\t")
+ @datasets.sort.each do |dataset|
+ dataset.each(headers) do |day,line|
+ out.puts line.join("\t")
+ end
+ end
+ end
+ end
+ end
+
+ $datasets = CSVDatasets.new
+
+ $files.each do |file|
+ start_time = Time.new
+ ds = $datasets.add_file(file)
+ duration = Time.new - start_time
+ puts "\tLoaded #{file} in #{duration} seconds"
+ puts "\t#{(100.0 * ds.record_creation_duration.to_f/duration.to_f).to_i}% = #{ds.record_creation_duration}/#{duration} was spent creating records"
+ puts "\tFile contained #{ds.length} events for #{ds.days.length} days:"
+ ds.days.each do |day|
+ puts "\t\t#{day}: \t#{(100.0 * ds.day_map[day].to_f/ds.length.to_f).to_i}%\t#{ds.day_map[day]} records"
+ end
+ end
+
+ start_time = Time.new
+
+ if $time_split
+ $datasets.export_days
+ else
+ $datasets.export_merged
+ end
+
+ duration = Time.new - start_time
+ puts "Exported in #{duration} seconds"
+
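For reference, the -t path writes one file per day using the naming scheme in export_days. A hypothetical run over two days of data:

    #   csv_merge -t -m -S days -D exports -N merged.csv device1.csv device2.csv
    # produces, per the gsub in export_days:
    #   exports/merged_2012-01-15.csv
    #   exports/merged_2012-01-16.csv
    # each beginning with the union of all input headers joined by tabs.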