geoptima 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/csv_chart +167 -17
- data/bin/csv_merge +204 -30
- data/bin/show_geoptima +128 -35
- data/examples/csv_chart.rb +167 -17
- data/examples/csv_merge.rb +204 -30
- data/examples/show_geoptima.rb +128 -35
- data/geoptima.gemspec +1 -1
- data/lib/geoptima/data.rb +40 -37
- data/lib/geoptima/version.rb +1 -1
- metadata +5 -4
data/bin/csv_chart
CHANGED
@@ -10,7 +10,7 @@ require 'geoptima/options'
 require 'fileutils'
 require 'geoptima/daterange'
 
-Geoptima::assert_version("0.1.3")
+Geoptima::assert_version("0.1.4")
 Geoptima::Chart.available? || puts("No charting libraries available") || exit(-1)
 
 $export_dir = '.'
@@ -26,9 +26,7 @@ $files = Geoptima::Options.process_args do |option|
   option.S {$specfile = ARGV.shift}
   option.P {$diversity = ARGV.shift.to_f}
   option.W {$chart_width = ARGV.shift.to_i}
-  option.T do
-    $time_range = Geoptima::DateRange.from ARGV.shift
-  end
+  option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
 end
 
 FileUtils.mkdir_p $export_dir
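
Note: the `option.T` handler is now a one-liner matching the other options; behaviour is unchanged. The range-string format accepted by `Geoptima::DateRange.from` is defined in geoptima/daterange and not shown in this diff, so the literal in this sketch is a hypothetical placeholder; the `include?` test is the same pattern used by `KPISpec#add` and `CSVRecord#within` elsewhere in this release:

    require 'date'
    require 'geoptima/daterange'

    # Hypothetical range string; see geoptima/daterange for the real format.
    range = Geoptima::DateRange.from "2012-01-01..2012-02-01"
    time  = DateTime.parse("2012-01-26T12:00:00")
    keep  = range.nil? || range.include?(time)   # the -T filtering pattern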
@@ -148,8 +146,10 @@ class StatsManager
 end
 
 module Geoptima
+
+  # Class for original stats approach of creating a new 'column' from simple combinations of other columns
   class StatSpec
-    attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups
+    attr_reader :header, :event, :index, :indices, :fields, :options, :proc, :groups, :values
     def initialize(header,*fields,&block)
       @header = header
       @fields = fields
@@ -184,13 +184,13 @@ module Geoptima
            key = @group.call(time)
            ghead = "#{header} #{key}"
            @groups[key] = ghead
-           stats_manager.add(
+           stats_manager.add(map_fields(fields),ghead,nil)
          end
        rescue ArgumentError
          puts "Error: Unable to process time field[#{time}]: #{$!}"
        end
      end
-     stats_manager.add(
+     stats_manager.add(map_fields(fields),header,index)
    end
    def div
      unless @div
@@ -224,18 +224,30 @@ module Geoptima
         val
       end
     end
-    def
+    def prepare_values(values)
+      @values = []
       if @indices
         puts "StatSpec[#{self}]: #{options.inspect}" if($debug)
-
-        puts "\tVALUES: #{
-
+        @values = @indices.map{|i| values[i]}
+        puts "\tVALUES: #{values.inspect}" if($debug)
+      end
+      @values
+    end
+    def vals_for(values,filter={})
+      if @indices
+        prepare_values(values)
+        (options[:filter] || filter).each do |field,expected|
          puts "\t\tChecking if field #{field} is #{expected}" if($debug)
          puts "\t\tLooking for #{field} or #{event}.#{field} in #{@fields.inspect}" if($debug)
          hi = @fields.index(field.to_s) || @fields.index("#{event}.#{field}")
-         puts "\t\t#{field} -> #{hi} -> #{hi &&
-         return nil unless(hi &&
+          puts "\t\t#{field} -> #{hi} -> #{hi && values[hi]}" if($debug)
+          return nil unless(hi && values[hi] && (expected === values[hi].downcase || values[hi].downcase === expected.to_s.downcase))
        end
+        values
+      end
+    end
+    def map_fields(values,filter={})
+      if vals = vals_for(values,filter)
        val = proc.nil? ? vals[0] : proc.call(*vals)
        puts "\tBLOCK MAP: #{vals.inspect} --> #{val.inspect}" if($debug)
        if options[:div]
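
Note: this refactor splits the old mapping method into `prepare_values`, `vals_for` and `map_fields`, and `vals_for` now honours an `options[:filter]` hash that rejects a row unless the named field matches the expected value (compared with `===`, so Regexps work, falling back to a case-insensitive string match). A hypothetical spec-file entry using the new option, with field names borrowed from the KPI example later in this diff:

    # Hypothetical: count DNS lookups, keeping only rows whose
    # dnsLookup.error field is empty; the :error key resolves through
    # the "#{event}.#{field}" lookup shown above.
    stats 'DNS Lookups', 'dnsLookup.address', 'dnsLookup.error',
          :filter => {:error => /^$/}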
@@ -266,6 +278,128 @@ module Geoptima
       "#{header}[#{index}]<-#{fields.inspect}(#{indices && indices.join(',')})"
     end
   end
+
+  class Group
+    attr_reader :name, :options, :proc, :is_time, :index
+    def initialize(name,options={},&block)
+      @name = name
+      @options = options
+      @proc = block
+      @is_time = options[:is_time]
+    end
+    def index= (ind)
+      puts "Set group header index=#{ind} for group '#{name}'"
+      @index = ind
+    end
+    def call(time,values)
+      is_time && @proc.call(time) || @proc.call(values[index])
+    end
+  end
+
+  # The KPI class allows for complex statistics called 'Key Performance Indicators'.
+  # These are specified using four functions:
+  #   filter: how to choose rows to include in the statistics (default is '!map.nil?')
+  #   map: how to convert a row into the internal stats (default is input columns)
+  #   aggregate: how to aggregate internal stats to higher levels (eg. daily, default is count)
+  #   reduce: how to extract presentable stats from internal stats (eg. avg=total/count, default is internal stats)
+  #
+  # The KPI is defined with a name and set of columns to use, followed by the block
+  # defining the four functions above. For example:
+  #
+  #   kpi 'DNS Success', 'dnsLookup.address', 'dnsLookup.error', 'dnsLookup.interface' do |f|
+  #     f.filter {|addr,err,int| addr =~/\w/}
+  #     f.map {|addr,err,int| err.length==0 ? [1,1] : [1,0]}
+  #     f.aggregate {|a,v| a[0]+=v[0];a[1]+=v[1];a}
+  #     f.reduce {|a| 100.0*a[1].to_f/a[0].to_f}
+  #   end
+  #
+  # Currently this class extends StatSpec for access to the prepare_indices method.
+  # We should consider moving that to a mixin, or depreciating the StatSpec class
+  # entirely since KPISpec should provide a superset of features.
+  class KPISpec < StatSpec
+    def initialize(header,*fields,&block)
+      @header = header
+      @fields = fields
+      @event = @fields[0].split(/\./)[0]
+      block.call self unless(block.nil?)
+      if @fields[-1].is_a?(Hash)
+        @options = @fields.pop
+      else
+        @options = {}
+      end
+      @group_procs = []
+      @groups = {}
+      if @options[:group]
+        [@options[:group]].flatten.compact.sort.uniq.each do |group_name|
+          gname = group_name.to_s.intern
+          case gname
+          when :months
+            group_by(gname,true) {|t| t.strftime("%Y-%m")}
+          when :weeks
+            group_by(gname,true) {|t| t.strftime("%Y w%W")}
+          when :days
+            group_by(gname,true) {|t| t.strftime("%Y-%m-%d")}
+          when :hours
+            group_by(gname,true) {|t| t.strftime("%Y-%m-%d %H")}
+          else
+            group_by(gname) {|f| f}
+          end
+        end
+      end
+      puts "Created StatSpec: #{self}"
+    end
+    def group_by(field,is_time=false,&block)
+      @group_procs = Group.new(field,:is_time => is_time,&block)
+    end
+    def filter(&block)
+      @filter_proc = block
+    end
+    def map(&block)
+      @map_proc = block
+    end
+    def aggregate(&block)
+      @aggregate_proc = block
+    end
+    def reduce(&block)
+      @reduce_proc = block
+    end
+    def add(stats_manager,values)
+      prepare_values(values)
+      if @group_procs.length > 0
+        begin
+          time = DateTime.parse(values[stats_manager.time_index])
+          if $time_range.nil? || $time_range.include?(time)
+            key = @group_procs.inject(header) do |ghead,group|
+              key = @group.call(time,values)
+              ghead += " #{key}"
+            end
+            @groups[key] = ghead
+            stats_manager.add(map_fields(fields),ghead,nil)
+          end
+        rescue ArgumentError
+          puts "Error: Unable to process time field[#{time}]: #{$!}"
+        end
+      end
+      stats_manager.add(map_fields(fields),header,index)
+    end
+    def map_fields(values,filter=nil)
+      if values
+        if @filter_proc.nil? || @filter_proc.call(*values)
+          val = @map_proc && @map_proc.call(*values) || values[0]
+          puts "\tBLOCK MAP: #{values.inspect} --> #{values.inspect}" if($debug)
+        end
+        val
+      end
+    end
+    def prepare_indices(stats_manager,headers)
+      super(stats_manager,headers)
+      @group_procs.each do |g|
+        g.index = fields.index(g.name)
+      end
+    end
+  end
+
+  # Class for specifications of individual charts
   class ChartSpec
     attr_reader :chart_type, :header, :options
     def initialize(header,options={})
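
Note: spelled out as a spec file, the four-function model reads as below. The entry is lifted from the comment block in this hunk; it is wired up by the `kpi` method added to `StatsSpecs` two hunks further on:

    kpi 'DNS Success', 'dnsLookup.address', 'dnsLookup.error', 'dnsLookup.interface' do |f|
      f.filter    {|addr,err,int| addr =~ /\w/}                  # keep rows that have an address
      f.map       {|addr,err,int| err.length==0 ? [1,1] : [1,0]} # [attempts, successes]
      f.aggregate {|a,v| a[0]+=v[0]; a[1]+=v[1]; a}              # sum the pairs per group
      f.reduce    {|a| 100.0*a[1].to_f/a[0].to_f}                # success percentage
    end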
@@ -328,14 +462,15 @@ module Geoptima
       g.write("#{$export_dir}/Chart_#{stats_manager.name}_#{header}_#{chart_type}_distribution.png")
     end
     def to_s
-      "#{chart_type.upcase}-#{header}"
+      "#{chart_type.to_s.upcase}-#{header}"
     end
   end
   class StatsSpecs
-    attr_reader :chart_specs, :stat_specs
+    attr_reader :chart_specs, :stat_specs, :kpi_specs
     def initialize(specfile)
       @chart_specs = []
       @stat_specs = []
+      @kpi_specs = []
       instance_eval(File.open(specfile).read)
     end
     def category_chart(header,options={})
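
Note: the added `to_s` presumably guards against Symbol chart types: `:line.to_s.upcase` works on any Ruby, while `Symbol#upcase` only arrived in Ruby 1.9, so `:line.upcase` raises NoMethodError on 1.8 (the same exception the chart loop at the end of this file now rescues). A minimal illustration with a hypothetical chart type and header:

    chart_type = :line
    puts "#{chart_type.to_s.upcase}-rssi"   # => LINE-rssi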
@@ -353,10 +488,16 @@ module Geoptima
     def stats(header,*fields,&block)
       @stat_specs << StatSpec.new(header,*fields,&block)
     end
+    def kpi(header,*fields,&block)
+      @kpi_specs << KPISpec.new(header,*fields,&block)
+    end
     def add_stats(stats_manager,headers)
       stat_specs.each do |stat_spec|
         stat_spec.prepare_indices(stats_manager,headers)
       end
+      kpi_specs.each do |kpi_spec|
+        kpi_spec.prepare_indices(stats_manager,headers)
+      end
     end
     def add_fields(stats_manager,fields)
       puts "Adding fields to #{stat_specs.length} StatSpec's" if($debug)
@@ -364,9 +505,14 @@ module Geoptima
         puts "Adding fields to StatSpec: #{stat_spec}" if($debug)
         stat_spec.add(stats_manager,fields)
       end
+      puts "Adding fields to #{kpi_specs.length} KPISpec's" if($debug)
+      kpi_specs.each do |kpi_spec|
+        puts "Adding fields to KPISpec: #{kpi_spec}" if($debug)
+        kpi_spec.add(stats_manager,fields)
+      end
     end
     def to_s
-      "Stats[#{@stat_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
+      "Stats[#{@stat_specs.join(', ')}] AND KPIs[#{@kpi_specs.join(', ')}] AND Charts[#{@chart_specs.join(', ')}]"
     end
   end
 end
@@ -468,7 +614,11 @@ end
 $stats_managers.each do |name,stats_manager|
   if $specs
     $specs.chart_specs.each do |chart_spec|
-      chart_spec.process(stats_manager)
+      begin
+        chart_spec.process(stats_manager)
+      rescue NoMethodError
+        puts "Failed to process chart '#{chart_spec}': #{$!}"
+      end
     end
   end
   if $create_all
data/bin/csv_merge
CHANGED
@@ -9,20 +9,24 @@ require 'geoptima/options'
 require 'fileutils'
 require 'geoptima/daterange'
 
-Geoptima::assert_version("0.1.3")
+Geoptima::assert_version("0.1.4")
 
 $export_dir = '.'
 $export_name = 'merged.csv'
+$split_by = :days
 
 $files = Geoptima::Options.process_args do |option|
   option.t {$time_split = true}
+  option.m {$low_memory = true}
   option.D {$export_dir = ARGV.shift}
   option.N {$export_name = ARGV.shift}
-  option.
-    $
-
-
+  option.S do
+    $split_by = case ARGV.shift.downcase.intern
+                when :days ; :days
+                else :days
+                end
   end
+  option.T {$time_range = Geoptima::DateRange.from ARGV.shift}
 end
 
 FileUtils.mkdir_p $export_dir
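
Note: `-S` currently accepts only `days`; any other value falls through the `case` to the `:days` default, so the flag is effectively a placeholder for future split units. `-m` trades speed for memory: with `$low_memory` set, `CSVDataset` (added in the next hunk) does not cache parsed records, and its `each` re-reads the source file on export. Condensed from `CSVDataset#each` in the next hunk (not new API):

    if $low_memory
      read_file do |fields|                 # second pass over the file
        line = create_line fields
        yield line.day, map_line(line)
      end
    else
      (@lines || []).each do |line|         # records cached on first pass
        yield line.day, map_line(line)
      end
    end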
@@ -30,46 +34,216 @@ FileUtils.mkdir_p $export_dir
 $help = true unless($files.length>0)
 if $help
   puts <<EOHELP
-Usage:
+Usage: csv_merge <-dhtm> <-N name> <-D dir> <-T range> <-S split_by> files...
 -d debug mode #{cw $debug}
 -h print this help #{cw $help}
--t merge and split by time (
--
+-t merge and split by time (#{$split_by}) #{cw $time_split}
+-m use low memory, temporarily storing to intermediate files #{cw $low_memory}
+-N use specified name for merged dataset: #{$export_name}
 -D export to specified directory: #{$export_dir}
+-S time units to split exports by: #{$split_by}
 -T set time-range filter: #{$time_range}
 Files to import: #{$files.join(', ')}
 EOHELP
   exit
 end
 
-class
-  attr_reader :
-  def initialize(
+class CSVRecord
+  attr_reader :time, :fields, :day
+  def initialize(fields,time_index=0)
+    @fields = fields
+    @time = DateTime.parse(fields[time_index])
+    @day = @time.strftime("%Y-%m-%d")
+  end
+  def [](index)
+    fields[index]
+  end
+  def <=>(other)
+    time <=> other
+  end
+  def within(time_range)
+    time_range.nil? || time_range.include?(time)
   end
 end
 
-
-  lines
-
-
-
-
-
-
-
-
-
-
-
-
+class CSVDataset
+  attr_reader :filename, :headers, :day_map, :lines, :count, :record_creation_duration
+  def initialize(filename)
+    @filename = filename
+    @lines = []
+    @day_map = {}
+    @record_creation_duration = 0
+    @count = 0
+    @headers = nil
+    read_file do |fields|
+      add fields
+    end
+  end
+  def read_file
+    lines = 0
+    File.open(filename).each do |line|
+      fields=line.chomp.split(/\t/)
+      if lines > 0
+        puts "Processing line: #{line}" if($debug)
+        yield fields
+      else
+        if fields.length<2
+          puts "Too few headers, rejecting #{file}"
+          break
+        end
+        @headers ||= fields
+      end
+      lines += 1
+    end
+    @export_headers ||= @headers
+  end
+  def add(fields)
+    start_time = Time.new
+    line = create_line(fields)
+    if line
+      @day_map[line.day] ||= 0
+      @day_map[line.day] += 1
+      @lines << line unless($low_memory)
+      @count += 1
+      @record_creation_duration += Time.new - start_time
+    end
+    line
+  end
+  def create_line(fields)
+    begin
+      line = CSVRecord.new(fields,0)
+      if(line.within($time_range))
+        line
+      else
+        nil
+      end
+    rescue ArgumentError
+      puts "Failed to parse line with timestamp='#{fields[0]}': #{$!}"
+    end
+  end
+  def header_map(eh=nil)
+    if eh
+      @export_headers = eh
+      @header_map = nil
+    end
+    unless @header_map
+      @header_map = []
+      (@export_headers || @headers).each do |head|
+        @header_map << @headers.index(head)
+      end
+    end
+    @header_map
+  end
+  def map_line(line)
+    @header_map.map do |index|
+      index && line[index]
+    end
+  end
+  def days
+    @day_map.keys.sort
+  end
+  def each(eh=nil)
+    header_map(eh)
+    if $low_memory
+      read_file do |fields|
+        line = create_line fields
+        yield line.day,map_line(line)
+      end
     else
-
-
-      puts "Too few headers, rejecting #{file}"
-      break
+      (@lines || []).each do |line|
+        yield line.day,map_line(line)
       end
-  $stats_managers[name].set_headers(headers)
     end
   end
+  def <=>(other)
+    self.filename <=> other.filename
+  end
+  def length
+    count
+  end
 end
 
+class CSVDatasets
+  attr_reader :datasets
+  def initialize
+    @datasets = []
+  end
+  def add_file(file)
+    lines = 0
+    dataset = nil
+    filename = File.basename(file)
+    (names = filename.split(/[_\.]/)).pop
+    name = names.join('_')
+    puts "About to read file #{file}"
+    dataset = CSVDataset.new(file)
+    @datasets << dataset if(dataset && dataset.length>0)
+    dataset
+  end
+  def export_days
+    headers = @datasets.map{|d| d.headers}.flatten.uniq
+    days = @datasets.map{|d| d.days}.flatten.sort.uniq
+    day_files = {}
+    day_names = {}
+    count = {}
+    duration = {}
+    days.each do |day|
+      filename = "#{$export_dir}/#{$export_name.gsub(/\.csv$/,'')}_#{day}.csv"
+      puts "Exporting #{filename} for #{day}"
+      day_names[day] = filename
+      day_files[day] = File.open(filename,'w')
+      day_files[day].puts headers.join("\t")
+      count[day] = 0
+      duration[day] = 0
+    end
+    @datasets.sort.each do |dataset|
+      dataset.each(headers) do |day,line|
+        start_time = Time.new
+        day_files[day].puts line.join("\t")
+        duration[day] += Time.new - start_time
+        count[day] += 1
+      end
+    end
+    day_files.each do |day,out|
+      out.close
+      puts "\tExported #{count[day]} records to #{day_names[day]} in #{duration[day]} seconds"
+    end
+  end
+  def export_merged
+    headers = @datasets.map{|d| d.headers}.flatten.sort.uniq
+    filename = "#{$export_dir}/#{$export_name}"
+    File.open(filename,'w') do |out|
+      out.puts headers.join("\t")
+      @datasets.sort.each(headers) do |dataset|
+        dataset.each do |day,line|
+          out.puts line.join("\t")
+        end
+      end
+    end
+  end
+end
+
+$datasets = CSVDatasets.new
+
+$files.each do |file|
+  start_time = Time.new
+  ds = $datasets.add_file(file)
+  duration = Time.new - start_time
+  puts "\tLoaded #{file} in #{duration} seconds"
+  puts "\t#{(100.0 * ds.record_creation_duration.to_f/duration.to_f).to_i}% = #{ds.record_creation_duration}/#{duration} was spent creating records"
+  puts "\tFile contained #{ds.length} events for #{ds.days.length} days:"
+  ds.days.each do |day|
+    puts "\t\t#{day}: \t#{(100.0 * ds.day_map[day].to_f/ds.length.to_f).to_i}%\t#{ds.day_map[day]} records"
+  end
+end
+
+start_time = Time.new
+
+if $time_split
+  $datasets.export_days
+else
+  $datasets.export_merged
+end
+
+duration = Time.new - start_time
+puts "Exported in #{duration} seconds"
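
Note: with `-t`, `export_days` strips a trailing `.csv` from the merged name and appends each day key, so the defaults yield one file per calendar day. A minimal sketch using the same expression as above (the day value is a hypothetical example):

    export_dir  = '.'
    export_name = 'merged.csv'
    day         = '2012-01-26'   # CSVRecord#day format: %Y-%m-%d
    puts "#{export_dir}/#{export_name.gsub(/\.csv$/,'')}_#{day}.csv"
    # => ./merged_2012-01-26.csv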