data_tools 0.6.0

data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ .irb_history
+ .bundle
+ pkg
+ todo.txt
+ .DS_Store
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source :rubygems
+
+ # Specify your gem's dependencies in data_tools.gemspec
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,28 @@
+ PATH
+   remote: .
+   specs:
+     data_tools (0.6.0)
+       awesome_print
+       facets
+
+ GEM
+   remote: http://rubygems.org/
+   specs:
+     awesome_print (1.1.0)
+     diff-lcs (1.1.3)
+     facets (2.9.3)
+     rspec (2.12.0)
+       rspec-core (~> 2.12.0)
+       rspec-expectations (~> 2.12.0)
+       rspec-mocks (~> 2.12.0)
+     rspec-core (2.12.2)
+     rspec-expectations (2.12.1)
+       diff-lcs (~> 1.1.3)
+     rspec-mocks (2.12.1)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   data_tools!
+   rspec
data/README.md ADDED
@@ -0,0 +1,7 @@
+ # README for data_tools
+
+ Miscellaneous data-munging utility functions.
+
+ ## Array
+
+ This is really an Array-of-Hashes.
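A minimal usage sketch of the Array-of-Hashes helpers (the `people` records here are invented for illustration; the specs further down show more):

    require 'data_tools'

    people = [
      {:name => "bob",  :city => "sunnyvale"},
      {:name => "phil", :city => "mountain view"}
    ]
    people.extend DataTools::ArrayOfHashes

    people.key_on(:name)           # => {"bob" => {...}, "phil" => {...}}
    people.where(:city => /view/)  # => [{:name => "phil", :city => "mountain view"}]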
data/Rakefile ADDED
@@ -0,0 +1,8 @@
+ require 'bundler'
+ Bundler::GemHelper.install_tasks
+
+ require "rake"
+
+ require "rspec/core/rake_task"
+ desc "Run all RSpec tests"
+ RSpec::Core::RakeTask.new(:spec)
data/data_tools.gemspec ADDED
@@ -0,0 +1,26 @@
+ # -*- encoding: utf-8 -*-
+ $LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
+ require 'data_tools/version'
+
+ Gem::Specification.new do |s|
+   s.name = "data_tools"
+   s.version = DataTools::VERSION
+   s.platform = Gem::Platform::RUBY
+   s.authors = ["Jason May"]
+   s.email = %q{jmay@pobox.com}
+   s.homepage = "http://github.com/jmay/data_tools"
+   s.summary = %q{Miscellaneous data-munging utilities.}
+   s.description = %q{Data-munging utilities, including extensions to Array, Hash, String, Symbol plus data conversions and transformations.}
+
+   # s.required_rubygems_version = ">= 1.3.6"
+   s.rubyforge_project = "data_tools"
+
+   s.add_dependency 'awesome_print'
+   s.add_dependency 'facets'
+
+   s.add_development_dependency "rspec"
+
+   s.files = `git ls-files`.split("\n")
+   s.executables = `git ls-files`.split("\n").select{|f| f =~ /^bin/}
+   s.require_path = 'lib'
+ end
data/lib/data_tools.rb ADDED
@@ -0,0 +1,42 @@
+ require "ap"
+ require "set"
+ require "csv"
+ require "facets" # for Hash#delete_values
+
+ module DataTools
+   def self.reload!
+     $".grep(/data_tools/).each {|f| load(f)}
+   end
+
+   def DataTools.scour(s, opts)
+     case s
+     when nil
+       nil
+     when String
+       s2 = s.strip.gsub(/\s+/, ' ').gsub(/^"/, '').gsub(/"$/, '')
+       if s2 =~ /^[\d]+(\.[\d]+){0,1}$/
+         # looks numeric
+         s2 = s2.to_i.to_s
+       end
+       (s2.empty? || opts[:junkwords].include?(s2)) ? nil : s2
+     when Numeric
+       s.to_s
+     else
+       s.to_s
+     end
+   end
+ end
+
+ [
+   "version",
+   "array", "hash",
+   "array_of_hashes", "hash_of_arrays",
+   "enumerator",
+   "comparator",
+   "object", "string", "symbol",
+   "file", "io",
+   "rules",
+   "conversions", "transformations"
+ ].each do |file|
+   require File.dirname(__FILE__) + "/data_tools/#{file}"
+ end
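A quick sketch of how `DataTools.scour` behaves, per the implementation above (inputs invented):

    DataTools.scour('  "3.14"  ', :junkwords => [])   # => "3"  (quotes/whitespace stripped; numeric-looking strings collapse to integers)
    DataTools.scour('N/A', :junkwords => ['N/A'])      # => nil  (junk words and empty strings become nil)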
data/lib/data_tools/array.rb ADDED
@@ -0,0 +1,42 @@
+ module DataTools::Array
+   # turns an array-of-arrays into an array-of-hashes
+   # the headers are used as names for the fields
+   # OK for rows to have fewer fields than the header record, but must not be longer
+   def hashify(headers = shift)
+     # ignore leading/trailing whitespace in header labels
+     headers.each {|hdr| hdr.strip! if hdr === String}
+     select {|row| row.any?}.map do |row|
+       raise "Row count mismatch: #{row}" if row.count > headers.count
+       hash = {}
+       row.zip(headers) do |v,k|
+         # ignore any keys with missing values
+         # remove leading/trailing whitespace from values
+         hash[k] = v.strip unless v.blank?
+       end
+       hash
+     end
+   end
+
+   # ARRAY OF SCALARS
+   # apply an operation (block) to every member of the array
+   # return the list of unique results
+   # if there is just one result, convert to a scalar value
+   def resolve(&block)
+     values = map {|v| block.call(v)}.uniq
+     values.count <= 1 ? values.first : values
+   end
+
+   # marshal (ruby-specific binary format) the contents of this structure to a file
+   # fails if file exists
+   def dumpme(filename)
+     raise "#{filename} exists" if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+   # same as #dumpme but overwrites existing file
+   def dumpme!(filename)
+     File.unlink(filename) if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+
+
+ end
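A minimal sketch of `hashify` on an array-of-arrays (data invented; blank values are dropped via the `blank?` extension that comes in through facets):

    rows = [["name", "city"], ["bob", "sunnyvale"], ["phil", ""]]
    rows.extend(DataTools::Array)
    rows.hashify  # => [{"name"=>"bob", "city"=>"sunnyvale"}, {"name"=>"phil"}]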
data/lib/data_tools/array_of_hashes.rb ADDED
@@ -0,0 +1,219 @@
+ module DataTools::ArrayOfHashes
+   # convert an array of hashes to a hash of the same hashes
+   # where the key values are picked from the hashes
+   # the keys can be single fields, or an array, or a list
+   # options:
+   # :multi (boolean, default false): if true, allow multiple values per key; store values as an array for each key
+   # :first (boolean, default false): if true, when finding multiple values per key, store only the first and ignore the rest
+   # :truncate (integer): see `Hash#key_for`
+   #
+   def key_on(*keyarray)
+     raise "Key(s) required" if keyarray.empty?
+     opts = keyarray.last.is_a?(Hash) ? keyarray.pop : {}
+     keyarray = keyarray.flatten
+
+     memo = opts[:multi] ? Hash.new {|h,k| h[k] = []} : Hash.new
+     each do |hash|
+       this_key = hash.key_for(keyarray, opts)
+       raise "Missing value for #{keyarray} in record #{hash}" if this_key.nil?
+       if opts[:multi]
+         memo[this_key] << hash
+       elsif opts[:first]
+         # ignore this value if we already have one for this key
+         if !memo.has_key?(this_key)
+           memo[this_key] = hash
+         end
+       else
+         raise "Found duplicate #{keyarray} in #{memo[this_key]} vs #{hash}" if memo.has_key?(this_key)
+         memo[this_key] = hash
+       end
+       memo
+     end
+     memo.extend DataTools::HashOfArrays
+     memo.default = nil
+     memo
+   end
+
+   # shorthand for `Array#select {|hash| hash[...] && hash[...] ...}`
+   # find all the members of the array where all the specified criteria are true
+   def where(conditions)
+     newhash = case conditions
+     when Hash
+       select do |record|
+         conditions.map do |k,v|
+           case v
+           when Regexp
+             record[k] =~ v
+           when TrueClass
+             !record[k].nil?
+           when FalseClass
+             record[k].nil?
+           else
+             record[k] == v
+           end
+         end.reduce(:&) # all tests must pass
+       end
+     when String,Symbol
+       # just check for presence & non-nil value of specified key
+       select {|record| record[conditions]}
+     end
+     newhash.extend DataTools::ArrayOfHashes
+   end
+
+   # are all the values for `key` defined and unique?
+   def unique?(*keyarray)
+     raise "Key(s) required" if keyarray.empty?
+     keyarray = keyarray.flatten
+     keys = map {|hash| hash.key_for(keyarray)}
+     return false if keys.any?(&:nil?)
+     keys.uniq.count == self.count
+   end
+
+   def unique_values_for(*keyarray)
+     raise "Key(s) required" if keyarray.empty?
+     keyarray = keyarray.flatten
+     map {|hash| hash.key_for(keyarray)}.to_set
+   end
+
+   # assign unique IDs to every hash in the array
+   # argument is the name of the field to use for the generated sequential key
+   def count_off!(key = :key, start = 0)
+     raise "Values exist for [#{key}]" if any?{|h| h[key]}
+     each_with_index do |hash, i|
+       hash[key] = i + start
+     end
+     self
+   end
+
+   def redundant(*keyarray)
+     key_on(keyarray, :multi => true).select {|k,v| v.count > 1}
+   end
+
+   # combine a set of hashes into one
+   # for each key, find all the distinct values from all the hashes
+   # if there's one unique value, store the single value in key of the result
+   # if there are multiple values, store them all as an array
+   def coalesce
+     allkeys = map {|h| h.keys}.flatten.uniq
+     allkeys.reduce({}) do |memo,key|
+       memo[key] = map {|h| h[key]}.compact.uniq
+       memo[key] = memo[key].first if memo[key].count <= 1
+       memo
+     end
+   end
+
+   # apply the same resolution operation to every hash in the list
+   def resolve_all(key, &block)
+     map do |hash|
+       hash = hash.dup
+       hash[key] = hash[key].resolve(&block)
+       hash
+     end
+   end
+
+   # marshal (ruby-specific binary format) the contents of this structure to a file
+   # fails if file exists
+   def dumpme(filename)
+     raise "#{filename} exists" if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+   # same as #dumpme but overwrites existing file
+   def dumpme!(filename)
+     File.unlink(filename) if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+
+   # attempt to dump out contents of this array-of-hashes as CSV to named file
+   # fields is list of attribute names to write out
+   # options headers is public names for the fields
+   def csvme(filename, fields, headers = fields)
+     CSV.open(filename, "wb") do |csv|
+       csv << headers unless headers.nil?
+       pluck(fields).each do |ary|
+         csv << ary
+       end
+     end
+     true
+   end
+
+   def tsvme(filename, fields, headers = fields)
+     File.open(target) do |output|
+       output.puts headers.join("\t")
+       pluck(fields).each do |ary|
+         output.puts ary.join("\t")
+       end
+     end
+     true
+   end
+
+   # What different keys appear in this collection of hashes?
+   def allkeys
+     each_with_object({}) do |h, memo|
+       h.keys.each {|k| memo[k] += 1}
+     end.keys
+   end
+
+   def metrics
+     allkeys.reduce({}) do |m,k|
+       values = self.map {|h| h[k]}
+       m[k] = {
+         :non_nil => values.compact.count,
+         :nil => values.count - values.compact.count,
+         :unique => values.uniq.count
+       }
+       if m[k][:unique] <= 10
+         m[k][:values] = histogram(k)
+       end
+       m
+     end
+   end
+
+   def numify!(*keyarray)
+     each {|h| h.numify!(*keyarray)}
+   end
+
+   def nilify!(keyvalue)
+     each {|h| h.nilify!(keyvalue)}
+   end
+
+   # return histogram of value distribution for the specified key: hash of value/count pairs
+   def histogram(*args, &block)
+     reduce(Hash.new(0)) do |hist, h|
+       if block_given?
+         v = yield(h)
+       else
+         v = h[args.first]
+       end
+       hist[v] += 1
+       hist
+     end
+   end
+
+   # hash slice for all the named attributes from each hashes in the array
+   def subset(*keys)
+     keys = keys.flatten
+     map {|h| h.subset(keys) }
+   end
+
+   # pull out all the named attributes from the hashes in the array (into array-of-arrays)
+   def pluck(*keys)
+     keys = keys.flatten
+     map {|h| h.pluck(keys)}
+     # if keys.count > 1
+     # map {|h| keys.map {|k| h[k]}}
+     # else
+     # map {|h| h[keys.first]}
+     # end
+   end
+
+   # For each record, output a subset of the values as an array (suitable for passing to `#to_csv`)
+   # supports hierarchical subkeys (e.g. :master:id or "master:name")
+   def project(args)
+     defaults = args[:defaults] || {}
+     map do |h|
+       args[:keys].map do |k|
+         (k.splitkey? && (deref = h[k.superkey]) && deref[k.subkey]) || h[k] || defaults[k] || args[:nilvalue]
+       end
+     end
+   end
+ end
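A few more Array-of-Hashes helpers in action, again on invented records:

    rows = [
      {:dept => "eng"}, {:dept => "eng"}, {:dept => "sales"}
    ].extend(DataTools::ArrayOfHashes)

    rows.histogram(:dept)    # => {"eng" => 2, "sales" => 1}
    rows.count_off!(:id, 1)  # adds :id => 1, 2, 3 to the records
    rows.unique?(:id)        # => true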
data/lib/data_tools/comparator.rb ADDED
@@ -0,0 +1,85 @@
+ # MULTI-MATCHING via components
+ # go through all users
+ # group by distinct sets of components
+ # pick a (small) subset of component-keys, say <10. Maybe random sample?
+ # build a set of matching rules
+ # run the subset * the full corpus * the matching rules
+
+ class Comparator
+   attr_reader :corpus
+
+   def initialize(corpus)
+     @corpus = corpus
+
+     prep_missing_initials
+   end
+
+   def crunch(record)
+     (@corpus - [record]).each_with_object([]) do |candidate,matches|
+       if evaluate(record, candidate)
+         matches << candidate
+       end
+     end
+   end
+
+   def evaluate(record, candidate)
+     [:missing_initials].each do |rule|
+       return true if send(rule, record, candidate)
+     end
+     false
+   end
+
+   # don't need an 'identical' test - assuming that the input record does not appear in the corpus
+   # def identical(a,b)
+   # a == b
+   # end
+
+   # must have at least 2 long (non-initial-only) components in each
+   # those long parts must be identical
+   # only one of the names can have any initials
+   def missing_initials(a,b)
+     longnames_a = a.select {|s| s.length > 1}
+     longnames_b = b.select {|s| s.length > 1}
+     inits_a = a.select {|s| s.length == 1}
+     inits_b = b.select {|s| s.length == 1}
+
+     longnames_a.count >= 2 && longnames_b.count >= 2 && longnames_a == longnames_b && (inits_a.empty? || inits_b.empty?)
+   end
+
+   def prep_missing_initials
+     @corpus_missing_initials = corpus.each_with_object(Set.new) do |rec,set|
+       without_initials = rec.select {|s| s.length > 1}
+       if without_initials.count >= 2
+         set << without_initials
+       end
+     end
+   end
+
+   # must have at least 1 long (non-initial-only) component in each
+   # those long parts must be identical
+   # all initials should correspond to non-matched longnames in the other input
+   def matching_initials(a,b)
+     longnames_a = a.select {|s| s.length > 1}
+     longnames_b = b.select {|s| s.length > 1}
+     inits_a = a.select {|s| s.length == 1}
+     inits_b = b.select {|s| s.length == 1}
+
+     return false unless longnames_a.count >= 1 && longnames_b.count >= 1
+
+     unmatched_longnames_a = longnames_a - longnames_b
+     unmatched_longnames_b = longnames_b - longnames_a
+     unmatched_inits_a = unmatched_longnames_a.map {|s| s[0]}
+     unmatched_inits_b = unmatched_longnames_b.map {|s| s[0]}
+
+     inits_a == unmatched_inits_b && inits_b == unmatched_inits_a
+   end
+
+   # ignore any initials. look for cases where there is exactly one name component that differs between the inputs.
+   def matching_all_but_one(a,b)
+     longnames_a = a.select {|s| s.length > 1}
+     longnames_b = b.select {|s| s.length > 1}
+
+     ((longnames_a | longnames_b) - (longnames_a & longnames_b)).count == 1
+   end
+
+ end
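A rough sketch of the Comparator flow, with names exploded into sorted, upcased components as in the spec further down (corpus invented):

    corpus = ["michael g palmer", "michael palmer"].map {|n| n.split.map(&:upcase).sort}
    comp = Comparator.new(corpus)
    comp.crunch(["MICHAEL", "PALMER"])  # => [["G", "MICHAEL", "PALMER"]]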
data/lib/data_tools/conversions.rb ADDED
@@ -0,0 +1,46 @@
+ module DataTools::Conversions
+   def self.noop(value)
+     value
+   end
+
+   # MSAD uses INT64 (8 bytes) for lastLogon, lastLogonTimestamp, accountExpires
+   def self.msad_long_timestamp(value)
+     case value.to_i
+     when 0, 0x7FFFFFFFFFFFFFFF
+       nil
+     else
+       DateTime.new(1601, 1, 1) + value.to_i/(60.0 * 10000000 * 1440)
+     end
+   end
+
+   def self.readable_timestamp(value)
+     DateTime.parse(value)
+   end
+
+   def self.first_ou(value)
+     (ou = value.split(',').select{|s| s =~ /^OU=/}.first) && ou.split('=').last
+   end
+
+   def self.second_ou(value)
+     (ou = value.split(',').select{|s| s =~ /^OU=/}[1]) && ou.split('=').last
+   end
+
+   def self.msad_active_account(value)
+     value.to_i & 2 == 0
+   end
+
+   def self.datestr(value)
+     value.strftime("%m/%d/%Y")
+   end
+
+   def self.max_datestr(values)
+     (dt = values.compact.max) && dt.strftime("%m/%d/%Y")
+   end
+
+   # def self.difference_in_days(start_ts, end_ts1, end_ts2 = nil)
+   def self.difference_in_days(args)
+     start_ts, end_ts1, end_ts2 = *args
+     end_ts = end_ts1 || end_ts2
+     end_ts && start_ts && (end_ts.to_date - start_ts.to_date).to_i
+   end
+ end
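Sample conversions, per the code above (input strings invented):

    DataTools::Conversions.msad_long_timestamp("0")                    # => nil (MSAD's "never" sentinel)
    DataTools::Conversions.first_ou("CN=jdoe,OU=Staff,OU=US,DC=corp")  # => "Staff"
    DataTools::Conversions.msad_active_account("514")                  # => false (the disable bit is set)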
data/lib/data_tools/enumerator.rb ADDED
@@ -0,0 +1,13 @@
+ module DataTools::Enumerator
+   def csvme(outputstream, fields, headers = fields)
+     outputstream.puts headers.to_csv
+     each do |hash|
+       outputstream.puts hash.pluck(fields).to_csv
+     end
+     outputstream
+   end
+ end
+
+ class Enumerator
+   include DataTools::Enumerator
+ end
data/lib/data_tools/file.rb ADDED
@@ -0,0 +1,5 @@
+ class File
+   def self.[](filename)
+     open(filename)
+   end
+ end
data/lib/data_tools/hash.rb ADDED
@@ -0,0 +1,226 @@
+ module DataTools::Hash
+   # construct a hash of changes needed to convert from an original hash to the new set of values
+   # keys in the original that do not appear in the new hash should appear in the diff with nil values
+   # EXCEPT that *symbol* keys from the original that *do not appear* (a nil value means it still appears) in the new hash should be ignored
+   def diffs_from(orig)
+     (self.keys | orig.keys).inject({}) do |diffs,key|
+       if key.is_a?(Symbol) && !self.include?(key)
+         # ignore this
+       elsif orig[key] != self[key]
+         diffs[key] = self[key]
+       end
+       diffs
+     end
+   end
+
+   # construct a key field for the has based on the list of fields provided
+   # options:
+   # :strip (true/false, default = true): remove leading & trailing whitespace from each value
+   # :truncate (integer): set maximum length for each value; truncate BEFORE stripping
+   def key_for(keyarray, opts = {})
+     opts[:strip] = true unless opts.has_key?(:strip)
+     meth = lambda do |k|
+       v = self[k]
+       v = v[0,opts[:truncate]] if opts[:truncate]
+       v = v.strip if opts[:strip] && v.is_a?(String)
+       v
+     end
+     this_key = keyarray.map(&meth) #{|k| self[k].strip}
+     return nil if this_key.all? {|v| v.nil?}
+     return this_key.first if this_key.count == 1 # turn single-field keys into single values, not arrays
+     if opts[:delim]
+       this_key.join(opts[:delim])
+     else
+       this_key
+     end
+   end
+
+   # for a Hash where all the values are Arrays
+   # hash2 should also be a hash of key/array pairs
+   # find all the cases where keys appear in both source hashes
+   def pair_off(hash2)
+     pairs = {}
+     each do |k,ary|
+       if hash2[k] && hash2[k].any?
+         pairs[k] = [ary, hash2[k]]
+       end
+     end
+     pairs
+   end
+
+   # same as `pair_off`, except that it chooses the partner key by calling a block
+   # rather than doing a strict comparison
+   def pair_off_by(hash2, &block)
+     pairs = {}
+     each do |k,ary|
+       k2 = block.call(k)
+       if hash2[k2] && hash2[k2].any?
+         pairs[k] = [ary, hash2[k2]]
+       end
+     end
+     pairs
+   end
+
+   # destructive version of `#pair_off` above.
+   # when matching keys are found, the keys are removed from both source hashes.
+   def pair_off!(hash2)
+     pairs = {}
+     each do |k,ary|
+       if hash2[k].any?
+         pairs[k] = [ary, hash2[k]]
+         delete(k)
+         hash2.delete(k)
+       end
+     end
+     pairs
+   end
+
+   def pair_off_by!(hash2, &block)
+     pairs = {}
+     each do |k,ary|
+       k2 = block.call(k)
+       if hash2[k2] && hash2[k2].any?
+         pairs[k] = [ary, hash2[k2]]
+         delete(k)
+         hash2.delete(k2)
+       end
+     end
+     pairs
+   end
+
+   def dumpme(filename)
+     raise "#{filename} exists" if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+   def dumpme!(filename)
+     File.unlink(filename) if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+
+   # HASH OF ARRAYS
+   def append(hash2)
+     (self.keys | hash2.keys).inject({}) {|h,k| h[k] = Array(self[k]) + Array(hash2[k]); h}
+   end
+
+   # HASH OF HASHES
+   # compare to another hash-of-hashes (aka changes, deltas, diffs)
+   # report the changes between a current state and a future state (hash2)
+   # each of the four sections (new elements, lost elements, unchanged elements, changes) is another hash-of-hashes
+   def compare(hash2)
+     newkeys = hash2.keys - self.keys
+     lostkeys = self.keys - hash2.keys
+     commonkeys = self.keys & hash2.keys
+
+     unchanged = []
+     changes = {}
+     commonkeys.each do |k|
+       if (diffs = hash2[k].diff(self[k])).any?
+         changes[k] = diffs
+       else
+         unchanged << k
+       end
+     end
+
+     {
+       :new => hash2.slice(*newkeys),
+       :lost => self.slice(*lostkeys),
+       :unchanged => self.slice(*unchanged),
+       :changes => changes
+     }
+   end
+
+   # convert specified fields to integers
+   def numify!(*keyarray)
+     keyarray.each do |k|
+       self[k] = self[k].to_i if self[k]
+     end
+     self
+   end
+
+   # ARRAY OF HASHES
+   # correlated(:with => correlation-hash, :by => key-field)
+   # pull subset that have mappings in the correlation hash
+   def correlated?(args = {})
+     with = args[:with]
+     through = args[:through]
+     onkey = args[:onkey]
+
+     my_keys = keys
+     correlation_keys = through.keys
+
+     mismatches = select do |k,h|
+       this_match = h[onkey]
+       should_match = through[k] && with[through[k]]
+       this_match != should_match
+     end
+     unmatched = correlation_keys - my_keys
+     mismatches | unmatched
+     # should be any empty array
+     # select {|h| args[:with][h.key_for(args[:by], :delim => nil)]}
+   end
+
+   # apply correlations
+   # correlate!(:with => hash2, :through => mapping-hash, :onkey => attribute-to-record-mapping-in)
+   # replaces any existing correlations (the `:on` field will be set to nil where the key does not appear in the correlation hash)
+   def correlate!(args = {})
+     with = args[:with]
+     through = args[:through]
+     onkey = args[:onkey]
+     raise "Missing argument" if args[:onkey].nil?
+     each do |k,h|
+       this_match = through[k] && with[through[k]]
+       h[onkey] = this_match
+     end
+   end
+
+   # remove all the keys that contain nil values (or specify a "nil" value for sources that fill in empty records with special nil placeholders)
+   def nilify!(nilvalue = nil)
+     each do |k,v|
+       self.delete(k) if v == nilvalue
+     end
+   end
+
+   # # HASH OF ARRAYS
+   # def coalesce!(args)
+   # rules = args[:per]
+   # rules.each do |from, to|
+   # if self[to].nil?
+   # raise "cannot merge #{from} into #{to}, destination does not exist"
+   # end
+   # if self[from].nil?
+   # $stderr.puts "cannot merge #{from} into #{to}, source does not exist, ignoring"
+   # next
+   # end
+   # self[to] += self[from]
+   # self.delete(from)
+   # end
+   # self
+   # end
+
+   def cleanse(options = {})
+     each_with_object({}) do |(k,v), out|
+       out[k] = DataTools.scour(v, options)
+       if dateformat = options[:datefields][k]
+         begin
+           out[k] = DateTime.strptime(v, dateformat).to_time
+         rescue
+           warn "invalid #{k} (expected #{dateformat}): #{rec}"
+         end
+       end
+     end
+   end
+
+   def subset(keys)
+     map do |h|
+       h.select {|k,v| keys.include? k}
+     end
+   end
+
+   def pluck(*keys)
+     keys.flatten.map {|k| self[k]}
+   end
+ end
+
+ class Hash
+   include DataTools::Hash
+ end
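A short sketch of two of the Hash extensions (hashes invented):

    current = {"id" => 1, "name" => "Bob", "city" => "Sunnyvale"}
    updated = {"id" => 1, "name" => "Robert"}

    updated.diffs_from(current)                     # => {"name" => "Robert", "city" => nil}
    current.key_for(["id", "name"], :delim => "|")  # => "1|Bob"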
data/lib/data_tools/hash_of_arrays.rb ADDED
@@ -0,0 +1,20 @@
+ # keys can be anything
+ # values are always arrays
+
+ module DataTools::HashOfArrays
+   def append(hash2)
+     (self.keys | hash2.keys).inject({}) {|h,k| h[k] = Array(self[k]) + Array(hash2[k]); h}
+   end
+
+   def coalesce(key1, args)
+     key2 = args[:into] or raise "usage: coalesce(key1, :into => key)"
+     self[key2] += self[key1]
+     delete(key1)
+   end
+
+   def choose
+     each_with_object({}) do |(key, values), result|
+       result[key] = yield values
+     end
+   end
+ end
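For example (invented data):

    hoa = {"one" => [3, 1, 2], "two" => [5]}.extend(DataTools::HashOfArrays)
    hoa.choose {|values| values.max}          # => {"one" => 3, "two" => 5}
    hoa.append("two" => [7], "three" => [9])  # => {"one" => [3, 1, 2], "two" => [5, 7], "three" => [9]}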
data/lib/data_tools/io.rb ADDED
@@ -0,0 +1,55 @@
+ require "csv"
+
+ module DataTools::IO
+   def unmarshal
+     Marshal.load(self)
+   end
+
+   def headers
+     @import_headers ||= @import_options[:headers] || behead
+   end
+
+   def split(line)
+     case import_options[:format]
+     when :tsv
+       line.split("\t")
+     when :qcq
+       line.split('","')
+     else # default is CSV
+       line.parse_csv
+     end
+   end
+
+   def parseline(line)
+     split(line.chomp)
+   end
+
+   def import_options
+     @import_options ||= {
+       junkwords: [],
+       datefields: {}
+     }
+   end
+
+   def configure_import(options)
+     import_options.merge!(options)
+   end
+
+   def import(opts = {}) # expects a block
+     configure_import(opts)
+     headers = opts[:headers] || parseline(readline)
+     # warn "HEADERS ARE #{headers}"
+     Enumerator.new do |yielder|
+       self.each do |line|
+         rec = Hash[headers.zip(parseline(line))]
+         rec.extend DataTools::Hash
+         yielder.yield rec.cleanse(import_options)
+       end
+       # need to emit anything to trigger a file-completed action? (such as pushing a batch to storage)
+     end
+   end
+ end
+
+ class IO
+   include DataTools::IO
+ end
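A sketch of the import flow; `people.csv` here is a hypothetical file with a header row:

    File.open("people.csv") do |f|
      f.import(:junkwords => ["N/A"]).each do |rec|
        p rec  # each rec is a Hash keyed by the header row, already run through cleanse/scour
      end
    end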
data/lib/data_tools/object.rb ADDED
@@ -0,0 +1,5 @@
+ class Object
+   def vconvert(rule)
+     self && DataTools::Conversions.method(rule).call(self)
+   end
+ end
data/lib/data_tools/rules.rb ADDED
@@ -0,0 +1,39 @@
+ # intent is for classes with array-of-hash behavior to `include` this module, or for instances to `extend` it
+
+ module DataTools::Rules
+   def enhance!(args)
+     raise "missing :rules" unless args[:rules]
+     each do |rec|
+       args[:rules].each do |rule|
+         runrule(rule, rec)
+       end
+     end
+   end
+
+   private
+
+   def runrule(rule, data)
+     begin
+       code = code_for(rule[:rule])
+
+       case rule[:input]
+       when Array
+         data[rule[:output]] = code.call(data.values_at(*rule[:input]))
+       else
+         data[rule[:output]] = code.call(data[rule[:input]]) unless data[rule[:input]].nil?
+       end
+     rescue Exception => e
+       STDERR.puts "RULE #{rule[:rule]} FAILED: #{e.to_s} WITH INPUTS #{data.values_at(*rule[:input]).inspect}"
+       raise
+     end
+   end
+
+   def code_for(rule)
+     case rule
+     when Symbol
+       DataTools::Conversions.method(rule)
+     else
+       rule
+     end
+   end
+ end
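The spec further down exercises lambda rules; a Symbol rule instead dispatches to `DataTools::Conversions`, roughly like this (record invented):

    rows = [{:dn => "CN=jdoe,OU=Staff,DC=corp"}].extend(DataTools::Rules)
    rows.enhance!(:rules => [{:input => :dn, :output => :ou, :rule => :first_ou}])
    rows.first[:ou]  # => "Staff"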
data/lib/data_tools/string.rb ADDED
@@ -0,0 +1,14 @@
+ class String
+   # identifying keys (strings) that represent hierarchical structures, with format "superkey:subkey"
+   def splitkey?
+     self =~ /:/
+   end
+   # we always interpret the first part as a symbol
+   def superkey
+     split(/:/, 2).first.to_sym
+   end
+   # for STRINGS we always interpret the last part as a string ("resource:name" translates to :resource => name)
+   def subkey
+     split(/:/, 2).last
+   end
+ end
data/lib/data_tools/symbol.rb ADDED
@@ -0,0 +1,14 @@
+ class Symbol
+   # identifying keys (strings) that represent hierarchical structures, with format :"superkey:subkey"
+   def splitkey?
+     to_s =~ /:/
+   end
+   # we always interpret the first part as a symbol
+   def superkey
+     to_s.split(/:/, 2).first.to_sym
+   end
+   # for SYMBOLS we always interpret the last part as a symbol (:"resource:id" translates to :resource => :id)
+   def subkey
+     to_s.split(/:/, 2).last.to_sym
+   end
+ end
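Both the String and Symbol variants back the `project` helper shown earlier, e.g.:

    :"master:id".superkey    # => :master
    :"master:id".subkey      # => :id
    "resource:name".subkey   # => "name"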
data/lib/data_tools/transformations.rb ADDED
@@ -0,0 +1,51 @@
+ module DataTools::Transformations
+   # unraveling the hierarchical group membership structure in Microsoft Active Directory
+   # expand the group information from MSAD "memberOf" fields
+   # flatten the hierarchy, so each account records every group of which it is a member, even through sub-groups
+   def self.expand_msad_groups(hashes)
+     $stderr.puts "Analyzing #{hashes.size} Active Directory records"
+     msad_accounts_by_dn = hashes.key_on('DN')
+     $stderr.puts "Found #{msad_accounts_by_dn.size} distinct DN values"
+
+     # expand the multi-valued memberOf field, and look up each group
+     # WARNING: does not report any cases if the DN for the group does not appear in the hashes, will just leave a nil in the list
+     hashes.each do |hash|
+       hash[:memberof] = (hash['memberOf'] || '').split(';').map {|dn| msad_accounts_by_dn[dn]}
+     end
+     $stderr.puts "Expanded groups on #{hashes.select {|h| h[:memberof].any?}.size} records"
+
+     membership_counts = hashes.map {|h| h[:memberof].size}.sum
+
+     begin
+       $stderr.puts "Found #{membership_counts} memberships, moving up membership hierarchy..."
+       base_membership_counts = membership_counts
+       hashes.each do |hash|
+         hash[:memberof] |= hash[:memberof].map {|g| g[:memberof]}.flatten.uniq
+       end
+       membership_counts = hashes.map {|h| h[:memberof].size}.sum
+       # repeat until no further memberships are found
+     end while membership_counts == base_membership_counts
+   end
+
+   # superseded by rules.rb
+   # def self.enhance(args)
+   # h = args[:hash]
+   # args[:rules].each do |rule|
+   # self.runrule(rule, h)
+   # end
+   # h
+   # end
+   #
+   # def self.runrule(rule, data)
+   # begin
+   # if rule[:input].is_a?(Array)
+   # data[rule[:output]] = data.values_at(*rule[:input]).vconvert(rule[:rule])
+   # else
+   # data[rule[:output]] = data[rule[:input]].vconvert(rule[:rule])
+   # end
+   # rescue Exception => e
+   # STDERR.puts "RULE #{rule[:rule]} FAILED: #{e.to_s} WITH INPUTS #{data.values_at(*rule[:input]).inspect}"
+   # exit
+   # end
+   # end
+ end
data/lib/data_tools/version.rb ADDED
@@ -0,0 +1,3 @@
+ module DataTools
+   VERSION = "0.6.0"
+ end
data/spec/array_spec.rb ADDED
@@ -0,0 +1,31 @@
+ require_relative "spec_helper"
+
+ describe "DataTools Array extensions" do
+   before(:each) do
+     @a = [
+       {:name => "bob", :city => "sunnyvale"},
+       {:name => "phil", :city => "mountain view"}
+     ]
+   end
+
+   it "can do gymnastics" do
+     3.should == 3
+   end
+
+   it "can handle rules" do
+     @a.extend DataTools::Rules
+     rules = [
+       {:input => :name, :output => :upname, :rule => lambda {|x| x.upcase}},
+       {:input => :city, :output => :ytic, :rule => lambda {|x| x.reverse}}
+     ]
+     @a.enhance!(:rules => rules)
+     @a.should == [
+       {:name => "bob", :city => "sunnyvale", :upname => "BOB", :ytic => "elavynnus"},
+       {:name => "phil", :city => "mountain view", :upname => "PHIL", :ytic => "weiv niatnuom"}
+     ]
+   end
+
+   # pending "isn't ready yet" do
+   # 4.should == 5
+   # end
+ end
data/spec/comparator_spec.rb ADDED
@@ -0,0 +1,59 @@
+ require_relative "spec_helper"
+
+ describe "Comparator" do
+   def explode(name)
+     name.gsub(/[,._-]/, ' ').split.map(&:upcase).sort
+   end
+
+   def try(rule, name1, name2)
+     @comp.send(rule, explode(name1), explode(name2)).should be_true
+   end
+
+   def bust(rule, name1, name2)
+     @comp.send(rule, explode(name1), explode(name2)).should be_false
+   end
+
+   before :all do
+     names = [
+       "michael g palmer",
+       "francis l palmer",
+       "michael palmer"
+     ]
+     corpus = names.map {|name| explode(name)}
+     @comp = Comparator.new(corpus)
+   end
+
+   it "finds names that match without initials" do
+     try(:missing_initials, "michael palmer", "michael g palmer")
+     try(:missing_initials, "michael palmer", "Q michael palmer")
+     try(:missing_initials, "michael palmer", "Michael N Palmer x")
+     bust(:missing_initials, "michael palmer", "Michael P")
+     bust(:missing_initials, "michael palmer", "Michael John Palmer")
+
+     matches = @comp.crunch(explode("michael palmer"))
+     matches.should == [explode("michael g palmer")]
+     matches = @comp.crunch(explode("palmer michael"))
+     matches.should == [explode("michael g palmer")]
+     matches = @comp.crunch(explode("michael g palmer"))
+     matches.should == [explode("michael palmer")]
+   end
+
+   it "finds names that match initials to names" do
+     try(:matching_initials, "fred jones", "f jones")
+     try(:matching_initials, "fred jones", "jones f")
+     try(:matching_initials, "fred jones", "fred j")
+     try(:matching_initials, "fred xavier jones", "fred x jones")
+     try(:matching_initials, "fred xavier jones", "xavier jones f")
+     bust(:matching_initials, "fred xavier jones", "fred jones")
+     bust(:matching_initials, "fred xavier jones", "fred q jones")
+     bust(:matching_initials, "fred x jones", "fred q jones")
+     bust(:matching_initials, "fred xavier jones", "homer simpson")
+   end
+
+   it "finds names that match on all but one long names" do
+     try(:matching_all_but_one, "john philip sousa", "john sousa")
+     try(:matching_all_but_one, "philip sousa", "philip john sousa")
+     bust(:matching_all_but_one, "john philip sousa", "philip john sousa")
+     try(:matching_all_but_one, "Helen Q. Glorpworth-Smythe", "helen smythe")
+   end
+ end
data/spec/hash_of_arrays_spec.rb ADDED
@@ -0,0 +1,18 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "Hash of Arrays" do
+   before(:each) do
+     @hoa = {
+       "one" => ["a", "b", "c"],
+       "two" => ["d", "e"],
+       "three" => ["f"]
+     }
+     @hoa.extend DataTools::HashOfArrays
+   end
+
+   it "coalesces" do
+     @hoa.coalesce("one", :into => "three")
+     @hoa.size.should == 2
+     @hoa["three"].should == ["f", "a", "b", "c"]
+   end
+ end
data/spec/import_spec.rb ADDED
@@ -0,0 +1,15 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ require "json"
+
+ describe "File Import" do
+   it "imports" do
+     f = File.open(File.dirname(__FILE__) + "/../hrhead.csv")
+     # sio = f.import.csvme(StringIO.new, ['Person Phone GUID', 'Person Address GUID'])
+     # puts sio.string
+
+     f.import.each_slice(3) do |slice|
+       puts slice.extend(DataTools::ArrayOfHashes).pluck('Person Phone GUID', 'Person Address GUID').to_json
+     end
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,5 @@
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require 'awesome_print'
+
+ require "data_tools"
metadata ADDED
@@ -0,0 +1,120 @@
+ --- !ruby/object:Gem::Specification
+ name: data_tools
+ version: !ruby/object:Gem::Version
+ version: 0.6.0
+ prerelease:
+ platform: ruby
+ authors:
+ - Jason May
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-01-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: awesome_print
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: facets
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ description: Data-munging utilities, including extensions to Array, Hash, String,
+ Symbol plus data conversions and transformations.
+ email: jmay@pobox.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - README.md
+ - Rakefile
+ - data_tools.gemspec
+ - lib/data_tools.rb
+ - lib/data_tools/array.rb
+ - lib/data_tools/array_of_hashes.rb
+ - lib/data_tools/comparator.rb
+ - lib/data_tools/conversions.rb
+ - lib/data_tools/enumerator.rb
+ - lib/data_tools/file.rb
+ - lib/data_tools/hash.rb
+ - lib/data_tools/hash_of_arrays.rb
+ - lib/data_tools/io.rb
+ - lib/data_tools/object.rb
+ - lib/data_tools/rules.rb
+ - lib/data_tools/string.rb
+ - lib/data_tools/symbol.rb
+ - lib/data_tools/transformations.rb
+ - lib/data_tools/version.rb
+ - spec/array_spec.rb
+ - spec/comparator_spec.rb
+ - spec/hash_of_arrays_spec.rb
+ - spec/import_spec.rb
+ - spec/spec_helper.rb
+ homepage: http://github.com/jmay/data_tools
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ requirements: []
+ rubyforge_project: data_tools
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Miscellaneous data-munging utilities.
+ test_files: []