data_tools 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ .irb_history
2
+ .bundle
3
+ pkg
4
+ todo.txt
5
+ .DS_Store
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Resolve gems from the official RubyGems index over HTTPS.
# (Bundler deprecated the `:rubygems` symbol shorthand in favour of an explicit URL.)
source "https://rubygems.org"

# Specify your gem's dependencies in data_tools.gemspec
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,28 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ data_tools (0.6.0)
5
+ awesome_print
6
+ facets
7
+
8
+ GEM
9
+ remote: http://rubygems.org/
10
+ specs:
11
+ awesome_print (1.1.0)
12
+ diff-lcs (1.1.3)
13
+ facets (2.9.3)
14
+ rspec (2.12.0)
15
+ rspec-core (~> 2.12.0)
16
+ rspec-expectations (~> 2.12.0)
17
+ rspec-mocks (~> 2.12.0)
18
+ rspec-core (2.12.2)
19
+ rspec-expectations (2.12.1)
20
+ diff-lcs (~> 1.1.3)
21
+ rspec-mocks (2.12.1)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ data_tools!
28
+ rspec
data/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # README for data_tools
2
+
3
+ Miscellaneous data-munging utility functions.
4
+
5
+ ## Array
6
+
7
+ This is really an Array-of-Hashes.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
# Gem packaging tasks come from Bundler's gem helper.
require "bundler"
Bundler::GemHelper.install_tasks

require "rake"
require "rspec/core/rake_task"

# Registers the :spec task.
desc "Run all RSpec tests"
RSpec::Core::RakeTask.new(:spec)
@@ -0,0 +1,26 @@
1
# -*- encoding: utf-8 -*-
$LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
require 'data_tools/version'

Gem::Specification.new do |s|
  s.name        = "data_tools"
  s.version     = DataTools::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Jason May"]
  s.email       = %q{jmay@pobox.com}
  s.homepage    = "http://github.com/jmay/data_tools"
  s.summary     = %q{Miscellaneous data-munging utilities.}
  s.description = %q{Data-munging utilities, including extensions to Array, Hash, String, Symbol plus data conversions and transformations.}

  # s.required_rubygems_version = ">= 1.3.6"
  # `rubyforge_project=` is deprecated in newer RubyGems; only set it where the setter still exists.
  s.rubyforge_project = "data_tools" if s.respond_to?(:rubyforge_project=)

  s.add_dependency 'awesome_print'
  s.add_dependency 'facets'

  s.add_development_dependency "rspec"

  # Ask git for the tracked files once and derive both lists from it.
  tracked_files = `git ls-files`.split("\n")
  s.files       = tracked_files
  # RubyGems expects bare executable names (relative to bindir), not "bin/<name>" paths.
  # Anchoring on "bin/" also avoids matching unrelated top-level files such as "binary.dat".
  s.executables = tracked_files.grep(%r{\Abin/}).map { |f| File.basename(f) }
  s.require_path = 'lib'
end
data/lib/data_tools.rb ADDED
@@ -0,0 +1,42 @@
1
+ require "ap"
2
+ require "set"
3
+ require "csv"
4
+ require "facets" # for Hash#delete_values
5
+
6
module DataTools
  # Re-load every already-required file whose path mentions "data_tools".
  # Returns the list of paths that were re-loaded.
  def self.reload!
    $".grep(/data_tools/).each { |f| load(f) }
  end

  # Normalize a raw field value into a clean String, or nil when nothing useful is left.
  #
  # s    - the raw value: nil, a String, or anything else that responds to #to_s
  # opts - options hash (optional); :junkwords is a collection of strings that
  #        should be treated as "no value" (defaults to none)
  #
  # For strings: surrounding whitespace is stripped, internal whitespace runs are
  # collapsed to a single space, and one leading / one trailing double-quote is removed.
  # Returns nil for nil input, for strings that end up empty, and for junkwords;
  # every other value is returned as a String.
  def self.scour(s, opts = {})
    case s
    when nil
      nil
    when String
      cleaned = s.strip.gsub(/\s+/, ' ').sub(/\A"/, '').sub(/"\z/, '')
      # looks numeric: normalise through Integer ("0042" => "42").
      # NOTE(review): this also drops any fractional part ("3.7" => "3"); kept as-is
      # for compatibility - confirm that truncation is really wanted.
      cleaned = cleaned.to_i.to_s if cleaned =~ /\A\d+(\.\d+)?\z/
      junkwords = opts[:junkwords] || []
      (cleaned.empty? || junkwords.include?(cleaned)) ? nil : cleaned
    else
      # Numerics and any other object are simply stringified
      s.to_s
    end
  end
end
29
+
30
# Load each component file from the data_tools/ directory next to this file,
# in the fixed order listed here.
%w[
  version
  array hash
  array_of_hashes hash_of_arrays
  enumerator
  comparator
  object string symbol
  file io
  rules
  conversions transformations
].each do |component|
  require File.join(File.dirname(__FILE__), "data_tools", component)
end
@@ -0,0 +1,42 @@
1
module DataTools::Array
  # Turn an array-of-arrays into an array-of-hashes.
  # The headers are used as names for the fields; when no headers are passed the
  # first row is removed from this array (via #shift) and used as the header row.
  # Leading/trailing whitespace is ignored in string header labels and string values.
  # Rows without any truthy cell are skipped; fields whose value is nil, false or
  # empty after stripping are left out of the resulting hash.
  # OK for rows to have fewer fields than the header record, but a longer row raises.
  def hashify(headers = shift)
    # work on stripped copies so the caller's header strings are never modified
    labels = headers.map { |hdr| hdr.is_a?(String) ? hdr.strip : hdr }
    select { |row| row.any? }.map do |row|
      raise "Row count mismatch: #{row}" if row.count > labels.count
      row.zip(labels).each_with_object({}) do |(value, label), record|
        # only values that know how to strip (strings) are stripped
        value = value.strip if value.respond_to?(:strip)
        # ignore any keys with missing values
        missing = value.respond_to?(:empty?) ? value.empty? : !value
        record[label] = value unless missing
      end
    end
  end

  # ARRAY OF SCALARS
  # apply an operation (block) to every member of the array
  # return the list of unique results
  # if there is at most one distinct result, return it as a scalar (nil for an empty array)
  def resolve(&block)
    values = map { |v| block.call(v) }.uniq
    values.count <= 1 ? values.first : values
  end

  # marshal (ruby-specific binary format) the contents of this structure to a file
  # fails if file exists
  def dumpme(filename)
    raise "#{filename} exists" if File.exist?(filename)
    # binary mode: Marshal output is not text
    File.open(filename, "wb") { |f| f << Marshal.dump(self) }
  end

  # same as #dumpme but overwrites existing file
  def dumpme!(filename)
    File.unlink(filename) if File.exist?(filename)
    File.open(filename, "wb") { |f| f << Marshal.dump(self) }
  end
end
@@ -0,0 +1,219 @@
1
module DataTools::ArrayOfHashes
  # convert an array of hashes to a hash of the same hashes
  # where the key values are picked from the hashes (via `Hash#key_for`)
  # the keys can be single fields, or an array, or a list
  # options (optional trailing hash, also passed on to `Hash#key_for`):
  #   :multi (boolean, default false): if true, allow multiple values per key; store values as an array for each key
  #   :first (boolean, default false): if true, when finding multiple values per key, store only the first and ignore the rest
  #   :truncate (integer): see `Hash#key_for`
  # raises when no keys are given, when a record produces a nil key, and
  # (without :multi / :first) when two records produce the same key
  def key_on(*keyarray)
    raise "Key(s) required" if keyarray.empty?
    opts = keyarray.last.is_a?(Hash) ? keyarray.pop : {}
    keyarray = keyarray.flatten
    # an options hash on its own is not a key
    raise "Key(s) required" if keyarray.empty?

    memo = opts[:multi] ? Hash.new { |h, k| h[k] = [] } : Hash.new
    each do |hash|
      this_key = hash.key_for(keyarray, opts)
      raise "Missing value for #{keyarray} in record #{hash}" if this_key.nil?
      if opts[:multi]
        memo[this_key] << hash
      elsif opts[:first]
        # ignore this value if we already have one for this key
        memo[this_key] = hash unless memo.has_key?(this_key)
      else
        raise "Found duplicate #{keyarray} in #{memo[this_key]} vs #{hash}" if memo.has_key?(this_key)
        memo[this_key] = hash
      end
    end
    memo.extend DataTools::HashOfArrays
    # drop the auto-creating default so later lookups of unknown keys return nil
    memo.default = nil
    memo
  end

  # shorthand for `Array#select {|hash| hash[...] && hash[...] ...}`
  # find all the members of the array where all the specified criteria are true
  # conditions is either
  #   a Hash of key => test, where test is a Regexp (value must match), true (value
  #   must be non-nil), false (value must be nil) or any other object (value must == it)
  #   a String or Symbol: the record must have a truthy value for that key
  # an empty Hash matches every record; any other argument type raises ArgumentError
  # the result is extended with DataTools::ArrayOfHashes
  def where(conditions)
    matches = case conditions
    when Hash
      select do |record|
        # all tests must pass
        conditions.all? do |key, expected|
          value = record[key]
          case expected
          when Regexp
            value.respond_to?(:=~) && value =~ expected
          when true
            !value.nil?
          when false
            value.nil?
          else
            value == expected
          end
        end
      end
    when String, Symbol
      # just check for presence & non-nil value of specified key
      select { |record| record[conditions] }
    else
      raise ArgumentError, "conditions must be a Hash, String or Symbol (got #{conditions.class})"
    end
    matches.extend DataTools::ArrayOfHashes
  end

  # are all the values for `key` defined and unique?
  def unique?(*keyarray)
    raise "Key(s) required" if keyarray.empty?
    keyarray = keyarray.flatten
    keys = map { |hash| hash.key_for(keyarray) }
    return false if keys.any?(&:nil?)
    keys.uniq.count == self.count
  end

  # the Set of distinct key values (as produced by `Hash#key_for`) across all records
  def unique_values_for(*keyarray)
    raise "Key(s) required" if keyarray.empty?
    keyarray = keyarray.flatten
    map { |hash| hash.key_for(keyarray) }.to_set
  end

  # assign unique IDs to every hash in the array
  # first argument is the name of the field to use for the generated sequential key,
  # second argument is the number given to the first record
  # raises if any record already has a truthy value in that field; returns self
  def count_off!(key = :key, start = 0)
    raise "Values exist for [#{key}]" if any? { |h| h[key] }
    each_with_index { |hash, i| hash[key] = i + start }
    self
  end

  # group the records with `key_on(..., :multi => true)` and keep only the keys
  # that are shared by more than one record
  def redundant(*keyarray)
    key_on(keyarray, :multi => true).select { |k, v| v.count > 1 }
  end

  # combine a set of hashes into one
  # for each key, find all the distinct non-nil values from all the hashes
  # if there's one unique value, store the single value in key of the result
  # if there are multiple values, store them all as an array
  def coalesce
    allkeys.each_with_object({}) do |key, combined|
      values = map { |h| h[key] }.compact.uniq
      combined[key] = values.count <= 1 ? values.first : values
    end
  end

  # apply the same resolution operation to every hash in the list
  # works on copies: the original hashes are not modified
  def resolve_all(key, &block)
    map do |hash|
      hash = hash.dup
      hash[key] = hash[key].resolve(&block)
      hash
    end
  end

  # marshal (ruby-specific binary format) the contents of this structure to a file
  # fails if file exists
  def dumpme(filename)
    raise "#{filename} exists" if File.exist?(filename)
    # binary mode: Marshal output is not text
    File.open(filename, "wb") { |f| f << Marshal.dump(self) }
  end

  # same as #dumpme but overwrites existing file
  def dumpme!(filename)
    File.unlink(filename) if File.exist?(filename)
    File.open(filename, "wb") { |f| f << Marshal.dump(self) }
  end

  # attempt to dump out contents of this array-of-hashes as CSV to named file
  # fields is list of attribute names to write out
  # optional headers is public names for the fields (pass nil for no header row)
  def csvme(filename, fields, headers = fields)
    CSV.open(filename, "wb") do |csv|
      csv << headers unless headers.nil?
      pluck(fields).each { |ary| csv << ary }
    end
    true
  end

  # same as #csvme, but writes tab-separated lines (no quoting or escaping is applied)
  def tsvme(filename, fields, headers = fields)
    File.open(filename, "w") do |output|
      output.puts headers.join("\t") unless headers.nil?
      pluck(fields).each { |ary| output.puts ary.join("\t") }
    end
    true
  end

  # What different keys appear in this collection of hashes?
  # (in order of first appearance)
  def allkeys
    flat_map { |h| h.keys }.uniq
  end

  # for every key that appears in the collection, report how many records have a
  # non-nil value (:non_nil), how many have none (:nil) and how many distinct
  # values occur (:unique, nil counts as a value); when there are at most 10
  # distinct values the histogram is included as :values
  def metrics
    allkeys.each_with_object({}) do |k, m|
      values = map { |h| h[k] }
      present = values.compact.count
      m[k] = {
        :non_nil => present,
        :nil => values.count - present,
        :unique => values.uniq.count
      }
      m[k][:values] = histogram(k) if m[k][:unique] <= 10
    end
  end

  # convert the named fields to integers in every record (see `Hash#numify!`)
  def numify!(*keyarray)
    each { |h| h.numify!(*keyarray) }
  end

  # drop placeholder values from every record (see `Hash#nilify!`)
  def nilify!(keyvalue)
    each { |h| h.nilify!(keyvalue) }
  end

  # return histogram of value distribution for the specified key: hash of value/count pairs
  # when a block is given the counted value is the block's result for each record instead
  def histogram(*args, &block)
    each_with_object(Hash.new(0)) do |h, hist|
      v = block_given? ? yield(h) : h[args.first]
      hist[v] += 1
    end
  end

  # hash slice for all the named attributes from each of the hashes in the array
  def subset(*keys)
    keys = keys.flatten
    map { |h| h.select { |k, _v| keys.include?(k) } }
  end

  # pull out all the named attributes from the hashes in the array (into array-of-arrays)
  def pluck(*keys)
    keys = keys.flatten
    map { |h| h.values_at(*keys) }
  end

  # For each record, output a subset of the values as an array (suitable for passing to `#to_csv`)
  # supports hierarchical subkeys (e.g. :master:id or "master:name")
  # args: :keys (required list of keys), :defaults (hash of per-key fallbacks),
  #       :nilvalue (used when nothing else produced a truthy value)
  def project(args)
    defaults = args[:defaults] || {}
    map do |h|
      args[:keys].map do |k|
        (k.splitkey? && (deref = h[k.superkey]) && deref[k.subkey]) || h[k] || defaults[k] || args[:nilvalue]
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # MULTI-MATCHING via components
2
+ # go through all users
3
+ # group by distinct sets of components
4
+ # pick a (small) subset of component-keys, say <10. Maybe random sample?
5
+ # build a set of matching rules
6
+ # run the subset * the full corpus * the matching rules
7
+
8
class Comparator
  attr_reader :corpus

  # corpus is the list of records to match against; each record is a list of
  # name components (strings)
  def initialize(corpus)
    @corpus = corpus

    prep_missing_initials
  end

  # every corpus entry, other than the record itself, that `evaluate` accepts
  def crunch(record)
    (@corpus - [record]).select { |candidate| evaluate(record, candidate) }
  end

  # true when any of the active rules (currently just :missing_initials) matches
  def evaluate(record, candidate)
    [:missing_initials].any? { |rule| send(rule, record, candidate) }
  end

  # don't need an 'identical' test - assuming that the input record does not appear in the corpus
  # def identical(a,b)
  #   a == b
  # end

  # must have at least 2 long (non-initial-only) components in each
  # those long parts must be identical
  # only one of the names can have any initials
  def missing_initials(a,b)
    long_a = long_parts(a)
    long_b = long_parts(b)

    return false unless long_a.count >= 2 && long_b.count >= 2
    return false unless long_a == long_b
    initials(a).empty? || initials(b).empty?
  end

  # remember the long-component list of every corpus record that has at least 2 long components
  def prep_missing_initials
    @corpus_missing_initials = corpus.
      map { |rec| long_parts(rec) }.
      select { |parts| parts.count >= 2 }.
      to_set
  end

  # must have at least 1 long (non-initial-only) component in each
  # those long parts must be identical
  # all initials should correspond to non-matched longnames in the other input
  def matching_initials(a,b)
    long_a = long_parts(a)
    long_b = long_parts(b)
    return false if long_a.empty? || long_b.empty?

    # first letters of the long names that appear on one side only
    leftover_a = (long_a - long_b).map { |s| s[0] }
    leftover_b = (long_b - long_a).map { |s| s[0] }

    initials(a) == leftover_b && initials(b) == leftover_a
  end

  # ignore any initials. look for cases where there is exactly one name component that differs between the inputs.
  def matching_all_but_one(a,b)
    long_a = long_parts(a)
    long_b = long_parts(b)
    shared = long_a & long_b

    ((long_a | long_b) - shared).count == 1
  end

  private

  # components longer than a single character
  def long_parts(parts)
    parts.select { |s| s.length > 1 }
  end

  # components that are exactly one character long
  def initials(parts)
    parts.select { |s| s.length == 1 }
  end
end
@@ -0,0 +1,46 @@
1
module DataTools::Conversions
  class << self
    # identity conversion: hands the value back untouched
    def noop(value)
      value
    end

    # MSAD uses INT64 (8 bytes) for lastLogon, lastLogonTimestamp, accountExpires
    # 0 and the maximum signed 64-bit value convert to nil; anything else is
    # scaled to a fractional day count and added to 1601-01-01
    def msad_long_timestamp(value)
      ticks = value.to_i
      return nil if ticks == 0 || ticks == 0x7FFFFFFFFFFFFFFF

      ticks_per_day = 60.0 * 10000000 * 1440
      DateTime.new(1601, 1, 1) + ticks / ticks_per_day
    end

    # parse a date/time string with DateTime.parse
    def readable_timestamp(value)
      DateTime.parse(value)
    end

    # the text after the last "=" of the first "OU=..." component of a
    # comma-separated value; nil when there is no such component
    def first_ou(value)
      nth_ou(value, 0)
    end

    # same as first_ou, for the second "OU=..." component
    def second_ou(value)
      nth_ou(value, 1)
    end

    # true when bit 2 of the (integer-converted) value is not set
    def msad_active_account(value)
      (value.to_i & 2).zero?
    end

    # format a date-like value as MM/DD/YYYY
    def datestr(value)
      value.strftime("%m/%d/%Y")
    end

    # the largest non-nil entry of the list, formatted as MM/DD/YYYY
    # (nil when the list has no non-nil entries)
    def max_datestr(values)
      latest = values.compact.max
      latest && latest.strftime("%m/%d/%Y")
    end

    # def self.difference_in_days(start_ts, end_ts1, end_ts2 = nil)
    # args is [start, end, fallback_end]; the fallback is used when end is missing
    # result is the whole number of days between the two dates, or the missing
    # (nil/false) timestamp itself when either side is absent
    def difference_in_days(args)
      start_ts, end_ts1, end_ts2 = *args
      end_ts = end_ts1 || end_ts2
      return end_ts unless end_ts
      return start_ts unless start_ts

      (end_ts.to_date - start_ts.to_date).to_i
    end

    private

    # shared lookup behind first_ou / second_ou
    def nth_ou(value, index)
      ou = value.split(',').select { |s| s =~ /^OU=/ }[index]
      ou && ou.split('=').last
    end
  end
end
@@ -0,0 +1,13 @@
1
module DataTools::Enumerator
  # Write the hashes produced by this enumerator to an output stream as CSV.
  # outputstream - anything that responds to #puts
  # fields       - the key (or list of keys) to write for every record, in order
  # headers      - public names for the fields; defaults to the field names,
  #                pass nil to suppress the header line
  # Returns the output stream.
  def csvme(outputstream, fields, headers = fields)
    keys = [fields].flatten
    outputstream.puts headers.to_csv unless headers.nil?
    each do |hash|
      outputstream.puts hash.values_at(*keys).to_csv
    end
    outputstream
  end
end

# make #csvme available on every Enumerator
class Enumerator
  include DataTools::Enumerator
end
@@ -0,0 +1,5 @@
1
class File
  class << self
    # File["some/path"] is shorthand for opening that path with the default mode.
    # The opened file is returned; closing it is left to the caller.
    def [](filename)
      open(filename)
    end
  end
end
@@ -0,0 +1,226 @@
1
module DataTools::Hash
  # construct a hash of changes needed to convert from an original hash to the new set of values
  # keys in the original that do not appear in the new hash should appear in the diff with nil values
  # EXCEPT that *symbol* keys from the original that *do not appear* (a nil value means it still appears) in the new hash should be ignored
  def diffs_from(orig)
    (keys | orig.keys).each_with_object({}) do |key, diffs|
      next if key.is_a?(::Symbol) && !include?(key)
      diffs[key] = self[key] if orig[key] != self[key]
    end
  end

  # construct a key field for the hash based on the list of fields provided
  # options (the hash passed in is not modified):
  #   :strip (true/false, default = true): remove leading & trailing whitespace from each string value
  #   :truncate (integer): set maximum length for each string/array value; truncate BEFORE stripping
  #   :delim (string): join multi-field keys into one string with this delimiter
  # returns nil when every field is nil, the bare value for a single-field key,
  # otherwise an array of values (or the joined string when :delim is given)
  def key_for(keyarray, opts = {})
    strip = opts.fetch(:strip, true)
    limit = opts[:truncate]
    this_key = keyarray.map do |k|
      v = self[k]
      # only sequences can be shortened; nil and numbers pass through untouched
      v = v[0, limit] if limit && (v.is_a?(::String) || v.is_a?(::Array))
      v = v.strip if strip && v.is_a?(::String)
      v
    end
    return nil if this_key.all? { |v| v.nil? }
    return this_key.first if this_key.count == 1 # turn single-field keys into single values, not arrays
    opts[:delim] ? this_key.join(opts[:delim]) : this_key
  end

  # for a Hash where all the values are Arrays
  # hash2 should also be a hash of key/array pairs
  # find all the cases where keys appear in both source hashes
  # (result maps each such key to [our array, hash2's array])
  def pair_off(hash2)
    pair_off_by(hash2) { |k| k }
  end

  # same as `pair_off`, except that it chooses the partner key by calling a block
  # rather than doing a strict comparison
  def pair_off_by(hash2, &block)
    each_with_object({}) do |(k, ary), pairs|
      partner = hash2[block.call(k)]
      pairs[k] = [ary, partner] if partner && partner.any?
    end
  end

  # destructive version of `#pair_off` above.
  # when matching keys are found, the keys are removed from both source hashes.
  def pair_off!(hash2)
    pair_off_by!(hash2) { |k| k }
  end

  # destructive version of `#pair_off_by` above.
  def pair_off_by!(hash2, &block)
    pairs = {}
    # walk a snapshot of the keys, because entries are deleted along the way
    keys.each do |k|
      k2 = block.call(k)
      partner = hash2[k2]
      next unless partner && partner.any?
      pairs[k] = [self[k], partner]
      delete(k)
      hash2.delete(k2)
    end
    pairs
  end

  # marshal (ruby-specific binary format) the contents of this structure to a file
  # fails if file exists
  def dumpme(filename)
    raise "#{filename} exists" if File.exist?(filename)
    # binary mode: Marshal output is not text
    File.open(filename, "wb") { |f| f << Marshal.dump(self) }
  end

  # same as #dumpme but overwrites existing file
  def dumpme!(filename)
    File.unlink(filename) if File.exist?(filename)
    File.open(filename, "wb") { |f| f << Marshal.dump(self) }
  end

  # HASH OF ARRAYS
  # new hash with, for every key of either hash, our values followed by hash2's values
  def append(hash2)
    (keys | hash2.keys).each_with_object({}) do |k, combined|
      combined[k] = Array(self[k]) + Array(hash2[k])
    end
  end

  # HASH OF HASHES
  # compare to another hash-of-hashes (aka changes, deltas, diffs)
  # report the changes between a current state and a future state (hash2)
  # each of the four sections (new elements, lost elements, unchanged elements, changes) is another hash-of-hashes
  # NOTE(review): relies on Hash#diff, which core Ruby does not define - presumably
  # supplied by the facets gem; confirm before changing.
  def compare(hash2)
    newkeys = hash2.keys - keys
    lostkeys = keys - hash2.keys
    commonkeys = keys & hash2.keys

    unchanged = []
    changes = {}
    commonkeys.each do |k|
      if (diffs = hash2[k].diff(self[k])).any?
        changes[k] = diffs
      else
        unchanged << k
      end
    end

    {
      :new => hash2.slice(*newkeys),
      :lost => slice(*lostkeys),
      :unchanged => slice(*unchanged),
      :changes => changes
    }
  end

  # convert specified fields to integers (fields without a value are left alone)
  # returns self
  def numify!(*keyarray)
    keyarray.each do |k|
      self[k] = self[k].to_i if self[k]
    end
    self
  end

  # HASH OF HASHES
  # correlated?(:with => hash2, :through => mapping-hash, :onkey => attribute-holding-the-mapping)
  # check the recorded correlations against the mapping
  # returns the list of keys that are wrong: our keys whose :onkey value differs from
  # with[through[key]], plus the keys of the mapping that we do not have at all
  # the list is empty when everything lines up - test it with `.empty?`, because
  # an empty Array is still truthy
  def correlated?(args = {})
    with = args[:with]
    through = args[:through]
    onkey = args[:onkey]

    mismatched = select do |k, h|
      should_match = through[k] && with[through[k]]
      h[onkey] != should_match
    end.keys
    unmatched = through.keys - keys
    mismatched | unmatched
  end

  # apply correlations
  # correlate!(:with => hash2, :through => mapping-hash, :onkey => attribute-to-record-mapping-in)
  # replaces any existing correlations (the `:onkey` field will be set to nil where the key does not appear in the correlation hash)
  def correlate!(args = {})
    onkey = args[:onkey]
    raise "Missing argument" if onkey.nil?
    with = args[:with]
    through = args[:through]
    each do |k, h|
      h[onkey] = through[k] && with[through[k]]
    end
  end

  # remove all the keys that contain nil values (or specify a "nil" value for sources that fill in empty records with special nil placeholders)
  # returns self
  def nilify!(nilvalue = nil)
    delete_if { |_k, v| v == nilvalue }
  end

  # # HASH OF ARRAYS
  # def coalesce!(args)
  #   rules = args[:per]
  #   rules.each do |from, to|
  #     if self[to].nil?
  #       raise "cannot merge #{from} into #{to}, destination does not exist"
  #     end
  #     if self[from].nil?
  #       $stderr.puts "cannot merge #{from} into #{to}, source does not exist, ignoring"
  #       next
  #     end
  #     self[to] += self[from]
  #     self.delete(from)
  #   end
  #   self
  # end

  # build a cleaned-up copy of this hash: every value goes through DataTools.scour,
  # and fields listed in options[:datefields] (key => strptime format) are parsed
  # into Time objects
  # a value that cannot be parsed keeps its scoured form and a warning is printed
  # options: :junkwords (see DataTools.scour) and :datefields; both are optional
  def cleanse(options = {})
    scour_opts = options.merge(:junkwords => options[:junkwords] || [])
    datefields = options[:datefields] || {}
    each_with_object({}) do |(k, v), out|
      out[k] = DataTools.scour(v, scour_opts)
      dateformat = datefields[k]
      next unless dateformat
      begin
        out[k] = DateTime.strptime(v, dateformat).to_time
      rescue StandardError
        warn "invalid #{k} (expected #{dateformat}): #{inspect}"
      end
    end
  end

  # hash slice: a new hash holding only the entries whose key is in `keys`
  def subset(keys)
    select { |k, _v| keys.include?(k) }
  end

  # the values for the named keys, in the order asked for (nested key lists are flattened)
  def pluck(*keys)
    keys.flatten.map { |k| self[k] }
  end
end

# make the helpers available on every Hash
class Hash
  include DataTools::Hash
end
@@ -0,0 +1,20 @@
1
+ # keys can be anything
2
+ # values are always arrays
3
+
4
module DataTools::HashOfArrays
  # new hash with, for every key of either hash, our values followed by hash2's values
  def append(hash2)
    (keys | hash2.keys).each_with_object({}) do |k, combined|
      combined[k] = Array(self[k]) + Array(hash2[k])
    end
  end

  # move all the values stored under key1 to the end of the list stored under
  # the key given as :into, then remove key1
  # a missing source or destination list is treated as empty
  # coalescing a key into itself leaves the hash untouched
  # returns the list that was stored under key1 (nil when there was none)
  # raises when :into is not given
  def coalesce(key1, args)
    key2 = args[:into] || raise("usage: coalesce(key1, :into => key)")
    return self[key1] if key1 == key2
    self[key2] = Array(self[key2]) + Array(self[key1])
    delete(key1)
  end

  # new hash with the same keys, where each value is whatever the block returns
  # for that key's list of values
  def choose
    each_with_object({}) do |(key, values), result|
      result[key] = yield values
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require "csv"
2
+
3
module DataTools::IO
  # load a marshalled object from this stream
  # SECURITY: Marshal.load can instantiate arbitrary objects - only use this on
  # data from a trusted source.
  def unmarshal
    Marshal.load(self)
  end

  # the field names for this stream: the configured :headers option, or else the
  # next line of the stream parsed as a record
  # the result is remembered, so the header line is consumed only once
  # NOTE(review): the original called an undefined `behead` here; reading one line
  # is presumably what it was meant to do - confirm.
  def headers
    @import_headers ||= import_options[:headers] || parseline(readline)
  end

  # split one line into fields according to the configured :format
  # (:tsv, :qcq for fields separated by `","`, anything else is parsed as CSV)
  def split(line)
    case import_options[:format]
    when :tsv
      line.split("\t")
    when :qcq
      line.split('","')
    else # default is CSV
      line.parse_csv
    end
  end

  # split a raw line (line terminator removed first) into fields
  def parseline(line)
    split(line.chomp)
  end

  # the import settings for this stream, created with empty defaults on first use
  def import_options
    @import_options ||= {
      junkwords: [],
      datefields: {}
    }
  end

  # merge the given settings into this stream's import options
  def configure_import(options)
    import_options.merge!(options)
  end

  # returns an Enumerator that yields one cleansed hash per remaining line of the stream
  # field names come from opts[:headers], or else from `headers` (which reads the
  # header line unless that already happened)
  # opts are merged into the import options (see `configure_import`)
  def import(opts = {})
    configure_import(opts)
    field_names = opts[:headers] || headers
    # warn "HEADERS ARE #{field_names}"
    ::Enumerator.new do |yielder|
      each do |line|
        # a line without any fields (CSV parsing gives nil) becomes an all-nil record
        rec = ::Hash[field_names.zip(Array(parseline(line)))]
        rec.extend DataTools::Hash
        yielder.yield rec.cleanse(import_options)
      end
      # need to emit anything to trigger a file-completed action? (such as pushing a batch to storage)
    end
  end
end

# make the import helpers available on every IO (and therefore every File)
class IO
  include DataTools::IO
end
@@ -0,0 +1,5 @@
1
class Object
  # run this object through the DataTools::Conversions method named by `rule`
  # nil and false are handed back as they are, without calling the conversion
  def vconvert(rule)
    return self unless self

    DataTools::Conversions.method(rule).call(self)
  end
end
@@ -0,0 +1,39 @@
1
+ # intent is for classes with array-of-hash behavior to `include` this module, or for instances to `extend` it
2
+
3
module DataTools::Rules
  # run every rule against every record, storing each result in the record
  # args[:rules] is a list of hashes with
  #   :input  - a key, or an array of keys, to read from the record
  #   :output - the key to store the result under
  #   :rule   - a callable, or a Symbol naming a DataTools::Conversions method
  # with an array :input the rule receives the array of values; with a single
  # :input the rule is skipped when the record has no (nil) value for it
  # raises if :rules is missing; a failing rule is reported on STDERR and re-raised
  # returns whatever `each` returns
  def enhance!(args)
    raise "missing :rules" unless args[:rules]
    each do |rec|
      args[:rules].each { |rule| runrule(rule, rec) }
    end
  end

  private

  # apply one rule to one record (see #enhance!)
  def runrule(rule, data)
    code = code_for(rule[:rule])

    case rule[:input]
    when ::Array
      data[rule[:output]] = code.call(data.values_at(*rule[:input]))
    else
      data[rule[:output]] = code.call(data[rule[:input]]) unless data[rule[:input]].nil?
    end
  rescue StandardError => e
    # only ordinary errors are reported here; signals and exit requests pass straight through
    STDERR.puts "RULE #{rule[:rule]} FAILED: #{e.to_s} WITH INPUTS #{data.values_at(*rule[:input]).inspect}"
    raise
  end

  # turn a rule specification into something callable
  def code_for(rule)
    case rule
    when ::Symbol
      DataTools::Conversions.method(rule)
    else
      rule
    end
  end
end
@@ -0,0 +1,14 @@
1
class String
  # identifying keys (strings) that represent hierarchical structures, with format "superkey:subkey"
  # gives the position of the first colon, or nil when there is none
  def splitkey?
    index(":")
  end

  # we always interpret the first part as a symbol
  def superkey
    head = split(":", 2).first
    head.to_sym
  end

  # for STRINGS we always interpret the last part as a string ("resource:name" translates to :resource => name)
  def subkey
    split(":", 2).last
  end
end
@@ -0,0 +1,14 @@
1
class Symbol
  # identifying keys (symbols) that represent hierarchical structures, with format :"superkey:subkey"
  # gives the position of the first colon in the name, or nil when there is none
  def splitkey?
    to_s.index(":")
  end

  # we always interpret the first part as a symbol
  def superkey
    head = to_s.split(":", 2).first
    head.to_sym
  end

  # for SYMBOLS we always interpret the last part as a symbol (:"resource:id" translates to :resource => :id)
  def subkey
    tail = to_s.split(":", 2).last
    tail.to_sym
  end
end
@@ -0,0 +1,51 @@
1
module DataTools::Transformations
  # unraveling the hierarchical group membership structure in Microsoft Active Directory
  # expand the group information from MSAD "memberOf" fields
  # flatten the hierarchy, so each account records every group of which it is a member, even through sub-groups
  #
  # hashes must respond to `key_on` (see DataTools::ArrayOfHashes); every record gets a
  # :memberof list holding the records of its groups
  # progress is reported on $stderr
  # returns the same collection, with the records updated in place
  def self.expand_msad_groups(hashes)
    $stderr.puts "Analyzing #{hashes.size} Active Directory records"
    msad_accounts_by_dn = hashes.key_on('DN')
    $stderr.puts "Found #{msad_accounts_by_dn.size} distinct DN values"

    # expand the multi-valued memberOf field, and look up each group
    # WARNING: does not report any cases if the DN for the group does not appear in the hashes, will just leave a nil in the list
    hashes.each do |hash|
      hash[:memberof] = (hash['memberOf'] || '').split(';').map { |dn| msad_accounts_by_dn[dn] }
    end
    $stderr.puts "Expanded groups on #{hashes.select {|h| h[:memberof].any?}.size} records"

    count_memberships = lambda { hashes.map { |h| h[:memberof].size }.inject(0, :+) }
    membership_counts = count_memberships.call

    loop do
      $stderr.puts "Found #{membership_counts} memberships, moving up membership hierarchy..."
      base_membership_counts = membership_counts
      hashes.each do |hash|
        # unknown groups were left as nil above and have no memberships to inherit
        inherited = hash[:memberof].compact.map { |g| g[:memberof] }.flatten(1).uniq
        hash[:memberof] |= inherited
      end
      membership_counts = count_memberships.call
      # repeat until no further memberships are found
      break if membership_counts == base_membership_counts
    end

    hashes
  end

  # superseded by rules.rb
  # def self.enhance(args)
  #   h = args[:hash]
  #   args[:rules].each do |rule|
  #     self.runrule(rule, h)
  #   end
  #   h
  # end
  #
  # def self.runrule(rule, data)
  #   begin
  #     if rule[:input].is_a?(Array)
  #       data[rule[:output]] = data.values_at(*rule[:input]).vconvert(rule[:rule])
  #     else
  #       data[rule[:output]] = data[rule[:input]].vconvert(rule[:rule])
  #     end
  #   rescue Exception => e
  #     STDERR.puts "RULE #{rule[:rule]} FAILED: #{e.to_s} WITH INPUTS #{data.values_at(*rule[:input]).inspect}"
  #     exit
  #   end
  # end
end
@@ -0,0 +1,3 @@
1
module DataTools
  # version string of the gem
  VERSION = "0.6.0"
end
@@ -0,0 +1,31 @@
1
+ require_relative "spec_helper"
2
+
3
describe "DataTools Array extensions" do
  # a fresh two-record fixture for every example
  before(:each) do
    @a = [
      {:name => "bob", :city => "sunnyvale"},
      {:name => "phil", :city => "mountain view"}
    ]
  end

  # placeholder example
  it "can do gymnastics" do
    3.should == 3
  end

  it "can handle rules" do
    @a.extend DataTools::Rules
    upcase_name = {:input => :name, :output => :upname, :rule => lambda {|x| x.upcase}}
    reverse_city = {:input => :city, :output => :ytic, :rule => lambda {|x| x.reverse}}

    @a.enhance!(:rules => [upcase_name, reverse_city])

    expected = [
      {:name => "bob", :city => "sunnyvale", :upname => "BOB", :ytic => "elavynnus"},
      {:name => "phil", :city => "mountain view", :upname => "PHIL", :ytic => "weiv niatnuom"}
    ]
    @a.should == expected
  end

  # pending "isn't ready yet" do
  #   4.should == 5
  # end
end
@@ -0,0 +1,59 @@
1
+ require_relative "spec_helper"
2
+
3
describe "Comparator" do
  # break a name into its upper-cased components, sorted
  def explode(name)
    name.gsub(/[,._-]/, ' ').split.map(&:upcase).sort
  end

  # the rule must accept the pair of names
  def try(rule, name1, name2)
    @comp.send(rule, explode(name1), explode(name2)).should be_true
  end

  # the rule must reject the pair of names
  def bust(rule, name1, name2)
    @comp.send(rule, explode(name1), explode(name2)).should be_false
  end

  before :all do
    corpus = [
      "michael g palmer",
      "francis l palmer",
      "michael palmer"
    ].map {|name| explode(name)}
    @comp = Comparator.new(corpus)
  end

  it "finds names that match without initials" do
    ["michael g palmer", "Q michael palmer", "Michael N Palmer x"].each do |other|
      try(:missing_initials, "michael palmer", other)
    end
    ["Michael P", "Michael John Palmer"].each do |other|
      bust(:missing_initials, "michael palmer", other)
    end

    @comp.crunch(explode("michael palmer")).should == [explode("michael g palmer")]
    @comp.crunch(explode("palmer michael")).should == [explode("michael g palmer")]
    @comp.crunch(explode("michael g palmer")).should == [explode("michael palmer")]
  end

  it "finds names that match initials to names" do
    [
      ["fred jones", "f jones"],
      ["fred jones", "jones f"],
      ["fred jones", "fred j"],
      ["fred xavier jones", "fred x jones"],
      ["fred xavier jones", "xavier jones f"]
    ].each {|name1, name2| try(:matching_initials, name1, name2)}

    [
      ["fred xavier jones", "fred jones"],
      ["fred xavier jones", "fred q jones"],
      ["fred x jones", "fred q jones"],
      ["fred xavier jones", "homer simpson"]
    ].each {|name1, name2| bust(:matching_initials, name1, name2)}
  end

  it "finds names that match on all but one long names" do
    try(:matching_all_but_one, "john philip sousa", "john sousa")
    try(:matching_all_but_one, "philip sousa", "philip john sousa")
    bust(:matching_all_but_one, "john philip sousa", "philip john sousa")
    try(:matching_all_but_one, "Helen Q. Glorpworth-Smythe", "helen smythe")
  end
end
@@ -0,0 +1,18 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "Hash of Arrays" do # spec for the DataTools::HashOfArrays mixin
4
+ before(:each) do # rebuilds the fixture for every example
5
+ @hoa = {
6
+ "one" => ["a", "b", "c"],
7
+ "two" => ["d", "e"],
8
+ "three" => ["f"]
9
+ }
10
+ @hoa.extend DataTools::HashOfArrays # mixes the module into this one Hash instance only
11
+ end
12
+
13
+ it "coalesces" do
14
+ @hoa.coalesce("one", :into => "three")
15
+ @hoa.size.should == 2 # fixture started with three keys
16
+ @hoa["three"].should == ["f", "a", "b", "c"] # values from "one" follow the existing "three" values
17
+ end
18
+ end
@@ -0,0 +1,15 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ require "json"
4
+
5
+ describe "File Import" do # smoke test for File#import combined with ArrayOfHashes#pluck
6
+ it "imports" do # NOTE(review): no expectation is asserted here; the example only prints output
7
+ f = File.open(File.dirname(__FILE__) + "/../hrhead.csv") # NOTE(review): this handle is never closed within the example
8
+ # sio = f.import.csvme(StringIO.new, ['Person Phone GUID', 'Person Address GUID'])
9
+ # puts sio.string
10
+
11
+ f.import.each_slice(3) do |slice| # processes the imported records three at a time
12
+ puts slice.extend(DataTools::ArrayOfHashes).pluck('Person Phone GUID', 'Person Address GUID').to_json # prints the two plucked columns of each slice as JSON
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,5 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) # put this file's own directory at the front of the load path
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) # then the sibling lib directory, which ends up first
3
+ require 'awesome_print'
4
+
5
+ require "data_tools" # loads the library under test
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_tools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.6.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason May
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: awesome_print
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: facets
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: Data-munging utilities, including extensions to Array, Hash, String,
63
+ Symbol plus data conversions and transformations.
64
+ email: jmay@pobox.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - Gemfile
71
+ - Gemfile.lock
72
+ - README.md
73
+ - Rakefile
74
+ - data_tools.gemspec
75
+ - lib/data_tools.rb
76
+ - lib/data_tools/array.rb
77
+ - lib/data_tools/array_of_hashes.rb
78
+ - lib/data_tools/comparator.rb
79
+ - lib/data_tools/conversions.rb
80
+ - lib/data_tools/enumerator.rb
81
+ - lib/data_tools/file.rb
82
+ - lib/data_tools/hash.rb
83
+ - lib/data_tools/hash_of_arrays.rb
84
+ - lib/data_tools/io.rb
85
+ - lib/data_tools/object.rb
86
+ - lib/data_tools/rules.rb
87
+ - lib/data_tools/string.rb
88
+ - lib/data_tools/symbol.rb
89
+ - lib/data_tools/transformations.rb
90
+ - lib/data_tools/version.rb
91
+ - spec/array_spec.rb
92
+ - spec/comparator_spec.rb
93
+ - spec/hash_of_arrays_spec.rb
94
+ - spec/import_spec.rb
95
+ - spec/spec_helper.rb
96
+ homepage: http://github.com/jmay/data_tools
97
+ licenses: []
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ! '>='
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ required_rubygems_version: !ruby/object:Gem::Requirement
109
+ none: false
110
+ requirements:
111
+ - - ! '>='
112
+ - !ruby/object:Gem::Version
113
+ version: '0'
114
+ requirements: []
115
+ rubyforge_project: data_tools
116
+ rubygems_version: 1.8.24
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Miscellaneous data-munging utilities.
120
+ test_files: []