data_tools 0.6.0

data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ .irb_history
+ .bundle
+ pkg
+ todo.txt
+ .DS_Store
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source :rubygems
+
+ # Specify your gem's dependencies in data_tools.gemspec
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,28 @@
+ PATH
+   remote: .
+   specs:
+     data_tools (0.6.0)
+       awesome_print
+       facets
+
+ GEM
+   remote: http://rubygems.org/
+   specs:
+     awesome_print (1.1.0)
+     diff-lcs (1.1.3)
+     facets (2.9.3)
+     rspec (2.12.0)
+       rspec-core (~> 2.12.0)
+       rspec-expectations (~> 2.12.0)
+       rspec-mocks (~> 2.12.0)
+     rspec-core (2.12.2)
+     rspec-expectations (2.12.1)
+       diff-lcs (~> 1.1.3)
+     rspec-mocks (2.12.1)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   data_tools!
+   rspec
data/README.md ADDED
@@ -0,0 +1,7 @@
+ # README for data_tools
+
+ Miscellaneous data-munging utility functions.
+
+ ## Array
+
+ This is really an Array-of-Hashes.
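A minimal usage sketch of the Array-of-Hashes helpers (the `people` records here are invented for illustration; the specs further down show more):

    require 'data_tools'

    people = [
      {:name => "bob",  :city => "sunnyvale"},
      {:name => "phil", :city => "mountain view"}
    ]
    people.extend DataTools::ArrayOfHashes

    people.key_on(:name)           # => {"bob" => {...}, "phil" => {...}}
    people.where(:city => /view/)  # => [{:name => "phil", :city => "mountain view"}]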
data/Rakefile ADDED
@@ -0,0 +1,8 @@
+ require 'bundler'
+ Bundler::GemHelper.install_tasks
+
+ require "rake"
+
+ require "rspec/core/rake_task"
+ desc "Run all RSpec tests"
+ RSpec::Core::RakeTask.new(:spec)
data/data_tools.gemspec ADDED
@@ -0,0 +1,26 @@
+ # -*- encoding: utf-8 -*-
+ $LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
+ require 'data_tools/version'
+
+ Gem::Specification.new do |s|
+   s.name = "data_tools"
+   s.version = DataTools::VERSION
+   s.platform = Gem::Platform::RUBY
+   s.authors = ["Jason May"]
+   s.email = %q{jmay@pobox.com}
+   s.homepage = "http://github.com/jmay/data_tools"
+   s.summary = %q{Miscellaneous data-munging utilities.}
+   s.description = %q{Data-munging utilities, including extensions to Array, Hash, String, Symbol plus data conversions and transformations.}
+
+   # s.required_rubygems_version = ">= 1.3.6"
+   s.rubyforge_project = "data_tools"
+
+   s.add_dependency 'awesome_print'
+   s.add_dependency 'facets'
+
+   s.add_development_dependency "rspec"
+
+   s.files = `git ls-files`.split("\n")
+   s.executables = `git ls-files`.split("\n").select{|f| f =~ /^bin/}
+   s.require_path = 'lib'
+ end
data/lib/data_tools.rb ADDED
@@ -0,0 +1,42 @@
+ require "ap"
+ require "set"
+ require "csv"
+ require "facets" # for Hash#delete_values
+
+ module DataTools
+   def self.reload!
+     $".grep(/data_tools/).each {|f| load(f)}
+   end
+
+   def DataTools.scour(s, opts)
+     case s
+     when nil
+       nil
+     when String
+       s2 = s.strip.gsub(/\s+/, ' ').gsub(/^"/, '').gsub(/"$/, '')
+       if s2 =~ /^[\d]+(\.[\d]+){0,1}$/
+         # looks numeric
+         s2 = s2.to_i.to_s
+       end
+       (s2.empty? || opts[:junkwords].include?(s2)) ? nil : s2
+     when Numeric
+       s.to_s
+     else
+       s.to_s
+     end
+   end
+ end
+
+ [
+   "version",
+   "array", "hash",
+   "array_of_hashes", "hash_of_arrays",
+   "enumerator",
+   "comparator",
+   "object", "string", "symbol",
+   "file", "io",
+   "rules",
+   "conversions", "transformations"
+ ].each do |file|
+   require File.dirname(__FILE__) + "/data_tools/#{file}"
+ end
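A quick sketch of how `DataTools.scour` behaves, per the implementation above (inputs invented):

    DataTools.scour('  "3.14"  ', :junkwords => [])   # => "3"  (quotes/whitespace stripped; numeric-looking strings collapse to integers)
    DataTools.scour('N/A', :junkwords => ['N/A'])      # => nil  (junk words and empty strings become nil)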
data/lib/data_tools/array.rb ADDED
@@ -0,0 +1,42 @@
+ module DataTools::Array
+   # turns an array-of-arrays into an array-of-hashes
+   # the headers are used as names for the fields
+   # OK for rows to have fewer fields than the header record, but must not be longer
+   def hashify(headers = shift)
+     # ignore leading/trailing whitespace in header labels
+     headers.each {|hdr| hdr.strip! if hdr === String}
+     select {|row| row.any?}.map do |row|
+       raise "Row count mismatch: #{row}" if row.count > headers.count
+       hash = {}
+       row.zip(headers) do |v,k|
+         # ignore any keys with missing values
+         # remove leading/trailing whitespace from values
+         hash[k] = v.strip unless v.blank?
+       end
+       hash
+     end
+   end
+
+   # ARRAY OF SCALARS
+   # apply an operation (block) to every member of the array
+   # return the list of unique results
+   # if there is just one result, convert to a scalar value
+   def resolve(&block)
+     values = map {|v| block.call(v)}.uniq
+     values.count <= 1 ? values.first : values
+   end
+
+   # marshal (ruby-specific binary format) the contents of this structure to a file
+   # fails if file exists
+   def dumpme(filename)
+     raise "#{filename} exists" if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+   # same as #dumpme but overwrites existing file
+   def dumpme!(filename)
+     File.unlink(filename) if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+
+
+ end
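A minimal sketch of `hashify` on an array-of-arrays (data invented; blank values are dropped via the `blank?` extension that comes in through facets):

    rows = [["name", "city"], ["bob", "sunnyvale"], ["phil", ""]]
    rows.extend(DataTools::Array)
    rows.hashify  # => [{"name"=>"bob", "city"=>"sunnyvale"}, {"name"=>"phil"}]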
data/lib/data_tools/array_of_hashes.rb ADDED
@@ -0,0 +1,219 @@
+ module DataTools::ArrayOfHashes
+   # convert an array of hashes to a hash of the same hashes
+   # where the key values are picked from the hashes
+   # the keys can be single fields, or an array, or a list
+   # options:
+   # :multi (boolean, default false): if true, allow multiple values per key; store values as an array for each key
+   # :first (boolean, default false): if true, when finding multiple values per key, store only the first and ignore the rest
+   # :truncate (integer): see `Hash#key_for`
+   #
+   def key_on(*keyarray)
+     raise "Key(s) required" if keyarray.empty?
+     opts = keyarray.last.is_a?(Hash) ? keyarray.pop : {}
+     keyarray = keyarray.flatten
+
+     memo = opts[:multi] ? Hash.new {|h,k| h[k] = []} : Hash.new
+     each do |hash|
+       this_key = hash.key_for(keyarray, opts)
+       raise "Missing value for #{keyarray} in record #{hash}" if this_key.nil?
+       if opts[:multi]
+         memo[this_key] << hash
+       elsif opts[:first]
+         # ignore this value if we already have one for this key
+         if !memo.has_key?(this_key)
+           memo[this_key] = hash
+         end
+       else
+         raise "Found duplicate #{keyarray} in #{memo[this_key]} vs #{hash}" if memo.has_key?(this_key)
+         memo[this_key] = hash
+       end
+       memo
+     end
+     memo.extend DataTools::HashOfArrays
+     memo.default = nil
+     memo
+   end
+
+   # shorthand for `Array#select {|hash| hash[...] && hash[...] ...}`
+   # find all the members of the array where all the specified criteria are true
+   def where(conditions)
+     newhash = case conditions
+     when Hash
+       select do |record|
+         conditions.map do |k,v|
+           case v
+           when Regexp
+             record[k] =~ v
+           when TrueClass
+             !record[k].nil?
+           when FalseClass
+             record[k].nil?
+           else
+             record[k] == v
+           end
+         end.reduce(:&) # all tests must pass
+       end
+     when String,Symbol
+       # just check for presence & non-nil value of specified key
+       select {|record| record[conditions]}
+     end
+     newhash.extend DataTools::ArrayOfHashes
+   end
+
+   # are all the values for `key` defined and unique?
+   def unique?(*keyarray)
+     raise "Key(s) required" if keyarray.empty?
+     keyarray = keyarray.flatten
+     keys = map {|hash| hash.key_for(keyarray)}
+     return false if keys.any?(&:nil?)
+     keys.uniq.count == self.count
+   end
+
+   def unique_values_for(*keyarray)
+     raise "Key(s) required" if keyarray.empty?
+     keyarray = keyarray.flatten
+     map {|hash| hash.key_for(keyarray)}.to_set
+   end
+
+   # assign unique IDs to every hash in the array
+   # argument is the name of the field to use for the generated sequential key
+   def count_off!(key = :key, start = 0)
+     raise "Values exist for [#{key}]" if any?{|h| h[key]}
+     each_with_index do |hash, i|
+       hash[key] = i + start
+     end
+     self
+   end
+
+   def redundant(*keyarray)
+     key_on(keyarray, :multi => true).select {|k,v| v.count > 1}
+   end
+
+   # combine a set of hashes into one
+   # for each key, find all the distinct values from all the hashes
+   # if there's one unique value, store the single value in key of the result
+   # if there are multiple values, store them all as an array
+   def coalesce
+     allkeys = map {|h| h.keys}.flatten.uniq
+     allkeys.reduce({}) do |memo,key|
+       memo[key] = map {|h| h[key]}.compact.uniq
+       memo[key] = memo[key].first if memo[key].count <= 1
+       memo
+     end
+   end
+
+   # apply the same resolution operation to every hash in the list
+   def resolve_all(key, &block)
+     map do |hash|
+       hash = hash.dup
+       hash[key] = hash[key].resolve(&block)
+       hash
+     end
+   end
+
+   # marshal (ruby-specific binary format) the contents of this structure to a file
+   # fails if file exists
+   def dumpme(filename)
+     raise "#{filename} exists" if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+   # same as #dumpme but overwrites existing file
+   def dumpme!(filename)
+     File.unlink(filename) if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+
+   # attempt to dump out contents of this array-of-hashes as CSV to named file
+   # fields is list of attribute names to write out
+   # options headers is public names for the fields
+   def csvme(filename, fields, headers = fields)
+     CSV.open(filename, "wb") do |csv|
+       csv << headers unless headers.nil?
+       pluck(fields).each do |ary|
+         csv << ary
+       end
+     end
+     true
+   end
+
+   def tsvme(filename, fields, headers = fields)
+     File.open(target) do |output|
+       output.puts headers.join("\t")
+       pluck(fields).each do |ary|
+         output.puts ary.join("\t")
+       end
+     end
+     true
+   end
+
+   # What different keys appear in this collection of hashes?
+   def allkeys
+     each_with_object({}) do |h, memo|
+       h.keys.each {|k| memo[k] += 1}
+     end.keys
+   end
+
+   def metrics
+     allkeys.reduce({}) do |m,k|
+       values = self.map {|h| h[k]}
+       m[k] = {
+         :non_nil => values.compact.count,
+         :nil => values.count - values.compact.count,
+         :unique => values.uniq.count
+       }
+       if m[k][:unique] <= 10
+         m[k][:values] = histogram(k)
+       end
+       m
+     end
+   end
+
+   def numify!(*keyarray)
+     each {|h| h.numify!(*keyarray)}
+   end
+
+   def nilify!(keyvalue)
+     each {|h| h.nilify!(keyvalue)}
+   end
+
+   # return histogram of value distribution for the specified key: hash of value/count pairs
+   def histogram(*args, &block)
+     reduce(Hash.new(0)) do |hist, h|
+       if block_given?
+         v = yield(h)
+       else
+         v = h[args.first]
+       end
+       hist[v] += 1
+       hist
+     end
+   end
+
+   # hash slice for all the named attributes from each hashes in the array
+   def subset(*keys)
+     keys = keys.flatten
+     map {|h| h.subset(keys) }
+   end
+
+   # pull out all the named attributes from the hashes in the array (into array-of-arrays)
+   def pluck(*keys)
+     keys = keys.flatten
+     map {|h| h.pluck(keys)}
+     # if keys.count > 1
+     # map {|h| keys.map {|k| h[k]}}
+     # else
+     # map {|h| h[keys.first]}
+     # end
+   end
+
+   # For each record, output a subset of the values as an array (suitable for passing to `#to_csv`)
+   # supports hierarchical subkeys (e.g. :master:id or "master:name")
+   def project(args)
+     defaults = args[:defaults] || {}
+     map do |h|
+       args[:keys].map do |k|
+         (k.splitkey? && (deref = h[k.superkey]) && deref[k.subkey]) || h[k] || defaults[k] || args[:nilvalue]
+       end
+     end
+   end
+ end
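A few more Array-of-Hashes helpers in action, again on invented records:

    rows = [
      {:dept => "eng"}, {:dept => "eng"}, {:dept => "sales"}
    ].extend(DataTools::ArrayOfHashes)

    rows.histogram(:dept)    # => {"eng" => 2, "sales" => 1}
    rows.count_off!(:id, 1)  # adds :id => 1, 2, 3 to the records
    rows.unique?(:id)        # => true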
data/lib/data_tools/comparator.rb ADDED
@@ -0,0 +1,85 @@
+ # MULTI-MATCHING via components
+ # go through all users
+ # group by distinct sets of components
+ # pick a (small) subset of component-keys, say <10. Maybe random sample?
+ # build a set of matching rules
+ # run the subset * the full corpus * the matching rules
+
+ class Comparator
+   attr_reader :corpus
+
+   def initialize(corpus)
+     @corpus = corpus
+
+     prep_missing_initials
+   end
+
+   def crunch(record)
+     (@corpus - [record]).each_with_object([]) do |candidate,matches|
+       if evaluate(record, candidate)
+         matches << candidate
+       end
+     end
+   end
+
+   def evaluate(record, candidate)
+     [:missing_initials].each do |rule|
+       return true if send(rule, record, candidate)
+     end
+     false
+   end
+
+   # don't need an 'identical' test - assuming that the input record does not appear in the corpus
+   # def identical(a,b)
+   # a == b
+   # end
+
+   # must have at least 2 long (non-initial-only) components in each
+   # those long parts must be identical
+   # only one of the names can have any initials
+   def missing_initials(a,b)
+     longnames_a = a.select {|s| s.length > 1}
+     longnames_b = b.select {|s| s.length > 1}
+     inits_a = a.select {|s| s.length == 1}
+     inits_b = b.select {|s| s.length == 1}
+
+     longnames_a.count >= 2 && longnames_b.count >= 2 && longnames_a == longnames_b && (inits_a.empty? || inits_b.empty?)
+   end
+
+   def prep_missing_initials
+     @corpus_missing_initials = corpus.each_with_object(Set.new) do |rec,set|
+       without_initials = rec.select {|s| s.length > 1}
+       if without_initials.count >= 2
+         set << without_initials
+       end
+     end
+   end
+
+   # must have at least 1 long (non-initial-only) component in each
+   # those long parts must be identical
+   # all initials should correspond to non-matched longnames in the other input
+   def matching_initials(a,b)
+     longnames_a = a.select {|s| s.length > 1}
+     longnames_b = b.select {|s| s.length > 1}
+     inits_a = a.select {|s| s.length == 1}
+     inits_b = b.select {|s| s.length == 1}
+
+     return false unless longnames_a.count >= 1 && longnames_b.count >= 1
+
+     unmatched_longnames_a = longnames_a - longnames_b
+     unmatched_longnames_b = longnames_b - longnames_a
+     unmatched_inits_a = unmatched_longnames_a.map {|s| s[0]}
+     unmatched_inits_b = unmatched_longnames_b.map {|s| s[0]}
+
+     inits_a == unmatched_inits_b && inits_b == unmatched_inits_a
+   end
+
+   # ignore any initials. look for cases where there is exactly one name component that differs between the inputs.
+   def matching_all_but_one(a,b)
+     longnames_a = a.select {|s| s.length > 1}
+     longnames_b = b.select {|s| s.length > 1}
+
+     ((longnames_a | longnames_b) - (longnames_a & longnames_b)).count == 1
+   end
+
+ end
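A rough sketch of the Comparator flow, with names exploded into sorted, upcased components as in the spec further down (corpus invented):

    corpus = ["michael g palmer", "michael palmer"].map {|n| n.split.map(&:upcase).sort}
    comp = Comparator.new(corpus)
    comp.crunch(["MICHAEL", "PALMER"])  # => [["G", "MICHAEL", "PALMER"]]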
data/lib/data_tools/conversions.rb ADDED
@@ -0,0 +1,46 @@
+ module DataTools::Conversions
+   def self.noop(value)
+     value
+   end
+
+   # MSAD uses INT64 (8 bytes) for lastLogon, lastLogonTimestamp, accountExpires
+   def self.msad_long_timestamp(value)
+     case value.to_i
+     when 0, 0x7FFFFFFFFFFFFFFF
+       nil
+     else
+       DateTime.new(1601, 1, 1) + value.to_i/(60.0 * 10000000 * 1440)
+     end
+   end
+
+   def self.readable_timestamp(value)
+     DateTime.parse(value)
+   end
+
+   def self.first_ou(value)
+     (ou = value.split(',').select{|s| s =~ /^OU=/}.first) && ou.split('=').last
+   end
+
+   def self.second_ou(value)
+     (ou = value.split(',').select{|s| s =~ /^OU=/}[1]) && ou.split('=').last
+   end
+
+   def self.msad_active_account(value)
+     value.to_i & 2 == 0
+   end
+
+   def self.datestr(value)
+     value.strftime("%m/%d/%Y")
+   end
+
+   def self.max_datestr(values)
+     (dt = values.compact.max) && dt.strftime("%m/%d/%Y")
+   end
+
+   # def self.difference_in_days(start_ts, end_ts1, end_ts2 = nil)
+   def self.difference_in_days(args)
+     start_ts, end_ts1, end_ts2 = *args
+     end_ts = end_ts1 || end_ts2
+     end_ts && start_ts && (end_ts.to_date - start_ts.to_date).to_i
+   end
+ end
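Sample conversions, per the code above (input strings invented):

    DataTools::Conversions.msad_long_timestamp("0")                    # => nil (MSAD's "never" sentinel)
    DataTools::Conversions.first_ou("CN=jdoe,OU=Staff,OU=US,DC=corp")  # => "Staff"
    DataTools::Conversions.msad_active_account("514")                  # => false (the disable bit is set)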
data/lib/data_tools/enumerator.rb ADDED
@@ -0,0 +1,13 @@
+ module DataTools::Enumerator
+   def csvme(outputstream, fields, headers = fields)
+     outputstream.puts headers.to_csv
+     each do |hash|
+       outputstream.puts hash.pluck(fields).to_csv
+     end
+     outputstream
+   end
+ end
+
+ class Enumerator
+   include DataTools::Enumerator
+ end
data/lib/data_tools/file.rb ADDED
@@ -0,0 +1,5 @@
+ class File
+   def self.[](filename)
+     open(filename)
+   end
+ end
data/lib/data_tools/hash.rb ADDED
@@ -0,0 +1,226 @@
+ module DataTools::Hash
+   # construct a hash of changes needed to convert from an original hash to the new set of values
+   # keys in the original that do not appear in the new hash should appear in the diff with nil values
+   # EXCEPT that *symbol* keys from the original that *do not appear* (a nil value means it still appears) in the new hash should be ignored
+   def diffs_from(orig)
+     (self.keys | orig.keys).inject({}) do |diffs,key|
+       if key.is_a?(Symbol) && !self.include?(key)
+         # ignore this
+       elsif orig[key] != self[key]
+         diffs[key] = self[key]
+       end
+       diffs
+     end
+   end
+
+   # construct a key field for the has based on the list of fields provided
+   # options:
+   # :strip (true/false, default = true): remove leading & trailing whitespace from each value
+   # :truncate (integer): set maximum length for each value; truncate BEFORE stripping
+   def key_for(keyarray, opts = {})
+     opts[:strip] = true unless opts.has_key?(:strip)
+     meth = lambda do |k|
+       v = self[k]
+       v = v[0,opts[:truncate]] if opts[:truncate]
+       v = v.strip if opts[:strip] && v.is_a?(String)
+       v
+     end
+     this_key = keyarray.map(&meth) #{|k| self[k].strip}
+     return nil if this_key.all? {|v| v.nil?}
+     return this_key.first if this_key.count == 1 # turn single-field keys into single values, not arrays
+     if opts[:delim]
+       this_key.join(opts[:delim])
+     else
+       this_key
+     end
+   end
+
+   # for a Hash where all the values are Arrays
+   # hash2 should also be a hash of key/array pairs
+   # find all the cases where keys appear in both source hashes
+   def pair_off(hash2)
+     pairs = {}
+     each do |k,ary|
+       if hash2[k] && hash2[k].any?
+         pairs[k] = [ary, hash2[k]]
+       end
+     end
+     pairs
+   end
+
+   # same as `pair_off`, except that it chooses the partner key by calling a block
+   # rather than doing a strict comparison
+   def pair_off_by(hash2, &block)
+     pairs = {}
+     each do |k,ary|
+       k2 = block.call(k)
+       if hash2[k2] && hash2[k2].any?
+         pairs[k] = [ary, hash2[k2]]
+       end
+     end
+     pairs
+   end
+
+   # destructive version of `#pair_off` above.
+   # when matching keys are found, the keys are removed from both source hashes.
+   def pair_off!(hash2)
+     pairs = {}
+     each do |k,ary|
+       if hash2[k].any?
+         pairs[k] = [ary, hash2[k]]
+         delete(k)
+         hash2.delete(k)
+       end
+     end
+     pairs
+   end
+
+   def pair_off_by!(hash2, &block)
+     pairs = {}
+     each do |k,ary|
+       k2 = block.call(k)
+       if hash2[k2] && hash2[k2].any?
+         pairs[k] = [ary, hash2[k2]]
+         delete(k)
+         hash2.delete(k2)
+       end
+     end
+     pairs
+   end
+
+   def dumpme(filename)
+     raise "#{filename} exists" if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+   def dumpme!(filename)
+     File.unlink(filename) if File.exists?(filename)
+     File.open(filename, "w") {|f| f << Marshal.dump(self)}
+   end
+
+   # HASH OF ARRAYS
+   def append(hash2)
+     (self.keys | hash2.keys).inject({}) {|h,k| h[k] = Array(self[k]) + Array(hash2[k]); h}
+   end
+
+   # HASH OF HASHES
+   # compare to another hash-of-hashes (aka changes, deltas, diffs)
+   # report the changes between a current state and a future state (hash2)
+   # each of the four sections (new elements, lost elements, unchanged elements, changes) is another hash-of-hashes
+   def compare(hash2)
+     newkeys = hash2.keys - self.keys
+     lostkeys = self.keys - hash2.keys
+     commonkeys = self.keys & hash2.keys
+
+     unchanged = []
+     changes = {}
+     commonkeys.each do |k|
+       if (diffs = hash2[k].diff(self[k])).any?
+         changes[k] = diffs
+       else
+         unchanged << k
+       end
+     end
+
+     {
+       :new => hash2.slice(*newkeys),
+       :lost => self.slice(*lostkeys),
+       :unchanged => self.slice(*unchanged),
+       :changes => changes
+     }
+   end
+
+   # convert specified fields to integers
+   def numify!(*keyarray)
+     keyarray.each do |k|
+       self[k] = self[k].to_i if self[k]
+     end
+     self
+   end
+
+   # ARRAY OF HASHES
+   # correlated(:with => correlation-hash, :by => key-field)
+   # pull subset that have mappings in the correlation hash
+   def correlated?(args = {})
+     with = args[:with]
+     through = args[:through]
+     onkey = args[:onkey]
+
+     my_keys = keys
+     correlation_keys = through.keys
+
+     mismatches = select do |k,h|
+       this_match = h[onkey]
+       should_match = through[k] && with[through[k]]
+       this_match != should_match
+     end
+     unmatched = correlation_keys - my_keys
+     mismatches | unmatched
+     # should be any empty array
+     # select {|h| args[:with][h.key_for(args[:by], :delim => nil)]}
+   end
+
+   # apply correlations
+   # correlate!(:with => hash2, :through => mapping-hash, :onkey => attribute-to-record-mapping-in)
+   # replaces any existing correlations (the `:on` field will be set to nil where the key does not appear in the correlation hash)
+   def correlate!(args = {})
+     with = args[:with]
+     through = args[:through]
+     onkey = args[:onkey]
+     raise "Missing argument" if args[:onkey].nil?
+     each do |k,h|
+       this_match = through[k] && with[through[k]]
+       h[onkey] = this_match
+     end
+   end
+
+   # remove all the keys that contain nil values (or specify a "nil" value for sources that fill in empty records with special nil placeholders)
+   def nilify!(nilvalue = nil)
+     each do |k,v|
+       self.delete(k) if v == nilvalue
+     end
+   end
+
+   # # HASH OF ARRAYS
+   # def coalesce!(args)
+   # rules = args[:per]
+   # rules.each do |from, to|
+   # if self[to].nil?
+   # raise "cannot merge #{from} into #{to}, destination does not exist"
+   # end
+   # if self[from].nil?
+   # $stderr.puts "cannot merge #{from} into #{to}, source does not exist, ignoring"
+   # next
+   # end
+   # self[to] += self[from]
+   # self.delete(from)
+   # end
+   # self
+   # end
+
+   def cleanse(options = {})
+     each_with_object({}) do |(k,v), out|
+       out[k] = DataTools.scour(v, options)
+       if dateformat = options[:datefields][k]
+         begin
+           out[k] = DateTime.strptime(v, dateformat).to_time
+         rescue
+           warn "invalid #{k} (expected #{dateformat}): #{rec}"
+         end
+       end
+     end
+   end
+
+   def subset(keys)
+     map do |h|
+       h.select {|k,v| keys.include? k}
+     end
+   end
+
+   def pluck(*keys)
+     keys.flatten.map {|k| self[k]}
+   end
+ end
+
+ class Hash
+   include DataTools::Hash
+ end
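A short sketch of two of the Hash extensions (hashes invented):

    current = {"id" => 1, "name" => "Bob", "city" => "Sunnyvale"}
    updated = {"id" => 1, "name" => "Robert"}

    updated.diffs_from(current)                     # => {"name" => "Robert", "city" => nil}
    current.key_for(["id", "name"], :delim => "|")  # => "1|Bob"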
data/lib/data_tools/hash_of_arrays.rb ADDED
@@ -0,0 +1,20 @@
+ # keys can be anything
+ # values are always arrays
+
+ module DataTools::HashOfArrays
+   def append(hash2)
+     (self.keys | hash2.keys).inject({}) {|h,k| h[k] = Array(self[k]) + Array(hash2[k]); h}
+   end
+
+   def coalesce(key1, args)
+     key2 = args[:into] or raise "usage: coalesce(key1, :into => key)"
+     self[key2] += self[key1]
+     delete(key1)
+   end
+
+   def choose
+     each_with_object({}) do |(key, values), result|
+       result[key] = yield values
+     end
+   end
+ end
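For example (invented data):

    hoa = {"one" => [3, 1, 2], "two" => [5]}.extend(DataTools::HashOfArrays)
    hoa.choose {|values| values.max}          # => {"one" => 3, "two" => 5}
    hoa.append("two" => [7], "three" => [9])  # => {"one" => [3, 1, 2], "two" => [5, 7], "three" => [9]}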
data/lib/data_tools/io.rb ADDED
@@ -0,0 +1,55 @@
+ require "csv"
+
+ module DataTools::IO
+   def unmarshal
+     Marshal.load(self)
+   end
+
+   def headers
+     @import_headers ||= @import_options[:headers] || behead
+   end
+
+   def split(line)
+     case import_options[:format]
+     when :tsv
+       line.split("\t")
+     when :qcq
+       line.split('","')
+     else # default is CSV
+       line.parse_csv
+     end
+   end
+
+   def parseline(line)
+     split(line.chomp)
+   end
+
+   def import_options
+     @import_options ||= {
+       junkwords: [],
+       datefields: {}
+     }
+   end
+
+   def configure_import(options)
+     import_options.merge!(options)
+   end
+
+   def import(opts = {}) # expects a block
+     configure_import(opts)
+     headers = opts[:headers] || parseline(readline)
+     # warn "HEADERS ARE #{headers}"
+     Enumerator.new do |yielder|
+       self.each do |line|
+         rec = Hash[headers.zip(parseline(line))]
+         rec.extend DataTools::Hash
+         yielder.yield rec.cleanse(import_options)
+       end
+       # need to emit anything to trigger a file-completed action? (such as pushing a batch to storage)
+     end
+   end
+ end
+
+ class IO
+   include DataTools::IO
+ end
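A sketch of the import flow; `people.csv` here is a hypothetical file with a header row:

    File.open("people.csv") do |f|
      f.import(:junkwords => ["N/A"]).each do |rec|
        p rec  # each rec is a Hash keyed by the header row, already run through cleanse/scour
      end
    end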
data/lib/data_tools/object.rb ADDED
@@ -0,0 +1,5 @@
+ class Object
+   def vconvert(rule)
+     self && DataTools::Conversions.method(rule).call(self)
+   end
+ end
data/lib/data_tools/rules.rb ADDED
@@ -0,0 +1,39 @@
+ # intent is for classes with array-of-hash behavior to `include` this module, or for instances to `extend` it
+
+ module DataTools::Rules
+   def enhance!(args)
+     raise "missing :rules" unless args[:rules]
+     each do |rec|
+       args[:rules].each do |rule|
+         runrule(rule, rec)
+       end
+     end
+   end
+
+   private
+
+   def runrule(rule, data)
+     begin
+       code = code_for(rule[:rule])
+
+       case rule[:input]
+       when Array
+         data[rule[:output]] = code.call(data.values_at(*rule[:input]))
+       else
+         data[rule[:output]] = code.call(data[rule[:input]]) unless data[rule[:input]].nil?
+       end
+     rescue Exception => e
+       STDERR.puts "RULE #{rule[:rule]} FAILED: #{e.to_s} WITH INPUTS #{data.values_at(*rule[:input]).inspect}"
+       raise
+     end
+   end
+
+   def code_for(rule)
+     case rule
+     when Symbol
+       DataTools::Conversions.method(rule)
+     else
+       rule
+     end
+   end
+ end
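The spec further down exercises lambda rules; a Symbol rule instead dispatches to `DataTools::Conversions`, roughly like this (record invented):

    rows = [{:dn => "CN=jdoe,OU=Staff,DC=corp"}].extend(DataTools::Rules)
    rows.enhance!(:rules => [{:input => :dn, :output => :ou, :rule => :first_ou}])
    rows.first[:ou]  # => "Staff"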
data/lib/data_tools/string.rb ADDED
@@ -0,0 +1,14 @@
+ class String
+   # identifying keys (strings) that represent hierarchical structures, with format "superkey:subkey"
+   def splitkey?
+     self =~ /:/
+   end
+   # we always interpret the first part as a symbol
+   def superkey
+     split(/:/, 2).first.to_sym
+   end
+   # for STRINGS we always interpret the last part as a string ("resource:name" translates to :resource => name)
+   def subkey
+     split(/:/, 2).last
+   end
+ end
data/lib/data_tools/symbol.rb ADDED
@@ -0,0 +1,14 @@
+ class Symbol
+   # identifying keys (strings) that represent hierarchical structures, with format :"superkey:subkey"
+   def splitkey?
+     to_s =~ /:/
+   end
+   # we always interpret the first part as a symbol
+   def superkey
+     to_s.split(/:/, 2).first.to_sym
+   end
+   # for SYMBOLS we always interpret the last part as a symbol (:"resource:id" translates to :resource => :id)
+   def subkey
+     to_s.split(/:/, 2).last.to_sym
+   end
+ end
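Both the String and Symbol variants back the `project` helper shown earlier, e.g.:

    :"master:id".superkey    # => :master
    :"master:id".subkey      # => :id
    "resource:name".subkey   # => "name"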
data/lib/data_tools/transformations.rb ADDED
@@ -0,0 +1,51 @@
+ module DataTools::Transformations
+   # unraveling the hierarchical group membership structure in Microsoft Active Directory
+   # expand the group information from MSAD "memberOf" fields
+   # flatten the hierarchy, so each account records every group of which it is a member, even through sub-groups
+   def self.expand_msad_groups(hashes)
+     $stderr.puts "Analyzing #{hashes.size} Active Directory records"
+     msad_accounts_by_dn = hashes.key_on('DN')
+     $stderr.puts "Found #{msad_accounts_by_dn.size} distinct DN values"
+
+     # expand the multi-valued memberOf field, and look up each group
+     # WARNING: does not report any cases if the DN for the group does not appear in the hashes, will just leave a nil in the list
+     hashes.each do |hash|
+       hash[:memberof] = (hash['memberOf'] || '').split(';').map {|dn| msad_accounts_by_dn[dn]}
+     end
+     $stderr.puts "Expanded groups on #{hashes.select {|h| h[:memberof].any?}.size} records"
+
+     membership_counts = hashes.map {|h| h[:memberof].size}.sum
+
+     begin
+       $stderr.puts "Found #{membership_counts} memberships, moving up membership hierarchy..."
+       base_membership_counts = membership_counts
+       hashes.each do |hash|
+         hash[:memberof] |= hash[:memberof].map {|g| g[:memberof]}.flatten.uniq
+       end
+       membership_counts = hashes.map {|h| h[:memberof].size}.sum
+       # repeat until no further memberships are found
+     end while membership_counts == base_membership_counts
+   end
+
+   # superseded by rules.rb
+   # def self.enhance(args)
+   # h = args[:hash]
+   # args[:rules].each do |rule|
+   # self.runrule(rule, h)
+   # end
+   # h
+   # end
+   #
+   # def self.runrule(rule, data)
+   # begin
+   # if rule[:input].is_a?(Array)
+   # data[rule[:output]] = data.values_at(*rule[:input]).vconvert(rule[:rule])
+   # else
+   # data[rule[:output]] = data[rule[:input]].vconvert(rule[:rule])
+   # end
+   # rescue Exception => e
+   # STDERR.puts "RULE #{rule[:rule]} FAILED: #{e.to_s} WITH INPUTS #{data.values_at(*rule[:input]).inspect}"
+   # exit
+   # end
+   # end
+ end
data/lib/data_tools/version.rb ADDED
@@ -0,0 +1,3 @@
+ module DataTools
+   VERSION = "0.6.0"
+ end
data/spec/array_spec.rb ADDED
@@ -0,0 +1,31 @@
+ require_relative "spec_helper"
+
+ describe "DataTools Array extensions" do
+   before(:each) do
+     @a = [
+       {:name => "bob", :city => "sunnyvale"},
+       {:name => "phil", :city => "mountain view"}
+     ]
+   end
+
+   it "can do gymnastics" do
+     3.should == 3
+   end
+
+   it "can handle rules" do
+     @a.extend DataTools::Rules
+     rules = [
+       {:input => :name, :output => :upname, :rule => lambda {|x| x.upcase}},
+       {:input => :city, :output => :ytic, :rule => lambda {|x| x.reverse}}
+     ]
+     @a.enhance!(:rules => rules)
+     @a.should == [
+       {:name => "bob", :city => "sunnyvale", :upname => "BOB", :ytic => "elavynnus"},
+       {:name => "phil", :city => "mountain view", :upname => "PHIL", :ytic => "weiv niatnuom"}
+     ]
+   end
+
+   # pending "isn't ready yet" do
+   # 4.should == 5
+   # end
+ end
data/spec/comparator_spec.rb ADDED
@@ -0,0 +1,59 @@
+ require_relative "spec_helper"
+
+ describe "Comparator" do
+   def explode(name)
+     name.gsub(/[,._-]/, ' ').split.map(&:upcase).sort
+   end
+
+   def try(rule, name1, name2)
+     @comp.send(rule, explode(name1), explode(name2)).should be_true
+   end
+
+   def bust(rule, name1, name2)
+     @comp.send(rule, explode(name1), explode(name2)).should be_false
+   end
+
+   before :all do
+     names = [
+       "michael g palmer",
+       "francis l palmer",
+       "michael palmer"
+     ]
+     corpus = names.map {|name| explode(name)}
+     @comp = Comparator.new(corpus)
+   end
+
+   it "finds names that match without initials" do
+     try(:missing_initials, "michael palmer", "michael g palmer")
+     try(:missing_initials, "michael palmer", "Q michael palmer")
+     try(:missing_initials, "michael palmer", "Michael N Palmer x")
+     bust(:missing_initials, "michael palmer", "Michael P")
+     bust(:missing_initials, "michael palmer", "Michael John Palmer")
+
+     matches = @comp.crunch(explode("michael palmer"))
+     matches.should == [explode("michael g palmer")]
+     matches = @comp.crunch(explode("palmer michael"))
+     matches.should == [explode("michael g palmer")]
+     matches = @comp.crunch(explode("michael g palmer"))
+     matches.should == [explode("michael palmer")]
+   end
+
+   it "finds names that match initials to names" do
+     try(:matching_initials, "fred jones", "f jones")
+     try(:matching_initials, "fred jones", "jones f")
+     try(:matching_initials, "fred jones", "fred j")
+     try(:matching_initials, "fred xavier jones", "fred x jones")
+     try(:matching_initials, "fred xavier jones", "xavier jones f")
+     bust(:matching_initials, "fred xavier jones", "fred jones")
+     bust(:matching_initials, "fred xavier jones", "fred q jones")
+     bust(:matching_initials, "fred x jones", "fred q jones")
+     bust(:matching_initials, "fred xavier jones", "homer simpson")
+   end
+
+   it "finds names that match on all but one long names" do
+     try(:matching_all_but_one, "john philip sousa", "john sousa")
+     try(:matching_all_but_one, "philip sousa", "philip john sousa")
+     bust(:matching_all_but_one, "john philip sousa", "philip john sousa")
+     try(:matching_all_but_one, "Helen Q. Glorpworth-Smythe", "helen smythe")
+   end
+ end
data/spec/hash_of_arrays_spec.rb ADDED
@@ -0,0 +1,18 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "Hash of Arrays" do
+   before(:each) do
+     @hoa = {
+       "one" => ["a", "b", "c"],
+       "two" => ["d", "e"],
+       "three" => ["f"]
+     }
+     @hoa.extend DataTools::HashOfArrays
+   end
+
+   it "coalesces" do
+     @hoa.coalesce("one", :into => "three")
+     @hoa.size.should == 2
+     @hoa["three"].should == ["f", "a", "b", "c"]
+   end
+ end
data/spec/import_spec.rb ADDED
@@ -0,0 +1,15 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ require "json"
+
+ describe "File Import" do
+   it "imports" do
+     f = File.open(File.dirname(__FILE__) + "/../hrhead.csv")
+     # sio = f.import.csvme(StringIO.new, ['Person Phone GUID', 'Person Address GUID'])
+     # puts sio.string
+
+     f.import.each_slice(3) do |slice|
+       puts slice.extend(DataTools::ArrayOfHashes).pluck('Person Phone GUID', 'Person Address GUID').to_json
+     end
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,5 @@
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require 'awesome_print'
+
+ require "data_tools"
metadata ADDED
@@ -0,0 +1,120 @@
+ --- !ruby/object:Gem::Specification
+ name: data_tools
+ version: !ruby/object:Gem::Version
+ version: 0.6.0
+ prerelease:
+ platform: ruby
+ authors:
+ - Jason May
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-01-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: awesome_print
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: facets
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ description: Data-munging utilities, including extensions to Array, Hash, String,
+ Symbol plus data conversions and transformations.
+ email: jmay@pobox.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - README.md
+ - Rakefile
+ - data_tools.gemspec
+ - lib/data_tools.rb
+ - lib/data_tools/array.rb
+ - lib/data_tools/array_of_hashes.rb
+ - lib/data_tools/comparator.rb
+ - lib/data_tools/conversions.rb
+ - lib/data_tools/enumerator.rb
+ - lib/data_tools/file.rb
+ - lib/data_tools/hash.rb
+ - lib/data_tools/hash_of_arrays.rb
+ - lib/data_tools/io.rb
+ - lib/data_tools/object.rb
+ - lib/data_tools/rules.rb
+ - lib/data_tools/string.rb
+ - lib/data_tools/symbol.rb
+ - lib/data_tools/transformations.rb
+ - lib/data_tools/version.rb
+ - spec/array_spec.rb
+ - spec/comparator_spec.rb
+ - spec/hash_of_arrays_spec.rb
+ - spec/import_spec.rb
+ - spec/spec_helper.rb
+ homepage: http://github.com/jmay/data_tools
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ requirements: []
+ rubyforge_project: data_tools
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Miscellaneous data-munging utilities.
+ test_files: []