columns_matcher 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.DS_Store ADDED
Binary file
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in columns_matcher.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 David Pham
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # ColumnsMatcher
2
+
3
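+ Determine the mappings between 2 sets of data: given two sets of named columns, ColumnsMatcher pairs each column of one set with a column of the other.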
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'columns_matcher'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install columns_matcher
18
+
19
+ ## Usage
20
+
21
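+ Call `ColumnsMatcher.match(columns_to_match_against, columns_to_match)`, where both arguments are hashes with column names as keys and arrays of column values as values. It returns a hash whose keys are the columns to match and whose values are the columns they were matched against. The statistics are computed in R through Rserve using the R `entropy` package, so both are expected to be available.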
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for columns_matcher.
#
# Puts ./lib on the load path so the version constant can be read without the
# gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'columns_matcher/version'

Gem::Specification.new do |gem|
  gem.name          = 'columns_matcher'
  gem.version       = ColumnsMatcher::VERSION
  gem.authors       = ['David Pham']
  gem.email         = ['hello@khoi.co']
  gem.description   = %q{Determine the mappings between 2 sets of data}
  gem.summary       = %q{Determine the mappings between 2 sets of data}
  gem.homepage      = ''
  # LICENSE.txt ships the MIT license text; declare it so the packaged
  # metadata no longer reports an empty license list.
  gem.license       = 'MIT'

  # Package everything tracked by git except macOS Finder metadata, which was
  # being shipped as binary junk (.DS_Store, lib/.DS_Store).
  gem.files         = `git ls-files`.split($/).reject { |file| File.basename(file) == '.DS_Store' }
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  gem.add_dependency 'gratr19', '~> 0.4.4.1'
  gem.add_dependency 'rserve-client', '~> 0.3.0'
  gem.add_dependency 'rserve-simpler', '~> 0.0.6'
  gem.add_dependency 'munkres', '~> 0.1.0'
end
data/lib/.DS_Store ADDED
Binary file
@@ -0,0 +1,52 @@
1
module ColumnsMatcher
  # Conversions between plain Ruby arrays, R vector literals, graph vertices
  # and the square cost matrix handed to the assignment algorithm.
  module Converter
    # Replaces every String in +values+ with its String#sum checksum and
    # leaves all other values untouched.
    #
    # values - an array of any type
    #
    # Returns a new array of the same length.
    def self.convert_from_array_to_numeric_array(values)
      values.map { |value| value.is_a?(String) ? value.sum : value }
    end

    # Renders +values+ as an R vector literal, e.g. [1, 2, 3] => "c(1,2,3)".
    #
    # values - an array of numeric types
    #
    # Returns a String containing the R expression.
    def self.convert_from_array_to_r_vector(values)
      "c(#{values.join(',')})"
    end

    # Builds one flat vector per vertex of +graph+:
    #   [vertex label, edge label, edge label, ...]
    # with the edge labels sorted in descending order. The vectors are the
    # input for the Euclidian distance calculation.
    #
    # graph - an UndirectedGraph; only #vertices, #[] and #adjacent are used
    #
    # Returns an array of vectors, one per vertex, in #vertices order.
    def self.convert_from_vertices_to_vectors(graph)
      graph.vertices.map do |vertex|
        edge_weights = graph.adjacent(vertex, { type: :edges }).map { |edge| edge.label }

        [graph[vertex]] + edge_weights.sort { |first, second| second <=> first }
      end
    end

    # Builds the square cost matrix for the Munkres assignment algorithm.
    # Cell [row][column] holds the Euclidian distance between
    # first_vectors[row] and second_vectors[column]. Rows or columns that only
    # exist to make the matrix square are filled with Graph::STUB_VALUE.
    #
    # Each distance is looked up by its own (row, column) pair. The previous
    # implementation indexed the flat list of distances with the padded matrix
    # width, which picked the wrong distances (and nils) whenever
    # first_vectors was longer than second_vectors.
    #
    # first_vectors  - array of vectors as built by convert_from_vertices_to_vectors
    # second_vectors - array of vectors as built by convert_from_vertices_to_vectors
    #
    # Returns an array of +size+ rows of +size+ costs, where +size+ is the
    # larger of the two input sizes (an empty array when both are empty).
    def self.convert_from_vectors_to_euclidian_distance_cost_matrix(first_vectors, second_vectors)
      size = [first_vectors.size, second_vectors.size].max

      Array.new(size) do |row|
        Array.new(size) do |column|
          if row < first_vectors.size && column < second_vectors.size
            Statistics::euclidian_distance(first_vectors[row], second_vectors[column])
          else
            Graph::STUB_VALUE
          end
        end
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require 'gratr'
2
+
3
module ColumnsMatcher
  # Builds dependency graphs from column data and matches two such graphs
  # against each other.
  module Graph
    include GRATR

    # Filler cost used by Converter to pad the cost matrix to a square when
    # the two graphs have a different number of vertices.
    STUB_VALUE = 1000

    # Builds the dependency graph of a set of columns. Every column is given
    # its entropy through graph[column_name] = entropy, and every pair of
    # distinct columns is connected by an edge labelled with their mutual
    # information.
    #
    # columns - a hash with column names as the keys and column values as the values
    #
    # Returns a graph of vertices and edges, with weights.
    def self.build_dependency_graph(columns)
      graph = UndirectedGraph.new

      # get_combinations yields every ordered pair, including each name paired
      # with itself; the self-pairs are the ones that set the vertex weight.
      Utilities::get_combinations(columns.keys).each do |first_column_name, second_column_name|
        first_column_values = columns[first_column_name]
        second_column_values = columns[second_column_name]

        if first_column_name == second_column_name
          graph[first_column_name] = Statistics::entropy(first_column_values)
        else
          mutual_information_between_columns = Statistics::mutual_information(first_column_values, second_column_values)

          graph.add_edge!(first_column_name, second_column_name, mutual_information_between_columns)
        end
      end

      graph
    end

    # Matches the vertices of two dependency graphs by minimising the total
    # Euclidian distance between their vertex vectors.
    #
    # graph_to_match_against and graph_to_match are UndirectedGraph's
    #
    # Returns mappings of columns with the keys as columns to match and values
    # as columns to match against.
    def self.match_graphs(graph_to_match_against, graph_to_match)
      graph_to_match_against_vectors = Converter::convert_from_vertices_to_vectors(graph_to_match_against)
      graph_to_match_vectors = Converter::convert_from_vertices_to_vectors(graph_to_match)

      # Rows of the cost matrix are the columns to match, columns of the
      # matrix are the columns to match against.
      cost_matrix = Converter::convert_from_vectors_to_euclidian_distance_cost_matrix(graph_to_match_vectors, graph_to_match_against_vectors)

      pairings = Statistics::munkres_assignment_algorithm(cost_matrix)

      Graph::get_mappings(pairings, graph_to_match_against, graph_to_match)
    end

    # Translates index pairings back into vertex names.
    #
    # pairings     - a 2D array of sub-arrays of size 2 - the results of the
    #                Munkres assignment algorithm; pair.first indexes
    #                second_graph's vertices, pair.last indexes first_graph's
    # first_graph  - UndirectedGraph holding the columns to match against
    # second_graph - UndirectedGraph holding the columns to match
    #
    # Pairings whose row lies beyond second_graph's vertices only exist because
    # the cost matrix was padded to a square; they are skipped so the result
    # never contains a nil key. A column to match that was paired with a
    # padding column keeps its entry, with a nil value.
    #
    # Returns mappings of columns with the keys as columns to match and values
    # as columns to match against.
    def self.get_mappings(pairings, first_graph, second_graph)
      first_graph_vertices = first_graph.vertices
      second_graph_vertices = second_graph.vertices

      pairings.each_with_object({}) do |pair, mappings|
        next if pair.first >= second_graph_vertices.size

        mappings[second_graph_vertices[pair.first]] = first_graph_vertices[pair.last]
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'munkres'
2
+
3
module ColumnsMatcher
  # Statistical helpers. Entropy and mutual information are delegated to the
  # R "entropy" package through Rserve; the distance and assignment helpers
  # run in Ruby.
  module Statistics
    # values - an array of values; Strings are converted to numbers by
    #          Converter::convert_from_array_to_numeric_array first
    #
    # Returns entropy statistic, as returned by R's entropy(table(...)).
    def self.entropy(values)
      numeric_values = Converter::convert_from_array_to_numeric_array(values)
      values_in_r_vector = Converter::convert_from_array_to_r_vector(numeric_values)

      with_r_entropy_session do |r|
        r.converse("entropy(table(#{values_in_r_vector}))")
      end
    end

    # first_values and second_values are arrays of values; Strings are
    # converted to numbers first.
    #
    # Returns mutual information statistic, as returned by R's
    # mi.plugin(rbind(...)).
    def self.mutual_information(first_values, second_values)
      first_numeric_values = Converter::convert_from_array_to_numeric_array(first_values)
      second_numeric_values = Converter::convert_from_array_to_numeric_array(second_values)

      first_values_in_r_vector = Converter::convert_from_array_to_r_vector(first_numeric_values)
      second_values_in_r_vector = Converter::convert_from_array_to_r_vector(second_numeric_values)

      with_r_entropy_session do |r|
        r.converse("mi.plugin(rbind(#{first_values_in_r_vector}, #{second_values_in_r_vector}))")
      end
    end

    # first_vector and second_vector are two arrays of two potentially
    # different sizes. The shorter one is treated as if it were padded with
    # zeros up to the length of the longer one.
    #
    # Returns Euclidian distance between the two vectors, as a Float.
    def self.euclidian_distance(first_vector, second_vector)
      size = [first_vector.size, second_vector.size].max

      # fetch(index, 0) only substitutes 0 past the end of a vector. The
      # previous Array#fill call overwrote the zero padding with nil, so
      # vectors of different sizes raised instead of being padded.
      sum_of_squares = (0...size).inject(0) do |sum, index|
        sum + (first_vector.fetch(index, 0) - second_vector.fetch(index, 0))**2
      end

      Math.sqrt(sum_of_squares)
    end

    # cost_matrix is a square matrix of costs
    #
    # Returns pairings that optimize costs - lowest overall cost.
    def self.munkres_assignment_algorithm(cost_matrix)
      Munkres.new(cost_matrix).find_pairings
    end

    # Opens an Rserve connection, loads the R "entropy" package, yields the
    # connection and closes it afterwards, so that repeated calls do not
    # accumulate open connections.
    #
    # Returns whatever the block returns.
    def self.with_r_entropy_session
      r = Rserve::Simpler.new

      begin
        r.converse("library(\"entropy\")")

        yield r
      ensure
        r.close
      end
    end
    private_class_method :with_r_entropy_session
  end
end
@@ -0,0 +1,9 @@
1
module ColumnsMatcher
  # Small collection helpers.
  module Utilities
    # Pairs every column name with every column name, itself included, so
    # both [a, b] and [b, a] appear in the result.
    #
    # column_names - an array
    #
    # Returns an array of two-element arrays, ordered by first name and then
    # by second name.
    def self.get_combinations(column_names)
      column_names.flat_map do |first_name|
        column_names.map { |second_name| [first_name, second_name] }
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module ColumnsMatcher
  # Release number of the gem; columns_matcher.gemspec reads it as gem.version.
  VERSION = '0.0.1'
end
@@ -0,0 +1,20 @@
1
+ require 'columns_matcher/version'
2
+ require 'columns_matcher/utilities'
3
+ require 'columns_matcher/converter'
4
+ require 'columns_matcher/statistics'
5
+ require 'columns_matcher/graph'
6
+ require 'rserve/simpler'
7
+
8
module ColumnsMatcher
  # Entry point of the gem: builds a dependency graph for each set of columns
  # and matches the two graphs.
  #
  # columns_to_match_against - hash with column names as the keys and column
  #                            values as the values
  # columns_to_match         - hash of the same shape
  #
  # Returns mappings of columns with the keys as columns to match and values
  # as columns to match against.
  def self.match(columns_to_match_against, columns_to_match)
    reference_graph = Graph::build_dependency_graph(columns_to_match_against)
    candidate_graph = Graph::build_dependency_graph(columns_to_match)

    Graph::match_graphs(reference_graph, candidate_graph)
  end
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: columns_matcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - David Pham
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: gratr19
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 0.4.4.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 0.4.4.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: rserve-client
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 0.3.0
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 0.3.0
46
+ - !ruby/object:Gem::Dependency
47
+ name: rserve-simpler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 0.0.6
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.0.6
62
+ - !ruby/object:Gem::Dependency
63
+ name: munkres
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: 0.1.0
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 0.1.0
78
+ description: Determine the mappings between 2 sets of data
79
+ email:
80
+ - hello@khoi.co
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - .DS_Store
86
+ - .gitignore
87
+ - Gemfile
88
+ - LICENSE.txt
89
+ - README.md
90
+ - Rakefile
91
+ - columns_matcher.gemspec
92
+ - lib/.DS_Store
93
+ - lib/columns_matcher.rb
94
+ - lib/columns_matcher/converter.rb
95
+ - lib/columns_matcher/graph.rb
96
+ - lib/columns_matcher/statistics.rb
97
+ - lib/columns_matcher/utilities.rb
98
+ - lib/columns_matcher/version.rb
99
+ homepage: ''
100
+ licenses: []
101
+ post_install_message:
102
+ rdoc_options: []
103
+ require_paths:
104
+ - lib
105
+ required_ruby_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ none: false
113
+ requirements:
114
+ - - ! '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 1.8.24
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Determine the mappings between 2 sets of data
123
+ test_files: []