columns_matcher 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.DS_Store +0 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/columns_matcher.gemspec +24 -0
- data/lib/.DS_Store +0 -0
- data/lib/columns_matcher/converter.rb +52 -0
- data/lib/columns_matcher/graph.rb +73 -0
- data/lib/columns_matcher/statistics.rb +66 -0
- data/lib/columns_matcher/utilities.rb +9 -0
- data/lib/columns_matcher/version.rb +3 -0
- data/lib/columns_matcher.rb +20 -0
- metadata +123 -0
data/.DS_Store
ADDED
Binary file
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 David Pham
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# ColumnsMatcher
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'columns_matcher'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install columns_matcher
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for columns_matcher.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'columns_matcher/version'

Gem::Specification.new do |gem|
  gem.name          = 'columns_matcher'
  gem.version       = ColumnsMatcher::VERSION
  gem.authors       = ['David Pham']
  gem.email         = ['hello@khoi.co']
  gem.description   = %q{Determine the mappings between 2 sets of data}
  gem.summary       = %q{Determine the mappings between 2 sets of data}
  gem.homepage      = ''
  # The package ships LICENSE.txt containing the MIT license text; declare it
  # so the published metadata does not report an empty license list.
  gem.license       = 'MIT'

  # Package every file tracked by git, except macOS Finder metadata
  # (.DS_Store), which has no business inside a released gem.
  gem.files         = `git ls-files`.split($/).reject { |file| File.basename(file) == '.DS_Store' }
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies: graph structure, R bridge, and assignment solver.
  gem.add_dependency 'gratr19', '~> 0.4.4.1'
  gem.add_dependency 'rserve-client', '~> 0.3.0'
  gem.add_dependency 'rserve-simpler', '~> 0.0.6'
  gem.add_dependency 'munkres', '~> 0.1.0'
end
|
data/lib/.DS_Store
ADDED
Binary file
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module ColumnsMatcher
  # Helpers that reshape data between the representations used by the
  # statistics, graph and assignment steps.
  module Converter
    # Replaces every String in +values+ with an Integer so the array can be
    # handed to numeric code.
    #
    # values - an Array of any type.
    #
    # Strings are mapped through String#sum (a checksum of the string's
    # characters), so two different strings may map to the same number.
    # Non-String elements are passed through untouched.
    #
    # Returns a new Array.
    def self.convert_from_array_to_numeric_array(values)
      values.collect { |value| value.is_a?(String) ? value.sum : value }
    end

    # Renders +values+ as the source text of an R vector literal.
    #
    # values - an Array of numeric values.
    #
    # Returns a String such as "c(1,2,3)".
    def self.convert_from_array_to_r_vector(values)
      "c(#{values.join(',')})"
    end

    # Builds one flat vector per vertex of +graph+.
    #
    # graph - an UndirectedGraph; it must respond to #vertices, #[] and
    #         #adjacent(vertex, type: :edges), and its edges to #label.
    #
    # Returns an Array of Arrays, each of the form
    # [vertex weight, edge weight, edge weight, ...] with the edge weights in
    # descending order, for input into the Euclidian distance calculation.
    def self.convert_from_vertices_to_vectors(graph)
      graph.vertices.map do |vertex|
        edge_weights = graph.adjacent(vertex, { type: :edges }).collect { |edge| edge.label }

        [graph[vertex]] + edge_weights.sort { |first, second| second <=> first }
      end
    end

    # Builds the square cost matrix consumed by the Munkres assignment
    # algorithm.
    #
    # first_vectors  - Array of vectors (see convert_from_vertices_to_vectors);
    #                  one matrix row per vector.
    # second_vectors - Array of vectors; one matrix column per vector.
    #
    # Cell [row][column] holds the Euclidian distance between
    # first_vectors[row] and second_vectors[column]. When the two inputs have
    # different sizes the matrix is padded to a square with Graph::STUB_VALUE.
    #
    # Returns an Array of Arrays (empty when both inputs are empty).
    def self.convert_from_vectors_to_euclidian_distance_cost_matrix(first_vectors, second_vectors)
      size = [first_vectors.size, second_vectors.size].max

      cost_matrix = Array.new(size) { Array.new(size) { Graph::STUB_VALUE } }

      # Each cell is addressed by its own row/column rather than through a
      # shared flat index: a flat list of distances has a row stride of
      # second_vectors.size, which differs from the padded matrix stride
      # whenever first_vectors is the longer input.
      first_vectors.each_with_index do |first_vector, row|
        second_vectors.each_with_index do |second_vector, column|
          cost_matrix[row][column] = Statistics::euclidian_distance(first_vector, second_vector)
        end
      end

      cost_matrix
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'gratr'
|
2
|
+
|
3
|
+
module ColumnsMatcher
  # Builds weighted dependency graphs from column data and matches the
  # vertices of two such graphs against each other.
  module Graph
    include GRATR

    # Cost written into the padding cells of the square cost matrix when the
    # two graphs being matched have different numbers of vertices.
    STUB_VALUE = 1000

    # Builds the dependency graph of a set of columns.
    #
    # columns - a Hash with column names as the keys and column values as the
    #           values.
    #
    # For every pair produced by Utilities::get_combinations: a pair of two
    # equal names sets that vertex's weight to the entropy of the column; a
    # pair of two different names adds an edge labelled with the mutual
    # information between the two columns.
    # NOTE(review): this relies on get_combinations also yielding each name
    # paired with itself - confirm against utilities.rb.
    #
    # Returns an UndirectedGraph of vertices and edges, with weights.
    def self.build_dependency_graph(columns)
      graph = UndirectedGraph.new

      Utilities::get_combinations(columns.keys).each do |combination_of_column_names|
        first_column_name = combination_of_column_names.first
        second_column_name = combination_of_column_names.last

        first_column_values = columns[first_column_name]
        second_column_values = columns[second_column_name]

        if first_column_name == second_column_name
          graph[first_column_name] = Statistics::entropy(first_column_values)
        else
          mutual_information_between_columns =
            Statistics::mutual_information(first_column_values, second_column_values)

          graph.add_edge!(first_column_name, second_column_name, mutual_information_between_columns)
        end
      end

      graph
    end

    # Matches the vertices of two dependency graphs.
    #
    # graph_to_match_against and graph_to_match are UndirectedGraph's.
    #
    # Returns mappings of columns with the keys as columns to match and the
    # values as columns to match against.
    def self.match_graphs(graph_to_match_against, graph_to_match)
      graph_to_match_against_vectors = Converter::convert_from_vertices_to_vectors(graph_to_match_against)
      graph_to_match_vectors = Converter::convert_from_vertices_to_vectors(graph_to_match)

      # Rows of the cost matrix are the columns to match; matrix columns are
      # the columns to match against.
      cost_matrix = Converter::convert_from_vectors_to_euclidian_distance_cost_matrix(
        graph_to_match_vectors, graph_to_match_against_vectors
      )

      pairings = Statistics::munkres_assignment_algorithm(cost_matrix)

      Graph::get_mappings(pairings, graph_to_match_against, graph_to_match)
    end

    # Translates index pairings back into vertex names.
    #
    # pairings     - a 2D array of sub-arrays of size 2 - the results of the
    #                Munkres assignment algorithm. pair.first indexes
    #                second_graph's vertices, pair.last indexes first_graph's.
    # first_graph  - the UndirectedGraph to match against.
    # second_graph - the UndirectedGraph to match.
    #
    # Pairings whose first index lies beyond second_graph's vertices come from
    # padding rows of the square cost matrix; they are skipped so the result
    # never contains a nil key. A vertex of second_graph paired with a padding
    # column is kept and maps to nil (no counterpart).
    #
    # Returns mappings of columns with the keys as columns to match and the
    # values as columns to match against.
    def self.get_mappings(pairings, first_graph, second_graph)
      first_graph_vertices = first_graph.vertices
      second_graph_vertices = second_graph.vertices

      mappings = {}

      pairings.each do |pair|
        next if pair.first >= second_graph_vertices.size

        mappings[second_graph_vertices[pair.first]] = first_graph_vertices[pair.last]
      end

      mappings
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'munkres'
|
2
|
+
|
3
|
+
module ColumnsMatcher
  # Statistical primitives used to weight and compare dependency graphs.
  # The entropy and mutual information figures are computed by R through
  # Rserve; the remaining methods run in Ruby.
  module Statistics
    # values is an array of values.
    #
    # Strings are converted to numbers first, then the R expression
    # entropy(table(c(...))) is evaluated after loading R's "entropy" library.
    # NOTE(review): a new Rserve::Simpler object is created on every call and
    # never explicitly released - confirm whether it should be closed/reused.
    #
    # Returns entropy statistic as produced by R.
    def self.entropy(values)
      r = Rserve::Simpler.new

      r.converse("library(\"entropy\")")

      numeric_values = Converter::convert_from_array_to_numeric_array(values)
      values_in_r_vector = Converter::convert_from_array_to_r_vector(numeric_values)

      r.converse("entropy(table(#{values_in_r_vector}))")
    end

    # first_values and second_values are arrays of values.
    #
    # Both arrays are converted to numbers and bound row-wise (R's rbind)
    # before being passed to mi.plugin from R's "entropy" library.
    # NOTE(review): a new Rserve::Simpler object is created on every call and
    # never explicitly released - confirm whether it should be closed/reused.
    #
    # Returns mutual information statistic as produced by R.
    def self.mutual_information(first_values, second_values)
      r = Rserve::Simpler.new

      r.converse("library(\"entropy\")")

      first_numeric_values = Converter::convert_from_array_to_numeric_array(first_values)
      second_numeric_values = Converter::convert_from_array_to_numeric_array(second_values)

      first_values_in_r_vector = Converter::convert_from_array_to_r_vector(first_numeric_values)
      second_values_in_r_vector = Converter::convert_from_array_to_r_vector(second_numeric_values)

      r.converse("mi.plugin(rbind(#{first_values_in_r_vector}, #{second_values_in_r_vector}))")
    end

    # first_vector and second_vector are two arrays of numbers, of two
    # potentially different sizes. The shorter one is treated as if it were
    # padded with zeros up to the length of the longer one.
    #
    # Returns Euclidian distance between the two vectors as a Float.
    def self.euclidian_distance(first_vector, second_vector)
      size = [first_vector.size, second_vector.size].max
      sum = 0

      size.times do |index|
        # Indexing past the end of the shorter vector yields nil; substitute
        # the zero padding explicitly so the subtraction is always numeric.
        difference = (first_vector[index] || 0) - (second_vector[index] || 0)

        sum += difference ** 2
      end

      Math.sqrt(sum)
    end

    # cost_matrix is a square matrix of costs.
    #
    # Returns pairings that optimize costs - lowest overall cost.
    def self.munkres_assignment_algorithm(cost_matrix)
      Munkres.new(cost_matrix).find_pairings
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'columns_matcher/version'
|
2
|
+
require 'columns_matcher/utilities'
|
3
|
+
require 'columns_matcher/converter'
|
4
|
+
require 'columns_matcher/statistics'
|
5
|
+
require 'columns_matcher/graph'
|
6
|
+
require 'rserve/simpler'
|
7
|
+
|
8
|
+
module ColumnsMatcher
  # Entry point: works out which column of one data set corresponds to which
  # column of another.
  #
  # columns_to_match_against - Hash of column name => column values for the
  #                            reference data set.
  # columns_to_match         - Hash of column name => column values for the
  #                            data set whose columns need identifying.
  #
  # A dependency graph is built for each data set (reference first) and the
  # two graphs are then matched.
  #
  # Returns a Hash whose keys are columns to match and whose values are the
  # columns to match against.
  def self.match(columns_to_match_against, columns_to_match)
    reference_graph = Graph::build_dependency_graph(columns_to_match_against)
    candidate_graph = Graph::build_dependency_graph(columns_to_match)

    Graph::match_graphs(reference_graph, candidate_graph)
  end
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: columns_matcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David Pham
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: gratr19
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.4.4.1
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.4.4.1
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rserve-client
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 0.3.0
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.3.0
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rserve-simpler
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.0.6
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.0.6
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: munkres
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.1.0
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 0.1.0
|
78
|
+
description: Determine the mappings between 2 sets of data
|
79
|
+
email:
|
80
|
+
- hello@khoi.co
|
81
|
+
executables: []
|
82
|
+
extensions: []
|
83
|
+
extra_rdoc_files: []
|
84
|
+
files:
|
85
|
+
- .DS_Store
|
86
|
+
- .gitignore
|
87
|
+
- Gemfile
|
88
|
+
- LICENSE.txt
|
89
|
+
- README.md
|
90
|
+
- Rakefile
|
91
|
+
- columns_matcher.gemspec
|
92
|
+
- lib/.DS_Store
|
93
|
+
- lib/columns_matcher.rb
|
94
|
+
- lib/columns_matcher/converter.rb
|
95
|
+
- lib/columns_matcher/graph.rb
|
96
|
+
- lib/columns_matcher/statistics.rb
|
97
|
+
- lib/columns_matcher/utilities.rb
|
98
|
+
- lib/columns_matcher/version.rb
|
99
|
+
homepage: ''
|
100
|
+
licenses: []
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options: []
|
103
|
+
require_paths:
|
104
|
+
- lib
|
105
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
+
none: false
|
113
|
+
requirements:
|
114
|
+
- - ! '>='
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
requirements: []
|
118
|
+
rubyforge_project:
|
119
|
+
rubygems_version: 1.8.24
|
120
|
+
signing_key:
|
121
|
+
specification_version: 3
|
122
|
+
summary: Determine the mappings between 2 sets of data
|
123
|
+
test_files: []
|