columns_matcher 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.DS_Store ADDED
Binary file
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in columns_matcher.gemspec
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 David Pham
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # ColumnsMatcher
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'columns_matcher'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install columns_matcher
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for columns_matcher.

# Put ./lib on the load path so the version file can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'columns_matcher/version'

Gem::Specification.new do |gem|
  gem.name          = 'columns_matcher'
  gem.version       = ColumnsMatcher::VERSION
  gem.authors       = ['David Pham']
  gem.email         = ['hello@khoi.co']
  gem.description   = %q{Determine the mappings between 2 sets of data}
  gem.summary       = %q{Determine the mappings between 2 sets of data}
  gem.homepage      = ''
  # The project ships an MIT LICENSE.txt; declare it so it shows up in the gem metadata.
  gem.license       = 'MIT'

  # Package every file tracked by git, except macOS Finder metadata.
  gem.files         = `git ls-files`.split($/).reject { |file| File.basename(file) == '.DS_Store' }
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  gem.add_dependency 'gratr19', '~> 0.4.4.1'
  gem.add_dependency 'rserve-client', '~> 0.3.0'
  gem.add_dependency 'rserve-simpler', '~> 0.0.6'
  gem.add_dependency 'munkres', '~> 0.1.0'
end
data/lib/.DS_Store ADDED
Binary file
@@ -0,0 +1,52 @@
1
module ColumnsMatcher
  # Conversions between plain Ruby arrays, R vector literals, graph vertices
  # and the cost matrix handed to the assignment algorithm.
  module Converter
    # values is an array of any type
    # Returns a new array in which every String is replaced by its String#sum
    # checksum; every other element is passed through unchanged.
    # Note that distinct strings can share a checksum.
    def self.convert_from_array_to_numeric_array(values)
      values.map { |value| value.is_a?(String) ? value.sum : value }
    end

    # values is an array of numeric types
    # Returns the source text of an R vector literal, e.g. [1, 2] => "c(1,2)"
    def self.convert_from_array_to_r_vector(values)
      "c(#{values.join(',')})"
    end

    # graph is an UndirectedGraph
    # Returns one flat vector per vertex, in graph.vertices order, in the format
    # [vertex weight, edge weight, edge weight, ...] with the edge weights in
    # descending order, for input into calculation of Euclidean distance
    def self.convert_from_vertices_to_vectors(graph)
      graph.vertices.map do |vertex|
        edge_weights = graph.adjacent(vertex, { type: :edges }).map { |edge| edge.label }

        [graph[vertex]] + edge_weights.sort { |first, second| second <=> first }
      end
    end

    # first_vectors and second_vectors are arrays of vectors as produced by
    # convert_from_vertices_to_vectors
    # Returns a square cost matrix for input into the Munkres assignment
    # algorithm. Cell [row][column] is the Euclidean distance between
    # first_vectors[row] and second_vectors[column]. When the two inputs differ
    # in size, the matrix is squared to the larger size and every cell without a
    # real pair of vectors holds Graph::STUB_VALUE. Two empty inputs give [].
    def self.convert_from_vectors_to_euclidian_distance_cost_matrix(first_vectors, second_vectors)
      size = [first_vectors.size, second_vectors.size].max

      Array.new(size) do |row|
        Array.new(size) do |column|
          # Index the vectors directly by row and column so that the lookup
          # cannot drift when there are more rows than columns.
          if row < first_vectors.size && column < second_vectors.size
            Statistics::euclidian_distance(first_vectors[row], second_vectors[column])
          else
            Graph::STUB_VALUE
          end
        end
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require 'gratr'
2
+
3
module ColumnsMatcher
  # Builds per-dataset dependency graphs and matches the vertices of two such
  # graphs against each other.
  module Graph
    include GRATR

    # Cost written into the cells that pad a non-square cost matrix.
    STUB_VALUE = 1000

    # columns is a hash with
    # column names as the keys and column values as the values
    # Returns an UndirectedGraph with one vertex per column, labelled with the
    # column's entropy, and an edge between every two distinct columns,
    # labelled with their mutual information
    def self.build_dependency_graph(columns)
      graph = UndirectedGraph.new

      Utilities::get_combinations(columns.keys).each do |first_column_name, second_column_name|
        first_column_values = columns[first_column_name]

        if first_column_name == second_column_name
          # A column paired with itself supplies the vertex weight.
          graph[first_column_name] = Statistics::entropy(first_column_values)
        else
          second_column_values = columns[second_column_name]
          mutual_information_between_columns = Statistics::mutual_information(first_column_values, second_column_values)

          graph.add_edge!(first_column_name, second_column_name, mutual_information_between_columns)
        end
      end

      graph
    end

    # graph_to_match_against and graph_to_match are UndirectedGraph's
    # Returns mappings of columns with the keys as columns to match and values as columns to match against
    def self.match_graphs(graph_to_match_against, graph_to_match)
      graph_to_match_against_vectors = Converter::convert_from_vertices_to_vectors(graph_to_match_against)
      graph_to_match_vectors = Converter::convert_from_vertices_to_vectors(graph_to_match)

      # Rows of the cost matrix are the columns to match; its columns are the
      # columns to match against.
      cost_matrix = Converter::convert_from_vectors_to_euclidian_distance_cost_matrix(graph_to_match_vectors, graph_to_match_against_vectors)

      pairings = Statistics::munkres_assignment_algorithm(cost_matrix)

      Graph::get_mappings(pairings, graph_to_match_against, graph_to_match)
    end

    # pairings is a 2D array of sub-arrays of size 2 - the results of the Munkres assignment algorithm,
    # each holding an index into second_graph's vertices followed by an index into first_graph's vertices
    # first_graph and second_graph are UndirectedGraph's
    # Returns mappings of columns with the keys as columns to match and values as columns to match against.
    # A pairing whose first index lies beyond second_graph's vertices refers to a
    # padding row of the squared cost matrix and is skipped, so the result never
    # contains a nil key. A column to match that was paired with a padding
    # column maps to nil.
    def self.get_mappings(pairings, first_graph, second_graph)
      first_graph_vertices = first_graph.vertices
      second_graph_vertices = second_graph.vertices

      pairings.each_with_object({}) do |pair, mappings|
        next unless pair.first < second_graph_vertices.size

        mappings[second_graph_vertices[pair.first]] = first_graph_vertices[pair.last]
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'munkres'
2
+
3
module ColumnsMatcher
  # Statistical primitives: entropy and mutual information (computed by R's
  # "entropy" package through Rserve), Euclidean distance and the Munkres
  # assignment algorithm.
  module Statistics
    # values is an array of values
    # Returns entropy statistic, i.e. the result of R's entropy(table(...))
    # over the values converted to numbers
    def self.entropy(values)
      values_in_r_vector = to_r_vector(values)

      with_r_session { |r| r.converse("entropy(table(#{values_in_r_vector}))") }
    end

    # first_values and second_values are arrays of values
    # Returns mutual information statistic, i.e. the result of R's
    # mi.plugin(rbind(...)) over both arrays converted to numbers
    # NOTE(review): mi.plugin receives the raw values bound as two rows rather
    # than a joint frequency table - confirm this is intended.
    def self.mutual_information(first_values, second_values)
      first_values_in_r_vector = to_r_vector(first_values)
      second_values_in_r_vector = to_r_vector(second_values)

      with_r_session do |r|
        r.converse("mi.plugin(rbind(#{first_values_in_r_vector}, #{second_values_in_r_vector}))")
      end
    end

    # first_vector and second_vector are two arrays of two potentially different sizes
    # Returns Euclidean distance between the two vectors; the shorter vector is
    # treated as if it were padded with zeros up to the length of the longer one
    def self.euclidian_distance(first_vector, second_vector)
      size = [first_vector.size, second_vector.size].max

      sum_of_squares = (0...size).inject(0) do |sum, index|
        # fetch only substitutes 0 for positions past the end of a vector.
        sum + (first_vector.fetch(index, 0) - second_vector.fetch(index, 0))**2
      end

      Math.sqrt(sum_of_squares)
    end

    # cost_matrix is a square matrix of costs
    # Returns pairings that optimize costs - lowest overall cost
    def self.munkres_assignment_algorithm(cost_matrix)
      Munkres.new(cost_matrix).find_pairings
    end

    # Converts an array of arbitrary values into the text of an R vector literal.
    def self.to_r_vector(values)
      Converter::convert_from_array_to_r_vector(Converter::convert_from_array_to_numeric_array(values))
    end
    private_class_method :to_r_vector

    # Opens an Rserve connection, loads the "entropy" package, yields the
    # connection and returns the block's result. The connection is always
    # closed afterwards so that repeated calls do not pile up open sessions.
    def self.with_r_session
      r = Rserve::Simpler.new
      r.converse("library(\"entropy\")")

      yield r
    ensure
      r.close if r
    end
    private_class_method :with_r_session
  end
end
@@ -0,0 +1,9 @@
1
module ColumnsMatcher
  # Small helpers shared by the other modules.
  module Utilities
    # column_names is an array
    # Returns every ordered pair of column names (the Cartesian product of the
    # array with itself), including each name paired with itself
    def self.get_combinations(column_names)
      column_names.product(column_names)
    end
  end
end
@@ -0,0 +1,3 @@
1
module ColumnsMatcher
  # Version of the gem, read by the gemspec.
  VERSION = '0.0.1'
end
@@ -0,0 +1,20 @@
1
+ require 'columns_matcher/version'
2
+ require 'columns_matcher/utilities'
3
+ require 'columns_matcher/converter'
4
+ require 'columns_matcher/statistics'
5
+ require 'columns_matcher/graph'
6
+ require 'rserve/simpler'
7
+
8
module ColumnsMatcher
  # columns_to_match_against and columns_to_match are hashes with
  # column names as the keys and column values as the values
  # Builds one dependency graph per hash and matches the two graphs.
  # Returns mappings of columns with the keys as columns to match and values as columns to match against
  def self.match(columns_to_match_against, columns_to_match)
    columns_to_match_against_graph = Graph::build_dependency_graph(columns_to_match_against)
    columns_to_match_graph = Graph::build_dependency_graph(columns_to_match)

    Graph::match_graphs(columns_to_match_against_graph, columns_to_match_graph)
  end
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: columns_matcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - David Pham
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: gratr19
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 0.4.4.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 0.4.4.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: rserve-client
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 0.3.0
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 0.3.0
46
+ - !ruby/object:Gem::Dependency
47
+ name: rserve-simpler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 0.0.6
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.0.6
62
+ - !ruby/object:Gem::Dependency
63
+ name: munkres
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: 0.1.0
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 0.1.0
78
+ description: Determine the mappings between 2 sets of data
79
+ email:
80
+ - hello@khoi.co
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - .DS_Store
86
+ - .gitignore
87
+ - Gemfile
88
+ - LICENSE.txt
89
+ - README.md
90
+ - Rakefile
91
+ - columns_matcher.gemspec
92
+ - lib/.DS_Store
93
+ - lib/columns_matcher.rb
94
+ - lib/columns_matcher/converter.rb
95
+ - lib/columns_matcher/graph.rb
96
+ - lib/columns_matcher/statistics.rb
97
+ - lib/columns_matcher/utilities.rb
98
+ - lib/columns_matcher/version.rb
99
+ homepage: ''
100
+ licenses: []
101
+ post_install_message:
102
+ rdoc_options: []
103
+ require_paths:
104
+ - lib
105
+ required_ruby_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ none: false
113
+ requirements:
114
+ - - ! '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 1.8.24
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Determine the mappings between 2 sets of data
123
+ test_files: []