linkage 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,108 @@
1
module Linkage
  # Abstract class to represent SQL functions. No attempts are made to
  # ensure that the function actually exists in the database you're using.
  #
  # Concrete functions subclass {Function}, define +function_name+ and
  # +ruby_type+, and make themselves discoverable via {Function.register}.
  #
  # @abstract
  class Function < Data
    # Register a new function so that it can be looked up by name with
    # {Function.[]}.
    #
    # The class must provide a concrete +ruby_type+ instance method. It may
    # define the method itself or inherit it from an intermediate function
    # class; the placeholder defined on {Function} does not count.
    #
    # @param [Class] klass Function class (probably a subclass of {Function})
    # @return [Class] the class that was registered
    # @raise [ArgumentError] if no concrete +ruby_type+ method is available
    def self.register(klass)
      unless concrete_ruby_type?(klass)
        raise ArgumentError, "ruby_type instance method must be defined"
      end

      @functions ||= {}
      @functions[klass.function_name] = klass
    end

    # Look up a registered function class.
    #
    # @param [Object] name the value the class returns from +function_name+
    # @return [Class, nil] the registered class, or nil when nothing has been
    #   registered under that name
    def self.[](name)
      @functions && @functions[name]
    end

    # Subclasses must define this.
    #
    # @raise [NotImplementedError] always, in the abstract base class
    def self.function_name
      raise NotImplementedError
    end

    # Subclasses can define this to require a specific number of arguments
    # of a certain class. To require two parameters of either String or
    # Integer, do something like this:
    #
    #   PARAMETERS = [[String, Integer], [String, Integer]].freeze
    #   def self.parameters
    #     PARAMETERS
    #   end
    #
    # @return [Array<Array<Class>>, nil] accepted classes per argument
    #   position, or nil to skip argument checking entirely
    def self.parameters
      nil
    end

    # True when +klass+ has a public or protected +ruby_type+ whose
    # implementation comes from somewhere other than this abstract class.
    def self.concrete_ruby_type?(klass)
      klass.method_defined?(:ruby_type) &&
        klass.instance_method(:ruby_type).owner != Function
    end
    private_class_method :concrete_ruby_type?

    # Dataset of the first Data argument that reports one; nil otherwise.
    attr_reader :dataset

    # @param [Linkage::Field, Object] args Function arguments
    # @raise [ArgumentError] if the argument count does not match
    #   +self.class.parameters+, or non-static arguments come from
    #   different datasets
    # @raise [TypeError] if an argument's type is not accepted at its position
    def initialize(*args)
      @names = [self.class.function_name]
      @args = args
      @dataset = nil
      process_args
    end

    # Name built by joining the function name and the name of every argument
    # with underscores.
    #
    # @return [Symbol]
    def name
      @name ||= @names.join("_").to_sym
    end

    # @return [Boolean] true unless some Data argument reports itself as
    #   non-static
    def static?
      @static
    end

    # Subclasses must define this. The return value should be a Hash with
    # the following elements:
    #   :type - column type (Ruby class) of the result
    #   :opts - Optional hash with additional options (like :size)
    #
    # @raise [NotImplementedError] always, in the abstract base class
    def ruby_type
      raise NotImplementedError
    end

    # @return [Sequel::SQL::Function]
    def to_expr
      self.class.function_name.to_sym.sql_function(*@values)
    end

    private

    # Validate the constructor arguments against +self.class.parameters+ and
    # derive @names, @static, @dataset and @values (the arguments in the form
    # handed to the SQL function call) from them.
    def process_args
      parameters = self.class.parameters
      if parameters && parameters.length != @args.length
        raise ArgumentError, "wrong number of arguments (#{@args.length} for #{parameters.length})"
      end

      @static = true
      @values = []
      @args.each_with_index do |arg, i|
        if arg.kind_of?(Data)
          @names << arg.name
          @static &&= arg.static?
          # Static arguments are exempt from the same-dataset requirement.
          if @dataset && !arg.static? && @dataset != arg.dataset
            raise ArgumentError, "You cannot supply fields from different datasets as arguments to the same function"
          end
          @dataset ||= arg.dataset
          type = arg.ruby_type[:type]
          # Fields are referenced by name; any other Data by its expression.
          value = arg.is_a?(Field) ? arg.name : arg.to_expr
        else
          # Literal argument: strip non-word characters so it can be part of
          # the generated name.
          @names << arg.to_s.gsub(/\W/, "")
          type = arg.class
          value = arg
        end

        if parameters && !parameters[i].include?(type)
          raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
        end
        @values << value
      end
    end
  end
end
105
+
106
# Require every Ruby file found in the "functions" directory that sits next
# to this file.
functions_dir = File.join(File.dirname(__FILE__), "functions")
Dir.glob(File.expand_path(File.join(functions_dir, "*.rb"))).each { |filename| require filename }
@@ -0,0 +1,22 @@
1
module Linkage
  module Functions
    # The "trim" SQL function. Takes exactly one String argument.
    class Trim < Function
      class << self
        # @return [String] name under which the function is registered
        def function_name
          "trim"
        end

        # @return [Array<Array<Class>>] one argument, which must be a String
        def parameters
          [[String]]
        end
      end

      # The result has the same type description as the argument when the
      # argument is a Data object; a literal argument yields a plain String.
      #
      # @return [Hash]
      def ruby_type
        argument = @args[0]
        return argument.ruby_type if argument.kind_of?(Data)

        {:type => String}
      end
    end
    Function.register(Trim)
  end
end
@@ -0,0 +1,5 @@
1
module Linkage
  # Placeholder namespace; nothing is implemented here yet.
  module Warnings
    # TODO :)
  end
end
data/lib/linkage.rb CHANGED
@@ -6,10 +6,13 @@ end
6
6
 
7
7
  path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
8
8
  require path + 'utils'
9
+ require path + 'warnings'
9
10
  require path + 'dataset'
10
11
  require path + 'runner'
11
12
  require path + 'expectation'
13
+ require path + 'data'
12
14
  require path + 'field'
15
+ require path + 'function'
13
16
  require path + 'group'
14
17
  require path + 'import_buffer'
15
18
  require path + 'configuration'
data/linkage.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "linkage"
8
- s.version = "0.0.1"
8
+ s.version = "0.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Jeremy Stephens"]
12
- s.date = "2011-10-25"
12
+ s.date = "2011-12-06"
13
13
  s.description = "Wraps Sequel to perform record linkage between one or two datasets"
14
14
  s.email = "jeremy.f.stephens@vanderbilt.edu"
15
15
  s.extra_rdoc_files = [
@@ -28,34 +28,42 @@ Gem::Specification.new do |s|
28
28
  "VERSION",
29
29
  "lib/linkage.rb",
30
30
  "lib/linkage/configuration.rb",
31
+ "lib/linkage/data.rb",
31
32
  "lib/linkage/dataset.rb",
32
33
  "lib/linkage/expectation.rb",
33
34
  "lib/linkage/field.rb",
35
+ "lib/linkage/function.rb",
36
+ "lib/linkage/functions/trim.rb",
34
37
  "lib/linkage/group.rb",
35
38
  "lib/linkage/import_buffer.rb",
36
39
  "lib/linkage/runner.rb",
37
40
  "lib/linkage/runner/single_threaded.rb",
38
41
  "lib/linkage/utils.rb",
42
+ "lib/linkage/warnings.rb",
39
43
  "linkage.gemspec",
44
+ "test/config.yml",
40
45
  "test/helper.rb",
41
46
  "test/integration/test_cross_linkage.rb",
42
47
  "test/integration/test_dual_linkage.rb",
43
48
  "test/integration/test_self_linkage.rb",
49
+ "test/unit/functions/test_trim.rb",
50
+ "test/unit/runner/test_single_threaded.rb",
44
51
  "test/unit/test_configuration.rb",
52
+ "test/unit/test_data.rb",
45
53
  "test/unit/test_dataset.rb",
46
54
  "test/unit/test_expectation.rb",
47
55
  "test/unit/test_field.rb",
56
+ "test/unit/test_function.rb",
48
57
  "test/unit/test_group.rb",
49
58
  "test/unit/test_import_buffer.rb",
50
59
  "test/unit/test_linkage.rb",
51
60
  "test/unit/test_runner.rb",
52
- "test/unit/test_single_threaded_runner.rb",
53
61
  "test/unit/test_utils.rb"
54
62
  ]
55
63
  s.homepage = "http://github.com/coupler/linkage"
56
64
  s.licenses = ["MIT"]
57
65
  s.require_paths = ["lib"]
58
- s.rubygems_version = "1.8.10"
66
+ s.rubygems_version = "1.8.11"
59
67
  s.summary = "Sequel-based record linkage"
60
68
 
61
69
  if s.respond_to? :specification_version then
@@ -74,6 +82,10 @@ Gem::Specification.new do |s|
74
82
  s.add_development_dependency(%q<rake>, [">= 0"])
75
83
  s.add_development_dependency(%q<versionomy>, [">= 0"])
76
84
  s.add_development_dependency(%q<guard-yard>, [">= 0"])
85
+ s.add_development_dependency(%q<rb-inotify>, [">= 0"])
86
+ s.add_development_dependency(%q<mysql2>, [">= 0"])
87
+ s.add_development_dependency(%q<pry>, [">= 0"])
88
+ s.add_development_dependency(%q<rdiscount>, [">= 0"])
77
89
  else
78
90
  s.add_dependency(%q<sequel>, [">= 0"])
79
91
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
@@ -87,6 +99,10 @@ Gem::Specification.new do |s|
87
99
  s.add_dependency(%q<rake>, [">= 0"])
88
100
  s.add_dependency(%q<versionomy>, [">= 0"])
89
101
  s.add_dependency(%q<guard-yard>, [">= 0"])
102
+ s.add_dependency(%q<rb-inotify>, [">= 0"])
103
+ s.add_dependency(%q<mysql2>, [">= 0"])
104
+ s.add_dependency(%q<pry>, [">= 0"])
105
+ s.add_dependency(%q<rdiscount>, [">= 0"])
90
106
  end
91
107
  else
92
108
  s.add_dependency(%q<sequel>, [">= 0"])
@@ -101,6 +117,10 @@ Gem::Specification.new do |s|
101
117
  s.add_dependency(%q<rake>, [">= 0"])
102
118
  s.add_dependency(%q<versionomy>, [">= 0"])
103
119
  s.add_dependency(%q<guard-yard>, [">= 0"])
120
+ s.add_dependency(%q<rb-inotify>, [">= 0"])
121
+ s.add_dependency(%q<mysql2>, [">= 0"])
122
+ s.add_dependency(%q<pry>, [">= 0"])
123
+ s.add_dependency(%q<rdiscount>, [">= 0"])
104
124
  end
105
125
  end
106
126
 
data/test/config.yml ADDED
@@ -0,0 +1,5 @@
1
+ mysql:
2
+   host: localhost
3
+   port: 3306
4
+   database: test
5
+   user: test
data/test/helper.rb CHANGED
@@ -13,6 +13,7 @@ require 'tmpdir'
13
13
  require 'logger'
14
14
  require 'pp'
15
15
  require 'versionomy'
16
+ require 'pry'
16
17
 
17
18
  $LOAD_PATH.unshift(File.dirname(__FILE__))
18
19
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
@@ -21,15 +22,35 @@ require 'linkage'
21
22
  class Test::Unit::TestCase
22
23
  def stub_field(name, options = {}, &block)
23
24
  f = Linkage::Field.allocate
25
+ f.stubs({:static? => false}.merge(options))
26
+ if block
27
+ f.send(:instance_eval, &block)
28
+ end
29
+ f
30
+ end
31
+
32
+ def stub_function(name, options = {}, &block)
33
+ f = Linkage::Function.allocate
24
34
  f.stubs(options)
25
35
  if block
26
36
  f.send(:instance_eval, &block)
27
37
  end
28
- f.stubs(:is_a?).returns(false)
29
- f.stubs(:is_a?).with(Linkage::Field).returns(true)
30
38
  f
31
39
  end
32
40
 
41
# Build an anonymous Linkage::Function subclass for use in tests.
#
# @param name value returned by the new class's function_name
# @param ruby_type value returned by instances' ruby_type; the method is
#   left undefined when this is nil or false
# @param params value returned by the new class's parameters; the method is
#   left undefined when this is nil or false
# @return [Class] the new subclass (the block argument is currently unused)
def new_function(name, ruby_type = nil, params = nil, &block)
  Class.new(Linkage::Function).tap do |klass|
    klass.send(:define_singleton_method, :function_name) { name }
    klass.send(:define_method, :ruby_type) { ruby_type } if ruby_type
    klass.send(:define_singleton_method, :parameters) { params } if params
  end
end
52
+
53
+
33
54
  def self.current_ruby_version
34
55
  @current_ruby_version ||= Versionomy.parse(RUBY_VERSION)
35
56
  end
@@ -37,6 +58,10 @@ class Test::Unit::TestCase
37
58
  def self.ruby19
38
59
  @ruby19 ||= Versionomy.parse("1.9")
39
60
  end
61
+
62
# Parsed contents of config.yml from this file's directory, loaded on first
# use and cached on the test instance.
def test_config
  @test_config ||= begin
    config_path = File.join(File.dirname(__FILE__), "config.yml")
    YAML.load_file(config_path)
  end
end
40
65
  end
41
66
 
42
67
  module UnitTests; end
@@ -81,5 +81,38 @@ module IntegrationTests
81
81
  end
82
82
  end
83
83
  end
84
+
85
test "handles MySQL's ignorance of trailing spaces when comparing strings" do
  # NOTE(review): pend is called unconditionally, so presumably nothing
  # below runs yet -- confirm against the test framework in use.
  pend
  omission("No MySQL test configuration found") unless test_config['mysql']

  uri = "mysql2://%s:%s/%s?user=%s" % test_config['mysql'].values_at('host', 'port', 'database', 'user')

  # Row 3 differs from rows 1, 2 and 4 only by trailing spaces.
  foo_rows = [[1, "", "test"], [2, "", "test"], [3, " ", "test "], [4, "", "test"], [5, "", "junk"]]
  bar_rows = [[1, "", "junk"]]
  columns = [:id, :one, :two]

  Sequel.connect(uri) do |db|
    db.create_table!(:foo) { primary_key(:id); String(:one); String(:two) }
    db[:foo].import(columns, foo_rows)

    db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
    db[:bar].import(columns, bar_rows)

    ["groups", "groups_records"].each do |table|
      db.run("DROP TABLE IF EXISTS #{table}")
    end
  end

  foo_dataset = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
  bar_dataset = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
  conf = foo_dataset.link_with(bar_dataset) do
    lhs[:one].must == rhs[:one]
    lhs[:two].must == rhs[:two]
  end

  runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri, :logger => Logger.new(STDERR))
  runner.execute

  Sequel.connect(@tmpuri) do |db|
    assert_equal 1, db[:groups].count
  end
end
84
117
  end
85
118
  end
@@ -205,5 +205,26 @@ module IntegrationTests
205
205
  end
206
206
  end
207
207
  end
208
+
209
test "match functions" do
  # 100 rows of "bar" followed by 0-9 trailing spaces, so every value is
  # the same once trimmed.
  rows = (0...100).map { |i| [i, "bar%s" % (" " * (i % 10))] }
  database do |db|
    db.create_table(:foo) { primary_key(:id); String(:bar) }
    db[:foo].import([:id, :bar], rows)
  end

  dataset = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
  conf = dataset.link_with(dataset) do
    trim(lhs[:bar]).must == trim(rhs[:bar])
  end
  assert_equal :self, conf.linkage_type

  Linkage::SingleThreadedRunner.new(conf, @tmpuri).execute

  database do |db|
    assert_equal 1, db[:groups].count
  end
end
208
229
  end
209
230
  end
@@ -0,0 +1,36 @@
1
+ require 'helper'
2
+
3
# Unit tests for Linkage::Functions::Trim: its place in the class hierarchy,
# the type description it reports for each kind of argument, and its
# registration with Linkage::Function.
class UnitTests::TestTrim < Test::Unit::TestCase
  test "subclass of Function" do
    assert_equal Linkage::Function, Linkage::Functions::Trim.superclass
  end

  test "ruby_type for string literal" do
    assert_equal({:type => String}, Linkage::Functions::Trim.new("foo").ruby_type)
  end

  # The argument's type description is passed through, options included.
  test "ruby_type for string field" do
    field_1 = stub_field('field 1', :name => :bar, :ruby_type => {:type => String})
    assert_equal({:type => String}, Linkage::Functions::Trim.new(field_1).ruby_type)

    field_2 = stub_field('field 2', :name => :bar, :ruby_type => {:type => String, :opts => {:size => 123}})
    assert_equal({:type => String, :opts => {:size => 123}}, Linkage::Functions::Trim.new(field_2).ruby_type)
  end

  test "ruby_type for string function" do
    func = new_function('foo', {:type => String, :opts => {:junk => '123'}})
    assert_equal({:type => String, :opts => {:junk => '123'}}, Linkage::Functions::Trim.new(func.new).ruby_type)
  end

  test "parameters" do
    assert_equal [[String]], Linkage::Functions::Trim.parameters
  end

  test "name" do
    assert_equal "trim", Linkage::Functions::Trim.function_name
  end

  # Expected value goes first so a failure message reads the right way round.
  test "registers itself" do
    assert_equal Linkage::Functions::Trim, Linkage::Function["trim"]
  end
end
@@ -23,7 +23,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
23
23
  end
24
24
 
25
25
  test "linkage_type is cross when there's different filters on both sides" do
26
- field = stub('field')
26
+ field = stub_field('field')
27
27
  dataset = stub('dataset', :set_new_id => nil)
28
28
  dataset.stubs(:fields).returns({:foo => field})
29
29
  c = Linkage::Configuration.new(dataset, dataset)
@@ -39,7 +39,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
39
39
  end
40
40
 
41
41
  test "linkage_type is self when there's identical static filters on each side" do
42
- field = stub('field')
42
+ field = stub_field('field')
43
43
  dataset = stub('dataset', :set_new_id => nil)
44
44
  dataset.stubs(:fields).returns({:foo => field})
45
45
  c = Linkage::Configuration.new(dataset, dataset)
@@ -53,8 +53,8 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
53
53
  end
54
54
 
55
55
  test "linkage_type is self when there's a two-field filter on one side" do
56
- field_1 = stub('field 1')
57
- field_2 = stub('field 2')
56
+ field_1 = stub_field('field 1')
57
+ field_2 = stub_field('field 2')
58
58
  dataset = stub('dataset', :set_new_id => nil)
59
59
  dataset.stubs(:fields).returns({:foo => field_1, :bar => field_2})
60
60
  c = Linkage::Configuration.new(dataset, dataset)
@@ -71,7 +71,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
71
71
 
72
72
  test "static expectation" do
73
73
  dataset_1 = stub('dataset')
74
- field = stub('field')
74
+ field = stub_field('field')
75
75
  dataset_1.stubs(:fields).returns({:foo => field})
76
76
  dataset_2 = stub('dataset')
77
77
  c = Linkage::Configuration.new(dataset_1, dataset_2)
@@ -84,7 +84,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
84
84
  ## Maybe in the future
85
85
  #test "static expectation, flopped" do
86
86
  #dataset_1 = stub('dataset')
87
- #field = stub('field')
87
+ #field = stub_field('field')
88
88
  #dataset_1.stubs(:fields).returns({:foo => field})
89
89
  #dataset_2 = stub('dataset')
90
90
  #c = Linkage::Configuration.new(dataset_1, dataset_2)
@@ -142,4 +142,40 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
142
142
  lhs[:foo].must_not == 123
143
143
  end
144
144
  end
145
+
146
# A non-static function on the left-hand side must produce a MustExpectation
# whose last argument is nil.
test "dynamic database function" do
  lhs_field = stub_field('field 1')
  lhs_dataset = stub('dataset')
  lhs_dataset.stubs(:fields).returns({:foo => lhs_field})

  rhs_field = stub_field('field 2')
  rhs_dataset = stub('dataset')
  rhs_dataset.stubs(:fields).returns({:foo => rhs_field})

  func = stub_function('function', :static? => false)
  Linkage::Functions::Trim.expects(:new).with(lhs_field).returns(func)

  config = Linkage::Configuration.new(lhs_dataset, rhs_dataset)
  Linkage::MustExpectation.expects(:new).with(:==, func, rhs_field, nil)
  config.send(:instance_eval) do
    trim(lhs[:foo]).must == rhs[:foo]
  end
end
163
+
164
# A static function on the right-hand side must produce a MustExpectation
# whose last argument is :filter.
test "static database function" do
  lhs_field = stub_field('field 1')
  lhs_dataset = stub('dataset')
  lhs_dataset.stubs(:fields).returns({:foo => lhs_field})

  rhs_field = stub_field('field 2')
  rhs_dataset = stub('dataset')
  rhs_dataset.stubs(:fields).returns({:foo => rhs_field})

  func = stub_function('function', :static? => true)
  Linkage::Functions::Trim.expects(:new).with("foo").returns(func)

  config = Linkage::Configuration.new(lhs_dataset, rhs_dataset)
  Linkage::MustExpectation.expects(:new).with(:==, lhs_field, func, :filter)
  config.send(:instance_eval) do
    lhs[:foo].must == trim("foo")
  end
end
145
181
  end