linkage 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ module Linkage
2
+ # Abstract class to represent SQL functions. No attempts are made to
3
+ # ensure that the function actually exists in the database you're using.
4
+ #
5
+ # @abstract
6
+ class Function < Data
7
+ # Register a new function.
8
+ #
9
+ # @param [Class] klass Function class (probably a subclass of {Function})
10
+ def self.register(klass)
11
+ if klass.instance_methods(false).any? { |m| m.to_s == "ruby_type" }
12
+ @functions ||= {}
13
+ @functions[klass.function_name] = klass
14
+ else
15
+ raise ArgumentError, "ruby_type instance method must be defined"
16
+ end
17
+ end
18
+
19
+ def self.[](name)
20
+ @functions ? @functions[name] : nil
21
+ end
22
+
23
+ # Subclasses must define this.
24
+ def self.function_name
25
+ raise NotImplementedError
26
+ end
27
+
28
+ # Subclasses can define this to require a specific number of arguments
29
+ # of a certain class. To require two parameters of either String or
30
+ # Integer, do something like this:
31
+ #
32
+ # @@parameters = [[String, Integer], [String, Integer]]
33
+ # def self.parameters
34
+ # @@parameters
35
+ # end
36
+ #
37
+ def self.parameters
38
+ nil
39
+ end
40
+
41
+ attr_reader :dataset
42
+
43
+ # @param [Linkage::Field, Object] args Function arguments
44
+ def initialize(*args)
45
+ @names = [self.class.function_name]
46
+ @args = args
47
+ @dataset = nil
48
+ process_args
49
+ end
50
+
51
+ def name
52
+ @name ||= @names.join("_").to_sym
53
+ end
54
+
55
+ def static?
56
+ @static
57
+ end
58
+
59
+ # Subclasses must define this. The return value should be a Hash with
60
+ # the following elements:
61
+ # :type - column type (Ruby class) of the result
62
+ # :opts - Optional hash with additional options (like :size)
63
+ def ruby_type
64
+ raise NotImplementedError
65
+ end
66
+
67
+ # @return [Sequel::SQL::Function]
68
+ def to_expr
69
+ self.class.function_name.to_sym.sql_function(*@values)
70
+ end
71
+
72
+ private
73
+
74
+ def process_args
75
+ parameters = self.class.parameters
76
+ if parameters && parameters.length != @args.length
77
+ raise ArgumentError, "wrong number of arguments (#{@args.length} for #{parameters.length})"
78
+ end
79
+
80
+ @static = true
81
+ @values = []
82
+ @args.each_with_index do |arg, i|
83
+ if arg.kind_of?(Data)
84
+ @names << arg.name
85
+ @static &&= arg.static?
86
+ if @dataset && !arg.static? && @dataset != arg.dataset
87
+ raise ArgumentError, "You cannot supply fields from different datasets as arguments to the same function)"
88
+ end
89
+ @dataset ||= arg.dataset
90
+ type = arg.ruby_type[:type]
91
+ value = arg.is_a?(Field) ? arg.name : arg.to_expr
92
+ else
93
+ @names << arg.to_s.gsub(/\W/, "")
94
+ type = arg.class
95
+ value = arg
96
+ end
97
+ if parameters && !parameters[i].include?(type)
98
+ raise TypeError, "expected type #{parameters[i].join(" or ")}, got #{type}"
99
+ end
100
+ @values << value
101
+ end
102
+ end
103
+ end
104
+ end
105
+
106
+ Dir.glob(File.expand_path(File.join(File.dirname(__FILE__), "functions", "*.rb"))).each do |filename|
107
+ require filename
108
+ end
@@ -0,0 +1,22 @@
1
+ module Linkage
2
+ module Functions
3
+ class Trim < Function
4
+ def self.function_name
5
+ "trim"
6
+ end
7
+
8
+ def self.parameters
9
+ [[String]]
10
+ end
11
+
12
+ def ruby_type
13
+ if @args[0].kind_of?(Data)
14
+ @args[0].ruby_type
15
+ else
16
+ {:type => String}
17
+ end
18
+ end
19
+ end
20
+ Function.register(Trim)
21
+ end
22
+ end
@@ -0,0 +1,5 @@
1
+ module Linkage
2
+ module Warnings
3
+ # TODO :)
4
+ end
5
+ end
data/lib/linkage.rb CHANGED
@@ -6,10 +6,13 @@ end
6
6
 
7
7
  path = Pathname.new(File.expand_path(File.dirname(__FILE__))) + 'linkage'
8
8
  require path + 'utils'
9
+ require path + 'warnings'
9
10
  require path + 'dataset'
10
11
  require path + 'runner'
11
12
  require path + 'expectation'
13
+ require path + 'data'
12
14
  require path + 'field'
15
+ require path + 'function'
13
16
  require path + 'group'
14
17
  require path + 'import_buffer'
15
18
  require path + 'configuration'
data/linkage.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "linkage"
8
- s.version = "0.0.1"
8
+ s.version = "0.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Jeremy Stephens"]
12
- s.date = "2011-10-25"
12
+ s.date = "2011-12-06"
13
13
  s.description = "Wraps Sequel to perform record linkage between one or two datasets"
14
14
  s.email = "jeremy.f.stephens@vanderbilt.edu"
15
15
  s.extra_rdoc_files = [
@@ -28,34 +28,42 @@ Gem::Specification.new do |s|
28
28
  "VERSION",
29
29
  "lib/linkage.rb",
30
30
  "lib/linkage/configuration.rb",
31
+ "lib/linkage/data.rb",
31
32
  "lib/linkage/dataset.rb",
32
33
  "lib/linkage/expectation.rb",
33
34
  "lib/linkage/field.rb",
35
+ "lib/linkage/function.rb",
36
+ "lib/linkage/functions/trim.rb",
34
37
  "lib/linkage/group.rb",
35
38
  "lib/linkage/import_buffer.rb",
36
39
  "lib/linkage/runner.rb",
37
40
  "lib/linkage/runner/single_threaded.rb",
38
41
  "lib/linkage/utils.rb",
42
+ "lib/linkage/warnings.rb",
39
43
  "linkage.gemspec",
44
+ "test/config.yml",
40
45
  "test/helper.rb",
41
46
  "test/integration/test_cross_linkage.rb",
42
47
  "test/integration/test_dual_linkage.rb",
43
48
  "test/integration/test_self_linkage.rb",
49
+ "test/unit/functions/test_trim.rb",
50
+ "test/unit/runner/test_single_threaded.rb",
44
51
  "test/unit/test_configuration.rb",
52
+ "test/unit/test_data.rb",
45
53
  "test/unit/test_dataset.rb",
46
54
  "test/unit/test_expectation.rb",
47
55
  "test/unit/test_field.rb",
56
+ "test/unit/test_function.rb",
48
57
  "test/unit/test_group.rb",
49
58
  "test/unit/test_import_buffer.rb",
50
59
  "test/unit/test_linkage.rb",
51
60
  "test/unit/test_runner.rb",
52
- "test/unit/test_single_threaded_runner.rb",
53
61
  "test/unit/test_utils.rb"
54
62
  ]
55
63
  s.homepage = "http://github.com/coupler/linkage"
56
64
  s.licenses = ["MIT"]
57
65
  s.require_paths = ["lib"]
58
- s.rubygems_version = "1.8.10"
66
+ s.rubygems_version = "1.8.11"
59
67
  s.summary = "Sequel-based record linkage"
60
68
 
61
69
  if s.respond_to? :specification_version then
@@ -74,6 +82,10 @@ Gem::Specification.new do |s|
74
82
  s.add_development_dependency(%q<rake>, [">= 0"])
75
83
  s.add_development_dependency(%q<versionomy>, [">= 0"])
76
84
  s.add_development_dependency(%q<guard-yard>, [">= 0"])
85
+ s.add_development_dependency(%q<rb-inotify>, [">= 0"])
86
+ s.add_development_dependency(%q<mysql2>, [">= 0"])
87
+ s.add_development_dependency(%q<pry>, [">= 0"])
88
+ s.add_development_dependency(%q<rdiscount>, [">= 0"])
77
89
  else
78
90
  s.add_dependency(%q<sequel>, [">= 0"])
79
91
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
@@ -87,6 +99,10 @@ Gem::Specification.new do |s|
87
99
  s.add_dependency(%q<rake>, [">= 0"])
88
100
  s.add_dependency(%q<versionomy>, [">= 0"])
89
101
  s.add_dependency(%q<guard-yard>, [">= 0"])
102
+ s.add_dependency(%q<rb-inotify>, [">= 0"])
103
+ s.add_dependency(%q<mysql2>, [">= 0"])
104
+ s.add_dependency(%q<pry>, [">= 0"])
105
+ s.add_dependency(%q<rdiscount>, [">= 0"])
90
106
  end
91
107
  else
92
108
  s.add_dependency(%q<sequel>, [">= 0"])
@@ -101,6 +117,10 @@ Gem::Specification.new do |s|
101
117
  s.add_dependency(%q<rake>, [">= 0"])
102
118
  s.add_dependency(%q<versionomy>, [">= 0"])
103
119
  s.add_dependency(%q<guard-yard>, [">= 0"])
120
+ s.add_dependency(%q<rb-inotify>, [">= 0"])
121
+ s.add_dependency(%q<mysql2>, [">= 0"])
122
+ s.add_dependency(%q<pry>, [">= 0"])
123
+ s.add_dependency(%q<rdiscount>, [">= 0"])
104
124
  end
105
125
  end
106
126
 
data/test/config.yml ADDED
@@ -0,0 +1,5 @@
1
+ mysql:
2
+ host: localhost
3
+ port: 3306
4
+ database: test
5
+ user: test
data/test/helper.rb CHANGED
@@ -13,6 +13,7 @@ require 'tmpdir'
13
13
  require 'logger'
14
14
  require 'pp'
15
15
  require 'versionomy'
16
+ require 'pry'
16
17
 
17
18
  $LOAD_PATH.unshift(File.dirname(__FILE__))
18
19
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
@@ -21,15 +22,35 @@ require 'linkage'
21
22
  class Test::Unit::TestCase
22
23
  def stub_field(name, options = {}, &block)
23
24
  f = Linkage::Field.allocate
25
+ f.stubs({:static? => false}.merge(options))
26
+ if block
27
+ f.send(:instance_eval, &block)
28
+ end
29
+ f
30
+ end
31
+
32
+ def stub_function(name, options = {}, &block)
33
+ f = Linkage::Function.allocate
24
34
  f.stubs(options)
25
35
  if block
26
36
  f.send(:instance_eval, &block)
27
37
  end
28
- f.stubs(:is_a?).returns(false)
29
- f.stubs(:is_a?).with(Linkage::Field).returns(true)
30
38
  f
31
39
  end
32
40
 
41
+ def new_function(name, ruby_type = nil, params = nil, &block)
42
+ klass = Class.new(Linkage::Function)
43
+ klass.send(:define_singleton_method, :function_name) { name }
44
+ if ruby_type
45
+ klass.send(:define_method, :ruby_type) { ruby_type }
46
+ end
47
+ if params
48
+ klass.send(:define_singleton_method, :parameters) { params }
49
+ end
50
+ klass
51
+ end
52
+
53
+
33
54
  def self.current_ruby_version
34
55
  @current_ruby_version ||= Versionomy.parse(RUBY_VERSION)
35
56
  end
@@ -37,6 +58,10 @@ class Test::Unit::TestCase
37
58
  def self.ruby19
38
59
  @ruby19 ||= Versionomy.parse("1.9")
39
60
  end
61
+
62
+ def test_config
63
+ @test_config ||= YAML.load_file(File.join(File.dirname(__FILE__), "config.yml"))
64
+ end
40
65
  end
41
66
 
42
67
  module UnitTests; end
@@ -81,5 +81,38 @@ module IntegrationTests
81
81
  end
82
82
  end
83
83
  end
84
+
85
+ test "handles MySQL's ignorance of trailing spaces when comparing strings" do
86
+ pend
87
+ if !test_config['mysql']
88
+ omission("No MySQL test configuration found")
89
+ end
90
+ uri = "mysql2://%s:%s/%s?user=%s" % test_config['mysql'].values_at('host', 'port', 'database', 'user')
91
+ Sequel.connect(uri) do |db|
92
+ db.create_table!(:foo) { primary_key(:id); String(:one); String(:two) }
93
+ db[:foo].import([:id, :one, :two], [[1, "", "test"], [2, "", "test"], [3, " ", "test "], [4, "", "test"], [5, "", "junk"]])
94
+
95
+ db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
96
+ db[:bar].import([:id, :one, :two], [[1, "", "junk"]])
97
+
98
+ db.run("DROP TABLE IF EXISTS groups")
99
+ db.run("DROP TABLE IF EXISTS groups_records")
100
+ end
101
+
102
+ ds_1 = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
103
+ ds_2 = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
104
+ conf = ds_1.link_with(ds_2) do
105
+ lhs[:one].must == rhs[:one]
106
+ lhs[:two].must == rhs[:two]
107
+ end
108
+
109
+ logger = Logger.new(STDERR)
110
+ runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri, :logger => logger)
111
+ runner.execute
112
+
113
+ Sequel.connect(@tmpuri) do |db|
114
+ assert_equal 1, db[:groups].count
115
+ end
116
+ end
84
117
  end
85
118
  end
@@ -205,5 +205,26 @@ module IntegrationTests
205
205
  end
206
206
  end
207
207
  end
208
+
209
+ test "match functions" do
210
+ # insert the test data
211
+ database do |db|
212
+ db.create_table(:foo) { primary_key(:id); String(:bar) }
213
+ db[:foo].import([:id, :bar],
214
+ Array.new(100) { |i| [i, "bar%s" % (" " * (i % 10))] })
215
+ end
216
+
217
+ ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
218
+ conf = ds.link_with(ds) do
219
+ trim(lhs[:bar]).must == trim(rhs[:bar])
220
+ end
221
+ assert_equal :self, conf.linkage_type
222
+ runner = Linkage::SingleThreadedRunner.new(conf, @tmpuri)
223
+ runner.execute
224
+
225
+ database do |db|
226
+ assert_equal 1, db[:groups].count
227
+ end
228
+ end
208
229
  end
209
230
  end
@@ -0,0 +1,36 @@
1
+ require 'helper'
2
+
3
+ class UnitTests::TestTrim < Test::Unit::TestCase
4
+ test "subclass of Function" do
5
+ assert_equal Linkage::Function, Linkage::Functions::Trim.superclass
6
+ end
7
+
8
+ test "ruby_type for string literal" do
9
+ assert_equal({:type => String}, Linkage::Functions::Trim.new("foo").ruby_type)
10
+ end
11
+
12
+ test "ruby_type for string field" do
13
+ field_1 = stub_field('field 1', :name => :bar, :ruby_type => {:type => String})
14
+ assert_equal({:type => String}, Linkage::Functions::Trim.new(field_1).ruby_type)
15
+
16
+ field_2 = stub_field('field 2', :name => :bar, :ruby_type => {:type => String, :opts => {:size => 123}})
17
+ assert_equal({:type => String, :opts => {:size => 123}}, Linkage::Functions::Trim.new(field_2).ruby_type)
18
+ end
19
+
20
+ test "ruby_type for string function" do
21
+ func = new_function('foo', {:type => String, :opts => {:junk => '123'}})
22
+ assert_equal({:type => String, :opts => {:junk => '123'}}, Linkage::Functions::Trim.new(func.new).ruby_type)
23
+ end
24
+
25
+ test "parameters" do
26
+ assert_equal [[String]], Linkage::Functions::Trim.parameters
27
+ end
28
+
29
+ test "name" do
30
+ assert_equal "trim", Linkage::Functions::Trim.function_name
31
+ end
32
+
33
+ test "registers itself" do
34
+ assert_equal Linkage::Function["trim"], Linkage::Functions::Trim
35
+ end
36
+ end
@@ -23,7 +23,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
23
23
  end
24
24
 
25
25
  test "linkage_type is cross when there's different filters on both sides" do
26
- field = stub('field')
26
+ field = stub_field('field')
27
27
  dataset = stub('dataset', :set_new_id => nil)
28
28
  dataset.stubs(:fields).returns({:foo => field})
29
29
  c = Linkage::Configuration.new(dataset, dataset)
@@ -39,7 +39,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
39
39
  end
40
40
 
41
41
  test "linkage_type is self when there's identical static filters on each side" do
42
- field = stub('field')
42
+ field = stub_field('field')
43
43
  dataset = stub('dataset', :set_new_id => nil)
44
44
  dataset.stubs(:fields).returns({:foo => field})
45
45
  c = Linkage::Configuration.new(dataset, dataset)
@@ -53,8 +53,8 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
53
53
  end
54
54
 
55
55
  test "linkage_type is self when there's a two-field filter on one side" do
56
- field_1 = stub('field 1')
57
- field_2 = stub('field 2')
56
+ field_1 = stub_field('field 1')
57
+ field_2 = stub_field('field 2')
58
58
  dataset = stub('dataset', :set_new_id => nil)
59
59
  dataset.stubs(:fields).returns({:foo => field_1, :bar => field_2})
60
60
  c = Linkage::Configuration.new(dataset, dataset)
@@ -71,7 +71,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
71
71
 
72
72
  test "static expectation" do
73
73
  dataset_1 = stub('dataset')
74
- field = stub('field')
74
+ field = stub_field('field')
75
75
  dataset_1.stubs(:fields).returns({:foo => field})
76
76
  dataset_2 = stub('dataset')
77
77
  c = Linkage::Configuration.new(dataset_1, dataset_2)
@@ -84,7 +84,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
84
84
  ## Maybe in the future
85
85
  #test "static expectation, flopped" do
86
86
  #dataset_1 = stub('dataset')
87
- #field = stub('field')
87
+ #field = stub_field('field')
88
88
  #dataset_1.stubs(:fields).returns({:foo => field})
89
89
  #dataset_2 = stub('dataset')
90
90
  #c = Linkage::Configuration.new(dataset_1, dataset_2)
@@ -142,4 +142,40 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
142
142
  lhs[:foo].must_not == 123
143
143
  end
144
144
  end
145
+
146
+ test "dynamic database function" do
147
+ dataset_1 = stub('dataset')
148
+ field_1 = stub_field('field 1')
149
+ dataset_1.stubs(:fields).returns({:foo => field_1})
150
+ dataset_2 = stub('dataset')
151
+ field_2 = stub_field('field 2')
152
+ dataset_2.stubs(:fields).returns({:foo => field_2})
153
+
154
+ func = stub_function('function', :static? => false)
155
+ Linkage::Functions::Trim.expects(:new).with(field_1).returns(func)
156
+
157
+ c = Linkage::Configuration.new(dataset_1, dataset_2)
158
+ Linkage::MustExpectation.expects(:new).with(:==, func, field_2, nil)
159
+ c.send(:instance_eval) do
160
+ trim(lhs[:foo]).must == rhs[:foo]
161
+ end
162
+ end
163
+
164
+ test "static database function" do
165
+ dataset_1 = stub('dataset')
166
+ field_1 = stub_field('field 1')
167
+ dataset_1.stubs(:fields).returns({:foo => field_1})
168
+ dataset_2 = stub('dataset')
169
+ field_2 = stub_field('field 2')
170
+ dataset_2.stubs(:fields).returns({:foo => field_2})
171
+
172
+ func = stub_function('function', :static? => true)
173
+ Linkage::Functions::Trim.expects(:new).with("foo").returns(func)
174
+
175
+ c = Linkage::Configuration.new(dataset_1, dataset_2)
176
+ Linkage::MustExpectation.expects(:new).with(:==, field_1, func, :filter)
177
+ c.send(:instance_eval) do
178
+ lhs[:foo].must == trim("foo")
179
+ end
180
+ end
145
181
  end