linkage 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,7 +2,7 @@ GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
4
  blockenspiel (0.4.3)
5
- coderay (1.0.5)
5
+ coderay (1.0.6)
6
6
  ffi (1.0.11)
7
7
  git (1.2.5)
8
8
  guard (1.0.1)
@@ -25,7 +25,7 @@ GEM
25
25
  mocha (0.10.5)
26
26
  metaclass (~> 0.0.1)
27
27
  mysql2 (0.3.11)
28
- pry (0.9.8.4)
28
+ pry (0.9.9)
29
29
  coderay (~> 1.0.5)
30
30
  method_source (~> 0.7.1)
31
31
  slop (>= 2.4.4, < 3)
@@ -33,12 +33,12 @@ GEM
33
33
  rdiscount (1.6.8)
34
34
  rdoc (3.12)
35
35
  json (~> 1.4)
36
- sequel (3.33.0)
36
+ sequel (3.34.1)
37
37
  slop (2.4.4)
38
- sqlite3 (1.3.5)
38
+ sqlite3 (1.3.6)
39
39
  test-unit (2.4.8)
40
40
  thor (0.14.6)
41
- versionomy (0.4.2)
41
+ versionomy (0.4.3)
42
42
  blockenspiel (>= 0.4.3)
43
43
  yard (0.7.5)
44
44
 
data/Rakefile CHANGED
@@ -48,3 +48,5 @@ require 'yard'
48
48
  YARD::Rake::YardocTask.new do |t|
49
49
  t.files = ['lib/**/*.rb']
50
50
  end
51
+
52
+ task :build => :gemspec
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.5
1
+ 0.0.6
@@ -117,12 +117,12 @@ module Linkage
117
117
  end
118
118
 
119
119
  expr = target.to_expr(side)
120
- aliased_expr = expr
120
+ aliaz = nil
121
121
  if expr != merged_field.name
122
- aliased_expr = expr.as(merged_field.name)
122
+ aliaz = merged_field.name
123
123
  end
124
124
 
125
- dataset.order_more(expr).select_more(aliased_expr)
125
+ dataset.match(expr, aliaz)
126
126
  end
127
127
 
128
128
  def same_filter?(other)
@@ -318,9 +318,9 @@ module Linkage
318
318
  def initialize(dataset_1, dataset_2)
319
319
  @dataset_1 = dataset_1
320
320
  @dataset_2 = dataset_2
321
+ @linkage_type = dataset_1 == dataset_2 ? :self : :dual
321
322
  @expectations = []
322
323
  @visual_comparisons = []
323
- @linkage_type = dataset_1 == dataset_2 ? :self : :dual
324
324
  end
325
325
 
326
326
  def configure(&block)
@@ -348,5 +348,15 @@ module Linkage
348
348
  def result_set
349
349
  @result_set ||= ResultSet.new(self)
350
350
  end
351
+
352
+ def datasets_with_applied_expectations
353
+ dataset_1 = @dataset_1
354
+ dataset_2 = @dataset_2
355
+ @expectations.each do |exp|
356
+ dataset_1 = exp.apply_to(dataset_1, :lhs)
357
+ dataset_2 = exp.apply_to(dataset_2, :rhs) if @linkage_type != :self
358
+ end
359
+ @linkage_type == :self ? [dataset_1, dataset_1] : [dataset_1, dataset_2]
360
+ end
351
361
  end
352
362
  end
@@ -1,17 +1,23 @@
1
1
  module Linkage
2
- class Dataset < Delegator
2
+ # Delegator around Sequel::Dataset with some extra functionality.
3
+ class Dataset
3
4
  attr_reader :field_set, :table_name
4
5
 
5
6
  def initialize(uri, table, options = {})
6
7
  @table_name = table.to_sym
7
8
  db = Sequel.connect(uri, options)
8
- ds = db[@table_name]
9
- super(ds)
9
+ @dataset = db[@table_name]
10
10
  @field_set = FieldSet.new(db.schema(@table_name))
11
+ @_match = []
11
12
  end
12
13
 
13
- def __setobj__(obj); @dataset = obj; end
14
- def __getobj__; @dataset; end
14
+ def obj
15
+ @dataset
16
+ end
17
+
18
+ def obj=(value)
19
+ @dataset = value
20
+ end
15
21
 
16
22
  # Setup a linkage with another dataset
17
23
  #
@@ -26,21 +32,70 @@ module Linkage
26
32
  @dataset.db.adapter_scheme
27
33
  end
28
34
 
29
- def initialize_clone(obj)
30
- new_obj = obj.instance_variable_get(:@new_obj)
35
+ def match(expr, aliaz = nil)
36
+ clone(:match => {:expr => expr, :alias => aliaz})
37
+ end
38
+
39
+ def clone(new_opts={})
40
+ new_opts = new_opts.dup
41
+ new_obj = new_opts.delete(:new_obj)
42
+
43
+ match = new_opts.delete(:match)
44
+ result = super()
45
+ result.send(:_match, match)
46
+
31
47
  if new_obj
32
- __setobj__(new_obj)
48
+ result.obj = new_obj
33
49
  else
34
- super
50
+ result.obj = obj.clone(new_opts)
51
+ end
52
+ result
53
+ end
54
+
55
+ def each_group(min = 2)
56
+ @dataset.group_and_count(*aliased_match_expressions).having{count >= min}.each do |row|
57
+ count = row.delete(:count)
58
+ yield Group.new(row, {:count => count})
59
+ end
60
+ end
61
+
62
+ def group_by_matches(aliased = false)
63
+ expr = aliased ? aliased_match_expressions : match_expressions
64
+ group(*expr)
65
+ end
66
+
67
+ def dataset_for_group(group)
68
+ filters = []
69
+ group.values.each_pair do |key, value|
70
+ # find a matched expression with this alias
71
+ m = @_match.detect { |h| h[:alias] ? h[:alias] == key : h[:expr] == key }
72
+ raise "this dataset isn't compatible with the given group" if !m
73
+ filters << {m[:expr] => value}
74
+ end
75
+ filter(*filters)
76
+ end
77
+
78
+ private
79
+
80
+ def _match(opts)
81
+ if opts
82
+ @_match += [opts]
35
83
  end
36
84
  end
37
85
 
86
+ def match_expressions
87
+ @_match.collect { |m| m[:expr] }
88
+ end
89
+
90
+ def aliased_match_expressions
91
+ @_match.collect { |m| m[:alias] ? m[:expr].as(m[:alias]) : m[:expr] }
92
+ end
93
+
38
94
  def method_missing(name, *args, &block)
39
- result = super
95
+ result = @dataset.send(name, *args, &block)
40
96
  if result.kind_of?(Sequel::Dataset)
41
- @new_obj = result
42
- result = clone
43
- @new_obj = nil
97
+ new_obj = result
98
+ result = clone(:new_obj => result)
44
99
  end
45
100
  result
46
101
  end
@@ -1,43 +1,35 @@
1
1
  module Linkage
2
- # This class represents a group of records that match based on criteria
3
- # described via the {Dataset#link_with} method. Group's are created by
4
- # subclasses of the {Runner} class during execution.
5
- #
6
- # @see Dataset#link_with
7
- # @see SingleThreadedRunner
8
2
  class Group
9
- # @return [Array<Object>] An array of this group's record ids
10
- attr_reader :records
11
-
12
3
  # @return [Hash] Hash of matching values
13
4
  attr_reader :values
14
5
 
15
- # @param [Hash] matching_values Values that define this group
16
- # @example
17
- # Linkage::Group.new({:foo => 123, :bar => 'baz'})
18
- def initialize(matching_values)
19
- @values = matching_values
20
- @records = []
21
- end
6
+ # @return [Integer] Number of records in this group
7
+ attr_reader :count
22
8
 
23
- # Check to see if the given set of values matches this group's values.
24
- #
25
- # @param [Hash] values Hash of values
26
- # @return [Boolean] true if match, false if not
27
- def matches?(values)
28
- @values == values
29
- end
9
+ # @return [Integer] This group's ID (if it exists)
10
+ attr_reader :id
30
11
 
31
- # Add a record id to this group's set of records.
32
- #
33
- # @param [Object] record_id
34
- def add_record(record_id)
35
- @records << record_id
12
+ def self.from_row(row)
13
+ values = {}
14
+ options = {}
15
+ row.each_pair do |key, value|
16
+ if key == :id || key == :count
17
+ options[key] = value
18
+ else
19
+ values[key] = value
20
+ end
21
+ end
22
+ new(values, options)
36
23
  end
37
24
 
38
- # @return [Fixnum] Number of records in this group
39
- def count
40
- @records.count
25
+ # @param [Hash] values Values that define this group
26
+ # @param [Hash] options
27
+ # @example
28
+ # Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
29
+ def initialize(values, options)
30
+ @count = options[:count]
31
+ @id = options[:id]
32
+ @values = values
41
33
  end
42
34
  end
43
35
  end
@@ -7,11 +7,7 @@ module Linkage
7
7
  end
8
8
 
9
9
  def groups_dataset
10
- Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
11
- end
12
-
13
- def groups_records_dataset
14
- Dataset.new(@config.results_uri, :groups_records, @config.results_uri_options)
10
+ @groups_dataset ||= Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
15
11
  end
16
12
 
17
13
  def database(&block)
@@ -40,18 +36,22 @@ module Linkage
40
36
  groups_headers = [:id] + group.values.keys
41
37
  @groups_buffer = ImportBuffer.new(@config.results_uri, :groups, groups_headers, @config.results_uri_options)
42
38
  end
43
- @groups_records_buffer ||= ImportBuffer.new(@config.results_uri, :groups_records, [:group_id, :dataset, :record_id], @config.results_uri_options)
44
-
45
39
  group_id = next_group_id
46
40
  @groups_buffer.add([group_id] + group.values.values)
47
- group.records.each do |record_id|
48
- @groups_records_buffer.add([group_id, dataset_id, record_id])
49
- end
50
41
  end
51
42
 
52
43
  def flush!
53
44
  @groups_buffer.flush if @groups_buffer
54
- @groups_records_buffer.flush if @groups_records_buffer
45
+ end
46
+
47
+ def get_group(index)
48
+ values = groups_dataset.order(:id).limit(1, index).first
49
+ Group.from_row(values)
50
+ end
51
+
52
+ def groups_records_datasets(group)
53
+ datasets = @config.datasets_with_applied_expectations
54
+ datasets.collect! { |ds| ds.dataset_for_group(group) }
55
55
  end
56
56
 
57
57
  private
@@ -6,7 +6,6 @@ module Linkage
6
6
  # @return [Linkage::ResultSet]
7
7
  def execute
8
8
  setup_datasets
9
- apply_expectations
10
9
  group_records
11
10
 
12
11
  return result_set
@@ -15,18 +14,13 @@ module Linkage
15
14
  private
16
15
 
17
16
  def setup_datasets
18
- pk = config.dataset_1.field_set.primary_key
19
- @dataset_1 = config.dataset_1.select(pk.to_expr)
20
- if @config.linkage_type != :self
21
- pk = config.dataset_2.field_set.primary_key
22
- @dataset_2 = config.dataset_2.select(pk.to_expr)
23
- end
24
- end
17
+ @dataset_1, @dataset_2 = config.datasets_with_applied_expectations
25
18
 
26
- def apply_expectations
27
- config.expectations.each do |exp|
28
- @dataset_1 = exp.apply_to(@dataset_1, :lhs)
29
- @dataset_2 = exp.apply_to(@dataset_2, :rhs) if config.linkage_type != :self
19
+ pk = @dataset_1.field_set.primary_key
20
+ @dataset_1 = @dataset_1.select(pk.to_expr)
21
+ if @config.linkage_type != :self
22
+ pk = @dataset_2.field_set.primary_key
23
+ @dataset_2 = @dataset_2.select(pk.to_expr)
30
24
  end
31
25
  end
32
26
 
@@ -47,23 +41,10 @@ module Linkage
47
41
  # @param [Boolean] ignore_empty_groups
48
42
  # @yield [Linkage::Group] If a block is given, yield completed groups to
49
43
  # the block. Otherwise, call ResultSet#add_group on the group.
50
- def group_records_for(dataset, dataset_id = nil, ignore_empty_groups = true, &block)
51
- current_group = nil
52
- block ||= lambda { |group| result_set.add_group(current_group, dataset_id) }
53
- primary_key = dataset.field_set.primary_key.to_expr
54
- dataset.each do |row|
55
- pk = row.delete(primary_key)
56
- if current_group.nil? || !current_group.matches?(row)
57
- if current_group && (!ignore_empty_groups || current_group.count > 1)
58
- block.call(current_group)
59
- end
60
- new_group = Group.new(row)
61
- current_group = new_group
62
- end
63
- current_group.add_record(pk)
64
- end
65
- if current_group && (!ignore_empty_groups || current_group.count > 1)
66
- block.call(current_group)
44
+ def group_records_for(dataset, dataset_id, ignore_empty_groups = true)
45
+ group_minimum = ignore_empty_groups ? 2 : 1
46
+ dataset.each_group(group_minimum) do |group|
47
+ result_set.add_group(group, dataset_id)
67
48
  end
68
49
  result_set.flush!
69
50
  end
@@ -72,37 +53,20 @@ module Linkage
72
53
  # Create a new dataset for the groups table
73
54
  groups_dataset = result_set.groups_dataset
74
55
 
75
- exprs = groups_dataset.field_set.values.inject([]) do |arr, field|
56
+ groups_dataset.field_set.values.each do |field|
76
57
  # Sort on all fields
77
- field.primary_key? ? arr : arr << field.to_expr
78
- end
79
- groups_dataset = groups_dataset.select(*exprs, groups_dataset.field_set.primary_key.to_expr).order(*exprs) # ensure matching groups are sorted by id
80
-
81
- result_set.database do |db|
82
- groups_to_delete = []
83
- db.transaction do # for speed reasons
84
- group_records_for(groups_dataset, nil, false) do |group|
85
- if group.count == 1
86
- # Delete the empty group
87
- groups_to_delete << group.records[0]
88
- else
89
- # Change group_id in the groups_records table to the first group
90
- # id, delete other groups.
91
- new_group_id = group.records[0]
92
- group.records[1..-1].each do |old_group_id|
93
- # NOTE: There can only be a group with max size of 2, but
94
- # this adds in future support for matching more than
95
- # 2 datasets at once.
96
- db[:groups_records].filter(:group_id => old_group_id).
97
- update(:group_id => new_group_id)
98
- groups_to_delete << old_group_id
99
- end
100
- end
101
- end
58
+ if !field.primary_key?
59
+ groups_dataset = groups_dataset.match(field.to_expr)
102
60
  end
103
- db[:groups_records].filter(:group_id => groups_to_delete).delete
104
- db[:groups].filter(:id => groups_to_delete).delete
105
61
  end
62
+
63
+ # Delete non-matching groups
64
+ sub_dataset = groups_dataset.select(:id).group_by_matches.having(:count.sql_function(:id) => 1)
65
+ groups_dataset.filter(:id => sub_dataset.obj).delete
66
+
67
+ # Delete duplicate groups
68
+ sub_dataset = groups_dataset.select(:max.sql_function(:id).as(:id)).group_by_matches
69
+ groups_dataset.filter(:id => sub_dataset.obj).delete
106
70
  end
107
71
  end
108
72
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "linkage"
8
- s.version = "0.0.5"
8
+ s.version = "0.0.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Jeremy Stephens"]
12
- s.date = "2012-03-30"
12
+ s.date = "2012-05-08"
13
13
  s.description = "Performs record linkage between one or two datasets, using Sequel on the backend"
14
14
  s.email = "jeremy.f.stephens@vanderbilt.edu"
15
15
  s.extra_rdoc_files = [
@@ -69,7 +69,7 @@ Gem::Specification.new do |s|
69
69
  s.homepage = "http://github.com/coupler/linkage"
70
70
  s.licenses = ["MIT"]
71
71
  s.require_paths = ["lib"]
72
- s.rubygems_version = "1.8.18"
72
+ s.rubygems_version = "1.8.23"
73
73
  s.summary = "Record linkage library"
74
74
 
75
75
  if s.respond_to? :specification_version then
@@ -62,6 +62,16 @@ class Test::Unit::TestCase
62
62
  def test_config
63
63
  @test_config ||= YAML.load_file(File.join(File.dirname(__FILE__), "config.yml"))
64
64
  end
65
+
66
+ def prefixed_logger(prefix)
67
+ logger = Logger.new(STDERR)
68
+ original_formatter = Logger::Formatter.new
69
+ logger.formatter = proc { |severity, datetime, progname, msg|
70
+ result = original_formatter.call(severity, datetime, progname, msg)
71
+ "[#{prefix}] #{result}"
72
+ }
73
+ logger
74
+ end
65
75
  end
66
76
 
67
77
  module UnitTests; end
@@ -7,8 +7,8 @@ module IntegrationTests
7
7
  @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
8
8
  end
9
9
 
10
- def database(&block)
11
- Sequel.connect(@tmpuri, &block)
10
+ def database(options = {}, &block)
11
+ Sequel.connect(@tmpuri, options, &block)
12
12
  end
13
13
 
14
14
  def teardown
@@ -16,6 +16,10 @@ module IntegrationTests
16
16
  end
17
17
 
18
18
  test "one mandatory field equality on single threaded runner" do
19
+ #setup_logger = Logger.new(STDERR)
20
+ #setup_logger.formatter = lambda { |severity, time, progname, msg|
21
+ #" SETUP : %s [%s]: %s\n" % [severity, time, msg]
22
+ #}
19
23
  # insert the test data
20
24
  database do |db|
21
25
  db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
@@ -23,12 +27,22 @@ module IntegrationTests
23
27
  Array.new(100) { |i| [i, i % 10, i % 5] })
24
28
  end
25
29
 
30
+ #ds_logger = Logger.new(STDERR)
31
+ #ds_logger.formatter = lambda { |severity, time, progname, msg|
32
+ #"DATASET: %s [%s]: %s\n" % [severity, time, msg]
33
+ #}
26
34
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
35
+
36
+ #rs_logger = Logger.new(STDERR)
37
+ #rs_logger.formatter = lambda { |severity, time, progname, msg|
38
+ #"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
39
+ #}
27
40
  tmpuri = @tmpuri
28
41
  conf = ds.link_with(ds) do
29
42
  lhs[:foo].must == rhs[:bar]
30
43
  save_results_in(tmpuri, :single_threaded => true)
31
44
  end
45
+ assert_equal :cross, conf.linkage_type
32
46
  runner = Linkage::SingleThreadedRunner.new(conf)
33
47
  runner.execute
34
48
 
@@ -38,11 +52,11 @@ module IntegrationTests
38
52
  assert_equal i, row[:foo_bar]
39
53
  end
40
54
 
41
- assert_equal 150, db[:groups_records].count
42
- db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
43
- expected_group_id = (row[:record_id] % 5) + 1
44
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
45
- end
55
+ #assert_equal 150, db[:groups_records].count
56
+ #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
57
+ #expected_group_id = (row[:record_id] % 5) + 1
58
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
59
+ #end
46
60
  end
47
61
  end
48
62
 
@@ -27,4 +27,47 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
27
27
  assert_equal ds_2.field_set, ds_1.field_set
28
28
  assert_match /`foo` = 'bar'/, ds_2.sql
29
29
  end
30
+
31
+ test "each_group" do
32
+ database do |db|
33
+ db.create_table(:foo) do
34
+ primary_key :id
35
+ String :bar
36
+ end
37
+ db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
38
+ end
39
+
40
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
41
+ ds = ds.match(:bar)
42
+ ds.each_group do |group|
43
+ assert_equal({:bar => "foo"}, group.values)
44
+ assert_equal(2, group.count)
45
+ end
46
+
47
+ groups = []
48
+ ds.each_group(1) do |group|
49
+ groups << group
50
+ end
51
+ assert_equal 3, groups.length
52
+ end
53
+
54
+ test "each_group with filters" do
55
+ database do |db|
56
+ db.create_table(:foo) do
57
+ primary_key :id
58
+ String :bar
59
+ Integer :baz
60
+ end
61
+ db[:foo].import([:id, :bar, :baz], [[1, 'foo', 1], [2, 'foo', 2], [3, 'bar', 3], [4, 'baz', 4]])
62
+ end
63
+
64
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
65
+ ds = ds.match(:bar)
66
+ ds = ds.filter { baz >= 3 }
67
+ groups = []
68
+ ds.each_group(1) do |group|
69
+ groups << group
70
+ end
71
+ assert_equal 2, groups.length
72
+ end
30
73
  end
@@ -7,8 +7,8 @@ module IntegrationTests
7
7
  @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
8
8
  end
9
9
 
10
- def database(&block)
11
- Sequel.connect(@tmpuri, &block)
10
+ def database(options = {}, &block)
11
+ Sequel.connect(@tmpuri, options, &block)
12
12
  end
13
13
 
14
14
  def teardown
@@ -43,16 +43,16 @@ module IntegrationTests
43
43
  assert_equal "12345678#{i%10}", row[:ssn]
44
44
  end
45
45
 
46
- assert_equal 200, db[:groups_records].count
47
- db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
48
- if i % 20 >= 10
49
- assert_equal 2, row[:dataset], row.inspect
50
- else
51
- assert_equal 1, row[:dataset], row.inspect
52
- end
53
- expected_group_id = i / 20 + 1
54
- assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
55
- end
46
+ #assert_equal 200, db[:groups_records].count
47
+ #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
48
+ #if i % 20 >= 10
49
+ #assert_equal 2, row[:dataset], row.inspect
50
+ #else
51
+ #assert_equal 1, row[:dataset], row.inspect
52
+ #end
53
+ #expected_group_id = i / 20 + 1
54
+ #assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
55
+ #end
56
56
  end
57
57
  end
58
58
 
@@ -86,8 +86,7 @@ module IntegrationTests
86
86
  end
87
87
  end
88
88
 
89
- test "handles MySQL's ignorance of trailing spaces when comparing strings" do
90
- pend
89
+ test "reacts properly when using two databases with different string equality methods" do
91
90
  if !test_config['mysql']
92
91
  omission("No MySQL test configuration found")
93
92
  end
@@ -98,24 +97,21 @@ module IntegrationTests
98
97
 
99
98
  db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
100
99
  db[:bar].import([:id, :one, :two], [[1, "", "junk"]])
101
-
102
- db.run("DROP TABLE IF EXISTS groups")
103
- db.run("DROP TABLE IF EXISTS groups_records")
104
100
  end
105
101
 
106
102
  ds_1 = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
107
103
  ds_2 = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
108
- logger = Logger.new(STDERR)
104
+ tmpuri = @tmpuri
109
105
  conf = ds_1.link_with(ds_2) do
110
106
  lhs[:one].must == rhs[:one]
111
107
  lhs[:two].must == rhs[:two]
112
- save_results_in(uri, :logger => logger)
108
+ save_results_in(tmpuri)
113
109
  end
114
110
 
115
111
  runner = Linkage::SingleThreadedRunner.new(conf)
116
112
  runner.execute
117
113
 
118
- Sequel.connect(@tmpuri) do |db|
114
+ database do |db|
119
115
  assert_equal 1, db[:groups].count
120
116
  end
121
117
  end
@@ -39,20 +39,21 @@ module IntegrationTests
39
39
  end
40
40
 
41
41
  test "strftime in sqlite" do
42
- logger = Logger.new(STDERR)
43
- database(:logger => logger) do |db|
42
+ #logger = Logger.new(STDERR)
43
+ #database(:logger => logger) do |db|
44
+ database do |db|
44
45
  db.create_table(:foo) { primary_key(:id); Date(:foo_date) }
45
46
  db.create_table(:bar) { primary_key(:id); String(:bar_string) }
46
47
  db[:foo].insert({:id => 1, :foo_date => Date.today})
47
48
  db[:bar].insert({:id => 1, :bar_string => Date.today.strftime("%Y-%m-%d")})
48
49
  end
49
50
 
50
- ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true, :logger => logger)
51
- ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true, :logger => logger)
51
+ ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
52
+ ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
52
53
  tmpuri = @tmpuri
53
54
  conf = ds_1.link_with(ds_2) do
54
55
  strftime(lhs[:foo_date], "%Y-%m-%d").must == rhs[:bar_string]
55
- save_results_in(tmpuri, :logger => logger)
56
+ save_results_in(tmpuri)
56
57
  end
57
58
  runner = Linkage::SingleThreadedRunner.new(conf)
58
59
  runner.execute
@@ -37,13 +37,10 @@ module IntegrationTests
37
37
  assert_equal 10, db[:groups].count
38
38
  db[:groups].order(:ssn).each_with_index do |row, i|
39
39
  assert_equal "12345678#{i%10}", row[:ssn]
40
- end
41
40
 
42
- assert_equal 100, db[:groups_records].count
43
- expected_group_id = nil
44
- db[:groups_records].order(:record_id).each do |row|
45
- expected_group_id = (row[:record_id] % 10) + 1
46
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
41
+ group = Linkage::Group.from_row(row)
42
+ dataset, _ = result_set.groups_records_datasets(group)
43
+ assert_equal 10, dataset.count
47
44
  end
48
45
  end
49
46
  end
@@ -73,13 +70,13 @@ module IntegrationTests
73
70
  assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
74
71
  end
75
72
 
76
- assert_equal 100, db[:groups_records].count
77
- expected_group_id = nil
78
- db[:groups_records].order(:record_id).each do |row|
79
- v = row[:record_id] % 20
80
- expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
81
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
82
- end
73
+ #assert_equal 100, db[:groups_records].count
74
+ #expected_group_id = nil
75
+ #db[:groups_records].order(:record_id).each do |row|
76
+ #v = row[:record_id] % 20
77
+ #expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
78
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
79
+ #end
83
80
  end
84
81
  end
85
82
 
@@ -107,12 +104,12 @@ module IntegrationTests
107
104
  assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
108
105
  end
109
106
 
110
- assert_equal 20, db[:groups_records].count
111
- expected_group_id = nil
112
- db[:groups_records].order(:record_id).each do |row|
113
- expected_group_id = (row[:record_id] / 5) % 2 + 1
114
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
115
- end
107
+ #assert_equal 20, db[:groups_records].count
108
+ #expected_group_id = nil
109
+ #db[:groups_records].order(:record_id).each do |row|
110
+ #expected_group_id = (row[:record_id] / 5) % 2 + 1
111
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
112
+ #end
116
113
  end
117
114
  end
118
115
 
@@ -141,12 +138,12 @@ module IntegrationTests
141
138
  assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
142
139
  end
143
140
 
144
- assert_equal 20, db[:groups_records].count
145
- expected_group_id = nil
146
- db[:groups_records].order(:record_id).each do |row|
147
- expected_group_id = (row[:record_id] / 5) % 2 + 1
148
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
149
- end
141
+ #assert_equal 20, db[:groups_records].count
142
+ #expected_group_id = nil
143
+ #db[:groups_records].order(:record_id).each do |row|
144
+ #expected_group_id = (row[:record_id] / 5) % 2 + 1
145
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
146
+ #end
150
147
  end
151
148
  end
152
149
 
@@ -175,12 +172,12 @@ module IntegrationTests
175
172
  assert_equal "123456789#{i}", row[:ssn]
176
173
  end
177
174
 
178
- assert_equal 25, db[:groups_records].count
179
- expected_group_id = nil
180
- db[:groups_records].order(:record_id).each do |row|
181
- expected_group_id = row[:record_id] % 5 + 1
182
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
183
- end
175
+ #assert_equal 25, db[:groups_records].count
176
+ #expected_group_id = nil
177
+ #db[:groups_records].order(:record_id).each do |row|
178
+ #expected_group_id = row[:record_id] % 5 + 1
179
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
180
+ #end
184
181
  end
185
182
  end
186
183
 
@@ -210,12 +207,12 @@ module IntegrationTests
210
207
  assert_equal "123456789#{i}", row[:ssn]
211
208
  end
212
209
 
213
- assert_equal 25, db[:groups_records].count
214
- expected_group_id = nil
215
- db[:groups_records].order(:record_id).each do |row|
216
- expected_group_id = row[:record_id] % 5 + 1
217
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
218
- end
210
+ #assert_equal 25, db[:groups_records].count
211
+ #expected_group_id = nil
212
+ #db[:groups_records].order(:record_id).each do |row|
213
+ #expected_group_id = row[:record_id] % 5 + 1
214
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
215
+ #end
219
216
  end
220
217
  end
221
218
  end
@@ -135,9 +135,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
135
135
  field_2 = stub('field 2', :to_expr => :foo)
136
136
  dataset_2.stubs(:field_set).returns({:foo => field_2})
137
137
 
138
- func_expr = stub('function expression') do
139
- expects(:as).with(:trim_foo_foo).returns(self)
140
- end
138
+ func_expr = stub('function expression')
141
139
  func = stub('function', :static? => false, :to_expr => func_expr)
142
140
  Linkage::Functions::Trim.expects(:new).with(field_1).returns(func)
143
141
  merged_field = stub('merged field', :name => :trim_foo_foo)
@@ -147,12 +145,10 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
147
145
  c.configure do
148
146
  trim(lhs[:foo]).must == rhs[:foo]
149
147
  end
150
- dataset_1.expects(:select_more).with(func_expr).returns(dataset_1)
151
- dataset_1.expects(:order_more).with(func_expr).returns(dataset_1)
148
+ dataset_1.expects(:match).with(func_expr, :trim_foo_foo).returns(dataset_1)
152
149
  c.expectations[0].apply_to(dataset_1, :lhs)
153
150
 
154
- dataset_2.expects(:select_more).with(:foo.as(:trim_foo_foo)).returns(dataset_2)
155
- dataset_2.expects(:order_more).with(:foo).returns(dataset_2)
151
+ dataset_2.expects(:match).with(:foo, :trim_foo_foo).returns(dataset_2)
156
152
  c.expectations[0].apply_to(dataset_2, :rhs)
157
153
  end
158
154
 
@@ -34,4 +34,55 @@ class UnitTests::TestDataset < Test::Unit::TestCase
34
34
  @database.expects(:adapter_scheme).returns(:foo)
35
35
  assert_equal :foo, ds.adapter_scheme
36
36
  end
37
+
38
+ test "add match expression" do
39
+ ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
40
+ @dataset.expects(:clone).returns(@dataset)
41
+ ds_2 = ds_1.match(:foo)
42
+ assert_not_same ds_1, ds_2
43
+ assert_not_equal ds_1.instance_variable_get(:@_match),
44
+ ds_2.instance_variable_get(:@_match)
45
+ end
46
+
47
+ test "add match expression with alias, then each_group" do
48
+ ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
49
+ @dataset.expects(:clone).returns(@dataset)
50
+ ds_2 = ds_1.match(:foo, :aliased_foo)
51
+ @dataset.expects(:group_and_count).with(:foo.as(:aliased_foo)).returns(@dataset)
52
+ @dataset.expects(:having).returns(@dataset)
53
+ @dataset.expects(:each).yields({:aliased_foo => 123, :count => 1})
54
+ ds_2.each_group { |g| }
55
+ end
56
+
57
+ test "group_by_matches" do
58
+ ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
59
+
60
+ @dataset.expects(:clone).returns(@dataset)
61
+ ds = ds.match(:foo)
62
+ @dataset.expects(:group).with(:foo).returns(@dataset)
63
+
64
+ ds.group_by_matches
65
+ end
66
+
67
+ test "dataset_for_group" do
68
+ ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
69
+ @dataset.expects(:clone).returns(@dataset)
70
+ ds = ds.match(:foo, :foo_bar)
71
+
72
+ group = stub("group", :values => {:foo_bar => 'baz'})
73
+ filtered_dataset = stub('filtered dataset')
74
+ @dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
75
+ assert_equal filtered_dataset, ds.dataset_for_group(group)
76
+ end
77
+
78
+ test "dataset_for_group without aliases" do
79
+ ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
80
+ @dataset.expects(:clone).returns(@dataset)
81
+ ds = ds.match(:foo)
82
+
83
+ group = stub("group", :values => {:foo => 'baz'})
84
+ filtered_dataset = stub('filtered dataset')
85
+ @dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
86
+ assert_equal filtered_dataset, ds.dataset_for_group(group)
87
+ end
37
88
  end
@@ -1,21 +1,10 @@
1
1
  require 'helper'
2
2
 
3
3
  class UnitTests::TestGroup < Test::Unit::TestCase
4
- test "matches?" do
5
- g = Linkage::Group.new(:test => 'test')
6
- assert g.matches?({:test => 'test'})
7
- assert !g.matches?({:foo => 'bar'})
8
- end
9
-
10
- test "add_record adds a record" do
11
- g = Linkage::Group.new(:test => 'test')
12
- g.add_record(123)
13
- assert_equal [123], g.records
14
- end
15
-
16
- test "count returns number of records" do
17
- g = Linkage::Group.new(:test => 'test')
18
- g.add_record(123)
4
+ test "initialize" do
5
+ g = Linkage::Group.new({:test => 'test'}, {:count => 1, :id => 456})
6
+ assert_equal({:test => 'test'}, g.values)
19
7
  assert_equal 1, g.count
8
+ assert_equal 456, g.id
20
9
  end
21
10
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-03-30 00:00:00.000000000 Z
12
+ date: 2012-05-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: sequel
@@ -307,7 +307,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
307
307
  version: '0'
308
308
  segments:
309
309
  - 0
310
- hash: -1705215583013388953
310
+ hash: -1901911346636016746
311
311
  required_rubygems_version: !ruby/object:Gem::Requirement
312
312
  none: false
313
313
  requirements:
@@ -316,7 +316,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
316
316
  version: '0'
317
317
  requirements: []
318
318
  rubyforge_project:
319
- rubygems_version: 1.8.18
319
+ rubygems_version: 1.8.23
320
320
  signing_key:
321
321
  specification_version: 3
322
322
  summary: Record linkage library