linkage 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@ GEM
  remote: http://rubygems.org/
  specs:
  blockenspiel (0.4.3)
- coderay (1.0.5)
+ coderay (1.0.6)
  ffi (1.0.11)
  git (1.2.5)
  guard (1.0.1)
@@ -25,7 +25,7 @@ GEM
  mocha (0.10.5)
  metaclass (~> 0.0.1)
  mysql2 (0.3.11)
- pry (0.9.8.4)
+ pry (0.9.9)
  coderay (~> 1.0.5)
  method_source (~> 0.7.1)
  slop (>= 2.4.4, < 3)
@@ -33,12 +33,12 @@ GEM
  rdiscount (1.6.8)
  rdoc (3.12)
  json (~> 1.4)
- sequel (3.33.0)
+ sequel (3.34.1)
  slop (2.4.4)
- sqlite3 (1.3.5)
+ sqlite3 (1.3.6)
  test-unit (2.4.8)
  thor (0.14.6)
- versionomy (0.4.2)
+ versionomy (0.4.3)
  blockenspiel (>= 0.4.3)
  yard (0.7.5)
 
data/Rakefile CHANGED
@@ -48,3 +48,5 @@ require 'yard'
  YARD::Rake::YardocTask.new do |t|
  t.files = ['lib/**/*.rb']
  end
+
+ task :build => :gemspec
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.5
+ 0.0.6
@@ -117,12 +117,12 @@ module Linkage
  end
 
  expr = target.to_expr(side)
- aliased_expr = expr
+ aliaz = nil
  if expr != merged_field.name
- aliased_expr = expr.as(merged_field.name)
+ aliaz = merged_field.name
  end
 
- dataset.order_more(expr).select_more(aliased_expr)
+ dataset.match(expr, aliaz)
  end
 
  def same_filter?(other)
@@ -318,9 +318,9 @@ module Linkage
  def initialize(dataset_1, dataset_2)
  @dataset_1 = dataset_1
  @dataset_2 = dataset_2
+ @linkage_type = dataset_1 == dataset_2 ? :self : :dual
  @expectations = []
  @visual_comparisons = []
- @linkage_type = dataset_1 == dataset_2 ? :self : :dual
  end
 
  def configure(&block)
@@ -348,5 +348,15 @@ module Linkage
  def result_set
  @result_set ||= ResultSet.new(self)
  end
+
+ def datasets_with_applied_expectations
+ dataset_1 = @dataset_1
+ dataset_2 = @dataset_2
+ @expectations.each do |exp|
+ dataset_1 = exp.apply_to(dataset_1, :lhs)
+ dataset_2 = exp.apply_to(dataset_2, :rhs) if @linkage_type != :self
+ end
+ @linkage_type == :self ? [dataset_1, dataset_1] : [dataset_1, dataset_2]
+ end
  end
  end
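
The hunk above adds `datasets_with_applied_expectations` to Configuration: every expectation is applied to the left- and right-hand datasets, and the left dataset is re-used on both sides for a self linkage. A rough usage sketch in the `link_with` block style used by the tests further down in this diff (the sqlite URI, table, and column names are illustrative only):

```ruby
require 'linkage'

# Hypothetical dataset; any Sequel-compatible URI and table would do.
ds = Linkage::Dataset.new("sqlite://tmp/example.db", "foo")
conf = ds.link_with(ds) do
  lhs[:foo].must == rhs[:bar]
end

# Each expectation has been applied (via Expectation#apply_to) to both sides.
dataset_1, dataset_2 = conf.datasets_with_applied_expectations
```
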
@@ -1,17 +1,23 @@
  module Linkage
- class Dataset < Delegator
+ # Delegator around Sequel::Dataset with some extra functionality.
+ class Dataset
  attr_reader :field_set, :table_name
 
  def initialize(uri, table, options = {})
  @table_name = table.to_sym
  db = Sequel.connect(uri, options)
- ds = db[@table_name]
- super(ds)
+ @dataset = db[@table_name]
  @field_set = FieldSet.new(db.schema(@table_name))
+ @_match = []
  end
 
- def __setobj__(obj); @dataset = obj; end
- def __getobj__; @dataset; end
+ def obj
+ @dataset
+ end
+
+ def obj=(value)
+ @dataset = value
+ end
 
  # Setup a linkage with another dataset
  #
@@ -26,21 +32,70 @@ module Linkage
  @dataset.db.adapter_scheme
  end
 
- def initialize_clone(obj)
- new_obj = obj.instance_variable_get(:@new_obj)
+ def match(expr, aliaz = nil)
+ clone(:match => {:expr => expr, :alias => aliaz})
+ end
+
+ def clone(new_opts={})
+ new_opts = new_opts.dup
+ new_obj = new_opts.delete(:new_obj)
+
+ match = new_opts.delete(:match)
+ result = super()
+ result.send(:_match, match)
+
  if new_obj
- __setobj__(new_obj)
+ result.obj = new_obj
  else
- super
+ result.obj = obj.clone(new_opts)
+ end
+ result
+ end
+
+ def each_group(min = 2)
+ @dataset.group_and_count(*aliased_match_expressions).having{count >= min}.each do |row|
+ count = row.delete(:count)
+ yield Group.new(row, {:count => count})
+ end
+ end
+
+ def group_by_matches(aliased = false)
+ expr = aliased ? aliased_match_expressions : match_expressions
+ group(*expr)
+ end
+
+ def dataset_for_group(group)
+ filters = []
+ group.values.each_pair do |key, value|
+ # find a matched expression with this alias
+ m = @_match.detect { |h| h[:alias] ? h[:alias] == key : h[:expr] == key }
+ raise "this dataset isn't compatible with the given group" if !m
+ filters << {m[:expr] => value}
+ end
+ filter(*filters)
+ end
+
+ private
+
+ def _match(opts)
+ if opts
+ @_match += [opts]
  end
  end
 
+ def match_expressions
+ @_match.collect { |m| m[:expr] }
+ end
+
+ def aliased_match_expressions
+ @_match.collect { |m| m[:alias] ? m[:expr].as(m[:alias]) : m[:expr] }
+ end
+
  def method_missing(name, *args, &block)
- result = super
+ result = @dataset.send(name, *args, &block)
  if result.kind_of?(Sequel::Dataset)
- @new_obj = result
- result = clone(:new_obj => result)
  end
  result
  end
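
The hunk above replaces the old Delegator plumbing with the new Dataset API: `match` records a (possibly aliased) grouping expression and `each_group` yields one Group per set of rows sharing the matched values (at least two rows by default). A minimal usage sketch, mirroring the 'each_group' integration test added later in this diff (the URI and table name are illustrative):

```ruby
require 'linkage'

ds = Linkage::Dataset.new("sqlite://tmp/example.db", "foo")
ds = ds.match(:bar)        # group records on the bar column
ds.each_group do |group|   # groups with fewer than 2 records are skipped by default
  p group.values           # e.g. {:bar => "foo"}
  p group.count
end
```
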
@@ -1,43 +1,35 @@
  module Linkage
- # This class represents a group of records that match based on criteria
- # described via the {Dataset#link_with} method. Group's are created by
- # subclasses of the {Runner} class during execution.
- #
- # @see Dataset#link_with
- # @see SingleThreadedRunner
  class Group
- # @return [Array<Object>] An array of this group's record ids
- attr_reader :records
-
  # @return [Hash] Hash of matching values
  attr_reader :values
 
- # @param [Hash] matching_values Values that define this group
- # @example
- # Linkage::Group.new({:foo => 123, :bar => 'baz'})
- def initialize(matching_values)
- @values = matching_values
- @records = []
- end
+ # @return [Integer] Number of records in this group
+ attr_reader :count
 
- # Check to see if the given set of values matches this group's values.
- #
- # @param [Hash] values Hash of values
- # @return [Boolean] true if match, false if not
- def matches?(values)
- @values == values
- end
+ # @return [Integer] This group's ID (if it exists)
+ attr_reader :id
 
- # Add a record id to this group's set of records.
- #
- # @param [Object] record_id
- def add_record(record_id)
- @records << record_id
+ def self.from_row(row)
+ values = {}
+ options = {}
+ row.each_pair do |key, value|
+ if key == :id || key == :count
+ options[key] = value
+ else
+ values[key] = value
+ end
+ end
+ new(values, options)
  end
 
- # @return [Fixnum] Number of records in this group
- def count
- @records.count
+ # @param [Hash] values Values that define this group
+ # @param [Hash] options
+ # @example
+ # Linkage::Group.new({:foo => 123, :bar => 'baz'}, {:count => 5, :id => 456})
+ def initialize(values, options)
+ @count = options[:count]
+ @id = options[:id]
+ @values = values
  end
  end
  end
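
A Group now carries only its matching values plus a count and an optional id; `Group.from_row` splits a result row into those parts. A small illustration (the row hash is made up):

```ruby
row = {:id => 456, :count => 2, :ssn => "123456789"}
group = Linkage::Group.from_row(row)
group.values  #=> {:ssn => "123456789"}
group.count   #=> 2
group.id      #=> 456
```
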
@@ -7,11 +7,7 @@ module Linkage
  end
 
  def groups_dataset
- Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
- end
-
- def groups_records_dataset
- Dataset.new(@config.results_uri, :groups_records, @config.results_uri_options)
+ @groups_dataset ||= Dataset.new(@config.results_uri, :groups, @config.results_uri_options)
  end
 
  def database(&block)
@@ -40,18 +36,22 @@ module Linkage
  groups_headers = [:id] + group.values.keys
  @groups_buffer = ImportBuffer.new(@config.results_uri, :groups, groups_headers, @config.results_uri_options)
  end
- @groups_records_buffer ||= ImportBuffer.new(@config.results_uri, :groups_records, [:group_id, :dataset, :record_id], @config.results_uri_options)
-
  group_id = next_group_id
  @groups_buffer.add([group_id] + group.values.values)
- group.records.each do |record_id|
- @groups_records_buffer.add([group_id, dataset_id, record_id])
- end
  end
 
  def flush!
  @groups_buffer.flush if @groups_buffer
- @groups_records_buffer.flush if @groups_records_buffer
+ end
+
+ def get_group(index)
+ values = groups_dataset.order(:id).limit(1, index).first
+ Group.from_row(values)
+ end
+
+ def groups_records_datasets(group)
+ datasets = @config.datasets_with_applied_expectations
+ datasets.collect! { |ds| ds.dataset_for_group(group) }
  end
 
  private
@@ -6,7 +6,6 @@ module Linkage
  # @return [Linkage::ResultSet]
  def execute
  setup_datasets
- apply_expectations
  group_records
 
  return result_set
@@ -15,18 +14,13 @@ module Linkage
  private
 
  def setup_datasets
- pk = config.dataset_1.field_set.primary_key
- @dataset_1 = config.dataset_1.select(pk.to_expr)
- if @config.linkage_type != :self
- pk = config.dataset_2.field_set.primary_key
- @dataset_2 = config.dataset_2.select(pk.to_expr)
- end
- end
+ @dataset_1, @dataset_2 = config.datasets_with_applied_expectations
 
- def apply_expectations
- config.expectations.each do |exp|
- @dataset_1 = exp.apply_to(@dataset_1, :lhs)
- @dataset_2 = exp.apply_to(@dataset_2, :rhs) if config.linkage_type != :self
+ pk = @dataset_1.field_set.primary_key
+ @dataset_1 = @dataset_1.select(pk.to_expr)
+ if @config.linkage_type != :self
+ pk = @dataset_2.field_set.primary_key
+ @dataset_2 = @dataset_2.select(pk.to_expr)
  end
  end
 
@@ -47,23 +41,10 @@ module Linkage
  # @param [Boolean] ignore_empty_groups
  # @yield [Linkage::Group] If a block is given, yield completed groups to
  # the block. Otherwise, call ResultSet#add_group on the group.
- def group_records_for(dataset, dataset_id = nil, ignore_empty_groups = true, &block)
- current_group = nil
- block ||= lambda { |group| result_set.add_group(current_group, dataset_id) }
- primary_key = dataset.field_set.primary_key.to_expr
- dataset.each do |row|
- pk = row.delete(primary_key)
- if current_group.nil? || !current_group.matches?(row)
- if current_group && (!ignore_empty_groups || current_group.count > 1)
- block.call(current_group)
- end
- new_group = Group.new(row)
- current_group = new_group
- end
- current_group.add_record(pk)
- end
- if current_group && (!ignore_empty_groups || current_group.count > 1)
- block.call(current_group)
+ def group_records_for(dataset, dataset_id, ignore_empty_groups = true)
+ group_minimum = ignore_empty_groups ? 2 : 1
+ dataset.each_group(group_minimum) do |group|
+ result_set.add_group(group, dataset_id)
  end
  result_set.flush!
  end
@@ -72,37 +53,20 @@ module Linkage
  # Create a new dataset for the groups table
  groups_dataset = result_set.groups_dataset
 
- exprs = groups_dataset.field_set.values.inject([]) do |arr, field|
+ groups_dataset.field_set.values.each do |field|
  # Sort on all fields
- field.primary_key? ? arr : arr << field.to_expr
- end
- groups_dataset = groups_dataset.select(*exprs, groups_dataset.field_set.primary_key.to_expr).order(*exprs) # ensure matching groups are sorted by id
-
- result_set.database do |db|
- groups_to_delete = []
- db.transaction do # for speed reasons
- group_records_for(groups_dataset, nil, false) do |group|
- if group.count == 1
- # Delete the empty group
- groups_to_delete << group.records[0]
- else
- # Change group_id in the groups_records table to the first group
- # id, delete other groups.
- new_group_id = group.records[0]
- group.records[1..-1].each do |old_group_id|
- # NOTE: There can only be a group with max size of 2, but
- # this adds in future support for matching more than
- # 2 datasets at once.
- db[:groups_records].filter(:group_id => old_group_id).
- update(:group_id => new_group_id)
- groups_to_delete << old_group_id
- end
- end
- end
+ if !field.primary_key?
+ groups_dataset = groups_dataset.match(field.to_expr)
  end
- db[:groups_records].filter(:group_id => groups_to_delete).delete
- db[:groups].filter(:id => groups_to_delete).delete
  end
+
+ # Delete non-matching groups
+ sub_dataset = groups_dataset.select(:id).group_by_matches.having(:count.sql_function(:id) => 1)
+ groups_dataset.filter(:id => sub_dataset.obj).delete
+
+ # Delete duplicate groups
+ sub_dataset = groups_dataset.select(:max.sql_function(:id).as(:id)).group_by_matches
+ groups_dataset.filter(:id => sub_dataset.obj).delete
  end
  end
  end
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = "linkage"
- s.version = "0.0.5"
+ s.version = "0.0.6"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jeremy Stephens"]
- s.date = "2012-03-30"
+ s.date = "2012-05-08"
  s.description = "Performs record linkage between one or two datasets, using Sequel on the backend"
  s.email = "jeremy.f.stephens@vanderbilt.edu"
  s.extra_rdoc_files = [
@@ -69,7 +69,7 @@ Gem::Specification.new do |s|
  s.homepage = "http://github.com/coupler/linkage"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
- s.rubygems_version = "1.8.18"
+ s.rubygems_version = "1.8.23"
  s.summary = "Record linkage library"
 
  if s.respond_to? :specification_version then
@@ -62,6 +62,16 @@ class Test::Unit::TestCase
  def test_config
  @test_config ||= YAML.load_file(File.join(File.dirname(__FILE__), "config.yml"))
  end
+
+ def prefixed_logger(prefix)
+ logger = Logger.new(STDERR)
+ original_formatter = Logger::Formatter.new
+ logger.formatter = proc { |severity, datetime, progname, msg|
+ result = original_formatter.call(severity, datetime, progname, msg)
+ "[#{prefix}] #{result}"
+ }
+ logger
+ end
  end
 
  module UnitTests; end
@@ -7,8 +7,8 @@ module IntegrationTests
  @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
  end
 
- def database(&block)
- Sequel.connect(@tmpuri, &block)
+ def database(options = {}, &block)
+ Sequel.connect(@tmpuri, options, &block)
  end
 
  def teardown
@@ -16,6 +16,10 @@ module IntegrationTests
  end
 
  test "one mandatory field equality on single threaded runner" do
+ #setup_logger = Logger.new(STDERR)
+ #setup_logger.formatter = lambda { |severity, time, progname, msg|
+ #" SETUP : %s [%s]: %s\n" % [severity, time, msg]
+ #}
  # insert the test data
  database do |db|
  db.create_table(:foo) { primary_key(:id); Integer(:foo); Integer(:bar) }
@@ -23,12 +27,22 @@ module IntegrationTests
  Array.new(100) { |i| [i, i % 10, i % 5] })
  end
 
+ #ds_logger = Logger.new(STDERR)
+ #ds_logger.formatter = lambda { |severity, time, progname, msg|
+ #"DATASET: %s [%s]: %s\n" % [severity, time, msg]
+ #}
  ds = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
+
+ #rs_logger = Logger.new(STDERR)
+ #rs_logger.formatter = lambda { |severity, time, progname, msg|
+ #"RESULTS: %s [%s]: %s\n" % [severity, time, msg]
+ #}
  tmpuri = @tmpuri
  conf = ds.link_with(ds) do
  lhs[:foo].must == rhs[:bar]
  save_results_in(tmpuri, :single_threaded => true)
  end
+ assert_equal :cross, conf.linkage_type
  runner = Linkage::SingleThreadedRunner.new(conf)
  runner.execute
 
@@ -38,11 +52,11 @@ module IntegrationTests
  assert_equal i, row[:foo_bar]
  end
 
- assert_equal 150, db[:groups_records].count
- db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
- expected_group_id = (row[:record_id] % 5) + 1
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
- end
+ #assert_equal 150, db[:groups_records].count
+ #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
+ #expected_group_id = (row[:record_id] % 5) + 1
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
+ #end
  end
  end
 
@@ -27,4 +27,47 @@ class IntegrationTests::TestDataset < Test::Unit::TestCase
  assert_equal ds_2.field_set, ds_1.field_set
  assert_match /`foo` = 'bar'/, ds_2.sql
  end
+
+ test "each_group" do
+ database do |db|
+ db.create_table(:foo) do
+ primary_key :id
+ String :bar
+ end
+ db[:foo].import([:id, :bar], [[1, 'foo'], [2, 'foo'], [3, 'bar'], [4, 'baz']])
+ end
+
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
+ ds = ds.match(:bar)
+ ds.each_group do |group|
+ assert_equal({:bar => "foo"}, group.values)
+ assert_equal(2, group.count)
+ end
+
+ groups = []
+ ds.each_group(1) do |group|
+ groups << group
+ end
+ assert_equal 3, groups.length
+ end
+
+ test "each_group with filters" do
+ database do |db|
+ db.create_table(:foo) do
+ primary_key :id
+ String :bar
+ Integer :baz
+ end
+ db[:foo].import([:id, :bar, :baz], [[1, 'foo', 1], [2, 'foo', 2], [3, 'bar', 3], [4, 'baz', 4]])
+ end
+
+ ds = Linkage::Dataset.new(@tmpuri, "foo")
+ ds = ds.match(:bar)
+ ds = ds.filter { baz >= 3 }
+ groups = []
+ ds.each_group(1) do |group|
+ groups << group
+ end
+ assert_equal 2, groups.length
+ end
  end
@@ -7,8 +7,8 @@ module IntegrationTests
  @tmpuri = "sqlite://" + File.join(@tmpdir, "foo")
  end
 
- def database(&block)
- Sequel.connect(@tmpuri, &block)
+ def database(options = {}, &block)
+ Sequel.connect(@tmpuri, options, &block)
  end
 
  def teardown
@@ -43,16 +43,16 @@ module IntegrationTests
  assert_equal "12345678#{i%10}", row[:ssn]
  end
 
- assert_equal 200, db[:groups_records].count
- db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
- if i % 20 >= 10
- assert_equal 2, row[:dataset], row.inspect
- else
- assert_equal 1, row[:dataset], row.inspect
- end
- expected_group_id = i / 20 + 1
- assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
- end
+ #assert_equal 200, db[:groups_records].count
+ #db[:groups_records].order(:group_id, :dataset, :record_id).each_with_index do |row, i|
+ #if i % 20 >= 10
+ #assert_equal 2, row[:dataset], row.inspect
+ #else
+ #assert_equal 1, row[:dataset], row.inspect
+ #end
+ #expected_group_id = i / 20 + 1
+ #assert_equal expected_group_id, row[:group_id], "Record #{row.inspect} should have been in group #{expected_group_id}"
+ #end
  end
  end
 
@@ -86,8 +86,7 @@ module IntegrationTests
  end
  end
 
- test "handles MySQL's ignorance of trailing spaces when comparing strings" do
- pend
+ test "reacts properly when using two databases with different string equality methods" do
  if !test_config['mysql']
  omission("No MySQL test configuration found")
  end
@@ -98,24 +97,21 @@ module IntegrationTests
 
  db.create_table!(:bar) { primary_key(:id); String(:one); String(:two) }
  db[:bar].import([:id, :one, :two], [[1, "", "junk"]])
-
- db.run("DROP TABLE IF EXISTS groups")
- db.run("DROP TABLE IF EXISTS groups_records")
  end
 
  ds_1 = Linkage::Dataset.new(uri, "foo", :single_threaded => true)
  ds_2 = Linkage::Dataset.new(uri, "bar", :single_threaded => true)
- logger = Logger.new(STDERR)
+ tmpuri = @tmpuri
  conf = ds_1.link_with(ds_2) do
  lhs[:one].must == rhs[:one]
  lhs[:two].must == rhs[:two]
- save_results_in(uri, :logger => logger)
+ save_results_in(tmpuri)
  end
 
  runner = Linkage::SingleThreadedRunner.new(conf)
  runner.execute
 
- Sequel.connect(@tmpuri) do |db|
+ database do |db|
  assert_equal 1, db[:groups].count
  end
  end
@@ -39,20 +39,21 @@ module IntegrationTests
  end
 
  test "strftime in sqlite" do
- logger = Logger.new(STDERR)
- database(:logger => logger) do |db|
+ #logger = Logger.new(STDERR)
+ #database(:logger => logger) do |db|
+ database do |db|
  db.create_table(:foo) { primary_key(:id); Date(:foo_date) }
  db.create_table(:bar) { primary_key(:id); String(:bar_string) }
  db[:foo].insert({:id => 1, :foo_date => Date.today})
  db[:bar].insert({:id => 1, :bar_string => Date.today.strftime("%Y-%m-%d")})
  end
 
- ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true, :logger => logger)
- ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true, :logger => logger)
+ ds_1 = Linkage::Dataset.new(@tmpuri, "foo", :single_threaded => true)
+ ds_2 = Linkage::Dataset.new(@tmpuri, "bar", :single_threaded => true)
  tmpuri = @tmpuri
  conf = ds_1.link_with(ds_2) do
  strftime(lhs[:foo_date], "%Y-%m-%d").must == rhs[:bar_string]
- save_results_in(tmpuri, :logger => logger)
+ save_results_in(tmpuri)
  end
  runner = Linkage::SingleThreadedRunner.new(conf)
  runner.execute
@@ -37,13 +37,10 @@ module IntegrationTests
  assert_equal 10, db[:groups].count
  db[:groups].order(:ssn).each_with_index do |row, i|
  assert_equal "12345678#{i%10}", row[:ssn]
- end
 
- assert_equal 100, db[:groups_records].count
- expected_group_id = nil
- db[:groups_records].order(:record_id).each do |row|
- expected_group_id = (row[:record_id] % 10) + 1
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
+ group = Linkage::Group.from_row(row)
+ dataset, _ = result_set.groups_records_datasets(group)
+ assert_equal 10, dataset.count
  end
  end
  end
@@ -73,13 +70,13 @@ module IntegrationTests
  assert_equal Date.civil(1985, 1, i / 2 + 1 + (i % 2 == 0 ? 0 : 10)), row[:dob]
  end
 
- assert_equal 100, db[:groups_records].count
- expected_group_id = nil
- db[:groups_records].order(:record_id).each do |row|
- v = row[:record_id] % 20
- expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
- end
+ #assert_equal 100, db[:groups_records].count
+ #expected_group_id = nil
+ #db[:groups_records].order(:record_id).each do |row|
+ #v = row[:record_id] % 20
+ #expected_group_id = v < 10 ? 1 + 2 * v : 2 * (v % 10 + 1)
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
+ #end
  end
  end
 
@@ -107,12 +104,12 @@ module IntegrationTests
  assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
  end
 
- assert_equal 20, db[:groups_records].count
- expected_group_id = nil
- db[:groups_records].order(:record_id).each do |row|
- expected_group_id = (row[:record_id] / 5) % 2 + 1
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
- end
+ #assert_equal 20, db[:groups_records].count
+ #expected_group_id = nil
+ #db[:groups_records].order(:record_id).each do |row|
+ #expected_group_id = (row[:record_id] / 5) % 2 + 1
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
+ #end
  end
  end
 
@@ -141,12 +138,12 @@ module IntegrationTests
  assert_equal "12345678#{(i * 5) + 3}", row[:ssn]
  end
 
- assert_equal 20, db[:groups_records].count
- expected_group_id = nil
- db[:groups_records].order(:record_id).each do |row|
- expected_group_id = (row[:record_id] / 5) % 2 + 1
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
- end
+ #assert_equal 20, db[:groups_records].count
+ #expected_group_id = nil
+ #db[:groups_records].order(:record_id).each do |row|
+ #expected_group_id = (row[:record_id] / 5) % 2 + 1
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
+ #end
  end
  end
 
@@ -175,12 +172,12 @@ module IntegrationTests
  assert_equal "123456789#{i}", row[:ssn]
  end
 
- assert_equal 25, db[:groups_records].count
- expected_group_id = nil
- db[:groups_records].order(:record_id).each do |row|
- expected_group_id = row[:record_id] % 5 + 1
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
- end
+ #assert_equal 25, db[:groups_records].count
+ #expected_group_id = nil
+ #db[:groups_records].order(:record_id).each do |row|
+ #expected_group_id = row[:record_id] % 5 + 1
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
+ #end
  end
  end
 
@@ -210,12 +207,12 @@ module IntegrationTests
  assert_equal "123456789#{i}", row[:ssn]
  end
 
- assert_equal 25, db[:groups_records].count
- expected_group_id = nil
- db[:groups_records].order(:record_id).each do |row|
- expected_group_id = row[:record_id] % 5 + 1
- assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
- end
+ #assert_equal 25, db[:groups_records].count
+ #expected_group_id = nil
+ #db[:groups_records].order(:record_id).each do |row|
+ #expected_group_id = row[:record_id] % 5 + 1
+ #assert_equal expected_group_id, row[:group_id], "Record #{row[:record_id]} should have been in group #{expected_group_id}"
+ #end
  end
  end
  end
@@ -135,9 +135,7 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
  field_2 = stub('field 2', :to_expr => :foo)
  dataset_2.stubs(:field_set).returns({:foo => field_2})
 
- func_expr = stub('function expression') do
- expects(:as).with(:trim_foo_foo).returns(self)
- end
+ func_expr = stub('function expression')
  func = stub('function', :static? => false, :to_expr => func_expr)
  Linkage::Functions::Trim.expects(:new).with(field_1).returns(func)
  merged_field = stub('merged field', :name => :trim_foo_foo)
@@ -147,12 +145,10 @@ class UnitTests::TestConfiguration < Test::Unit::TestCase
  c.configure do
  trim(lhs[:foo]).must == rhs[:foo]
  end
- dataset_1.expects(:select_more).with(func_expr).returns(dataset_1)
- dataset_1.expects(:order_more).with(func_expr).returns(dataset_1)
+ dataset_1.expects(:match).with(func_expr, :trim_foo_foo).returns(dataset_1)
  c.expectations[0].apply_to(dataset_1, :lhs)
 
- dataset_2.expects(:select_more).with(:foo.as(:trim_foo_foo)).returns(dataset_2)
- dataset_2.expects(:order_more).with(:foo).returns(dataset_2)
+ dataset_2.expects(:match).with(:foo, :trim_foo_foo).returns(dataset_2)
  c.expectations[0].apply_to(dataset_2, :rhs)
  end
 
@@ -34,4 +34,55 @@ class UnitTests::TestDataset < Test::Unit::TestCase
  @database.expects(:adapter_scheme).returns(:foo)
  assert_equal :foo, ds.adapter_scheme
  end
+
+ test "add match expression" do
+ ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
+ @dataset.expects(:clone).returns(@dataset)
+ ds_2 = ds_1.match(:foo)
+ assert_not_same ds_1, ds_2
+ assert_not_equal ds_1.instance_variable_get(:@_match),
+ ds_2.instance_variable_get(:@_match)
+ end
+
+ test "add match expression with alias, then each_group" do
+ ds_1 = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
+ @dataset.expects(:clone).returns(@dataset)
+ ds_2 = ds_1.match(:foo, :aliased_foo)
+ @dataset.expects(:group_and_count).with(:foo.as(:aliased_foo)).returns(@dataset)
+ @dataset.expects(:having).returns(@dataset)
+ @dataset.expects(:each).yields({:aliased_foo => 123, :count => 1})
+ ds_2.each_group { |g| }
+ end
+
+ test "group_by_matches" do
+ ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
+
+ @dataset.expects(:clone).returns(@dataset)
+ ds = ds.match(:foo)
+ @dataset.expects(:group).with(:foo).returns(@dataset)
+
+ ds.group_by_matches
+ end
+
+ test "dataset_for_group" do
+ ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
+ @dataset.expects(:clone).returns(@dataset)
+ ds = ds.match(:foo, :foo_bar)
+
+ group = stub("group", :values => {:foo_bar => 'baz'})
+ filtered_dataset = stub('filtered dataset')
+ @dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
+ assert_equal filtered_dataset, ds.dataset_for_group(group)
+ end
+
+ test "dataset_for_group without aliases" do
+ ds = Linkage::Dataset.new('foo:/bar', "foo", {:foo => 'bar'})
+ @dataset.expects(:clone).returns(@dataset)
+ ds = ds.match(:foo)
+
+ group = stub("group", :values => {:foo => 'baz'})
+ filtered_dataset = stub('filtered dataset')
+ @dataset.expects(:filter).with(:foo => 'baz').returns(filtered_dataset)
+ assert_equal filtered_dataset, ds.dataset_for_group(group)
+ end
  end
@@ -1,21 +1,10 @@
  require 'helper'
 
  class UnitTests::TestGroup < Test::Unit::TestCase
- test "matches?" do
- g = Linkage::Group.new(:test => 'test')
- assert g.matches?({:test => 'test'})
- assert !g.matches?({:foo => 'bar'})
- end
-
- test "add_record adds a record" do
- g = Linkage::Group.new(:test => 'test')
- g.add_record(123)
- assert_equal [123], g.records
- end
-
- test "count returns number of records" do
- g = Linkage::Group.new(:test => 'test')
- g.add_record(123)
+ test "initialize" do
+ g = Linkage::Group.new({:test => 'test'}, {:count => 1, :id => 456})
+ assert_equal({:test => 'test'}, g.values)
  assert_equal 1, g.count
+ assert_equal 456, g.id
  end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: linkage
  version: !ruby/object:Gem::Version
- version: 0.0.5
+ version: 0.0.6
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-03-30 00:00:00.000000000 Z
+ date: 2012-05-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: sequel
@@ -307,7 +307,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: -1705215583013388953
+ hash: -1901911346636016746
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
@@ -316,7 +316,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.18
+ rubygems_version: 1.8.23
  signing_key:
  specification_version: 3
  summary: Record linkage library