loose_tight_dictionary 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,10 +76,12 @@ class LooseTightDictionary
76
76
  end
77
77
  end
78
78
 
79
- encompassed, unencompassed = if blockings.any?
79
+ joint, disjoint = if blockings.any?
80
80
  haystack.partition do |straw|
81
- blockings.any? do |blocking|
82
- blocking.encompass?(needle, straw) == true
81
+ if first_blocking_decides
82
+ blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
83
+ else
84
+ blockings.any? { |blocking| blocking.join? needle, straw }
83
85
  end
84
86
  end
85
87
  else
@@ -87,25 +89,25 @@ class LooseTightDictionary
87
89
  end
88
90
 
89
91
  # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
90
- if encompassed.none?
91
- encompassed = unencompassed
92
- unencompassed = []
92
+ if joint.none?
93
+ joint = disjoint
94
+ disjoint = []
93
95
  end
94
96
 
95
97
  if gather_last_result
96
- last_result.encompassed = encompassed
97
- last_result.unencompassed = unencompassed
98
+ last_result.joint = joint
99
+ last_result.disjoint = disjoint
98
100
  end
99
101
 
100
102
  possibly_identical, certainly_different = if identities.any?
101
- encompassed.partition do |straw|
103
+ joint.partition do |straw|
102
104
  identities.all? do |identity|
103
105
  answer = identity.identical? needle, straw
104
106
  answer.nil? or answer == true
105
107
  end
106
108
  end
107
109
  else
108
- [ encompassed.dup, [] ]
110
+ [ joint.dup, [] ]
109
111
  end
110
112
 
111
113
  if gather_last_result
@@ -117,9 +119,7 @@ class LooseTightDictionary
117
119
  return possibly_identical.map { |straw| straw.record }
118
120
  end
119
121
 
120
- similarities = possibly_identical.map do |straw|
121
- needle.similarity straw
122
- end.sort
122
+ similarities = possibly_identical.map { |straw| needle.similarity straw }.sort
123
123
 
124
124
  best_similarity = similarities[-1]
125
125
  straw = best_similarity.wrapper2
@@ -164,13 +164,13 @@ class LooseTightDictionary
164
164
  log "-" * 150
165
165
  log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
166
166
  log
167
- log "Included"
167
+ log "Joint"
168
168
  log "-" * 150
169
- log last_result.encompassed.blank? ? '(none)' : last_result.encompassed.map { |encompassed| encompassed.to_str }.join("\n")
169
+ log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.to_str }.join("\n")
170
170
  log
171
- log "Ignored"
171
+ log "Disjoint"
172
172
  log "-" * 150
173
- log last_result.unencompassed.blank? ? '(none)' : last_result.unencompassed.map { |unencompassed| unencompassed.to_str }.join("\n")
173
+ log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.to_str }.join("\n")
174
174
  log
175
175
  log "Possibly identical"
176
176
  log "-" * 150
@@ -196,6 +196,10 @@ class LooseTightDictionary
196
196
  def must_match_blocking
197
197
  options[:must_match_blocking] || false
198
198
  end
199
+
200
+ def first_blocking_decides
201
+ options[:first_blocking_decides] || false
202
+ end
199
203
 
200
204
  def tighteners
201
205
  @tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
@@ -10,18 +10,18 @@ class LooseTightDictionary
10
10
  attr_reader :regexp
11
11
 
12
12
  def initialize(regexp_or_str)
13
- @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
13
+ @regexp = regexp_or_str.to_regexp
14
14
  end
15
15
 
16
16
  def match?(str)
17
17
  !!(regexp.match(str))
18
18
  end
19
19
 
20
- # If a blocking "encompasses" two strings, that means they both fit into it.
20
+ # If a blocking "joins" two strings, that means they both fit into it.
21
21
  #
22
22
  # Returns false if they certainly don't fit this blocking.
23
23
  # Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
24
- def encompass?(str1, str2)
24
+ def join?(str1, str2)
25
25
  if str2_match_data = regexp.match(str2)
26
26
  if str1_match_data = regexp.match(str1)
27
27
  str2_match_data.captures == str1_match_data.captures
@@ -5,7 +5,7 @@ class LooseTightDictionary
5
5
  attr_reader :regexp
6
6
 
7
7
  def initialize(regexp_or_str)
8
- @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
8
+ @regexp = regexp_or_str.to_regexp
9
9
  end
10
10
 
11
11
  # Two strings are "identical" if they both match this identity and the captures are equal.
@@ -4,8 +4,8 @@ class LooseTightDictionary
4
4
  attr_accessor :tighteners
5
5
  attr_accessor :blockings
6
6
  attr_accessor :identities
7
- attr_accessor :encompassed
8
- attr_accessor :unencompassed
7
+ attr_accessor :joint
8
+ attr_accessor :disjoint
9
9
  attr_accessor :possibly_identical
10
10
  attr_accessor :certainly_different
11
11
  attr_accessor :similarities
@@ -13,7 +13,7 @@ class LooseTightDictionary
13
13
  attr_accessor :score
14
14
 
15
15
  def haystack
16
- encompassed + unencompassed
16
+ joint + disjoint
17
17
  end
18
18
 
19
19
  def free
@@ -4,7 +4,7 @@ class LooseTightDictionary
4
4
  attr_reader :regexp
5
5
 
6
6
  def initialize(regexp_or_str)
7
- @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
7
+ @regexp = regexp_or_str.to_regexp
8
8
  end
9
9
 
10
10
  # A tightener applies when its regexp matches and captures a new (shorter) string
@@ -1,3 +1,3 @@
1
1
  class LooseTightDictionary
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end
@@ -23,5 +23,5 @@ Gem::Specification.new do |s|
23
23
  s.add_development_dependency "remote_table"
24
24
  s.add_dependency 'activesupport', '>=2.3.4'
25
25
  s.add_dependency 'amatch'
26
- s.add_dependency 'to_regexp'
26
+ s.add_dependency 'to_regexp', '>=0.0.3'
27
27
  end
@@ -6,18 +6,18 @@ class TestBlocking < Test::Unit::TestCase
6
6
  assert_equal true, b.match?('2 apples')
7
7
  end
8
8
 
9
- def test_002_encompass_both
9
+ def test_002_join_both
10
10
  b = LooseTightDictionary::Blocking.new %r{apple}
11
- assert_equal true, b.encompass?('apple', '2 apples')
11
+ assert_equal true, b.join?('apple', '2 apples')
12
12
  end
13
13
 
14
- def test_002_doesnt_encompass_both
14
+ def test_002_doesnt_join_both
15
15
  b = LooseTightDictionary::Blocking.new %r{apple}
16
- assert_equal false, b.encompass?('orange', '2 apples')
16
+ assert_equal false, b.join?('orange', '2 apples')
17
17
  end
18
18
 
19
19
  def test_003_no_information
20
20
  b = LooseTightDictionary::Blocking.new %r{apple}
21
- assert_equal nil, b.encompass?('orange', 'orange')
21
+ assert_equal nil, b.join?('orange', 'orange')
22
22
  end
23
23
  end
@@ -78,4 +78,23 @@ class TestLooseTightDictionary < Test::Unit::TestCase
78
78
  assert_equal ['X', 'X22' ], d.find_all('X')
79
79
  assert_equal [], d.find_all('A')
80
80
  end
81
+
82
+ def test_013_first_blocking_decides
83
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
84
+ assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
85
+
86
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
87
+ assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
88
+
89
+ # first_blocking_decides refers to the needle
90
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
91
+ assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing ER6')
92
+
93
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
94
+ assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
95
+
96
+ # or equivalently with an identity
97
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
98
+ assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
99
+ end
81
100
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: loose_tight_dictionary
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 21
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 0
10
- version: 0.2.0
9
+ - 1
10
+ version: 0.2.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Seamus Abshere
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-04-27 00:00:00 Z
18
+ date: 2011-04-28 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: shoulda
@@ -83,10 +83,12 @@ dependencies:
83
83
  requirements:
84
84
  - - ">="
85
85
  - !ruby/object:Gem::Version
86
- hash: 3
86
+ hash: 25
87
87
  segments:
88
88
  - 0
89
- version: "0"
89
+ - 0
90
+ - 3
91
+ version: 0.0.3
90
92
  type: :runtime
91
93
  version_requirements: *id005
92
94
  description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.