loose_tight_dictionary 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -76,10 +76,12 @@ class LooseTightDictionary
76
76
  end
77
77
  end
78
78
 
79
- encompassed, unencompassed = if blockings.any?
79
+ joint, disjoint = if blockings.any?
80
80
  haystack.partition do |straw|
81
- blockings.any? do |blocking|
82
- blocking.encompass?(needle, straw) == true
81
+ if first_blocking_decides
82
+ blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
83
+ else
84
+ blockings.any? { |blocking| blocking.join? needle, straw }
83
85
  end
84
86
  end
85
87
  else
@@ -87,25 +89,25 @@ class LooseTightDictionary
87
89
  end
88
90
 
89
91
  # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
90
- if encompassed.none?
91
- encompassed = unencompassed
92
- unencompassed = []
92
+ if joint.none?
93
+ joint = disjoint
94
+ disjoint = []
93
95
  end
94
96
 
95
97
  if gather_last_result
96
- last_result.encompassed = encompassed
97
- last_result.unencompassed = unencompassed
98
+ last_result.joint = joint
99
+ last_result.disjoint = disjoint
98
100
  end
99
101
 
100
102
  possibly_identical, certainly_different = if identities.any?
101
- encompassed.partition do |straw|
103
+ joint.partition do |straw|
102
104
  identities.all? do |identity|
103
105
  answer = identity.identical? needle, straw
104
106
  answer.nil? or answer == true
105
107
  end
106
108
  end
107
109
  else
108
- [ encompassed.dup, [] ]
110
+ [ joint.dup, [] ]
109
111
  end
110
112
 
111
113
  if gather_last_result
@@ -117,9 +119,7 @@ class LooseTightDictionary
117
119
  return possibly_identical.map { |straw| straw.record }
118
120
  end
119
121
 
120
- similarities = possibly_identical.map do |straw|
121
- needle.similarity straw
122
- end.sort
122
+ similarities = possibly_identical.map { |straw| needle.similarity straw }.sort
123
123
 
124
124
  best_similarity = similarities[-1]
125
125
  straw = best_similarity.wrapper2
@@ -164,13 +164,13 @@ class LooseTightDictionary
164
164
  log "-" * 150
165
165
  log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
166
166
  log
167
- log "Included"
167
+ log "Joint"
168
168
  log "-" * 150
169
- log last_result.encompassed.blank? ? '(none)' : last_result.encompassed.map { |encompassed| encompassed.to_str }.join("\n")
169
+ log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.to_str }.join("\n")
170
170
  log
171
- log "Ignored"
171
+ log "Disjoint"
172
172
  log "-" * 150
173
- log last_result.unencompassed.blank? ? '(none)' : last_result.unencompassed.map { |unencompassed| unencompassed.to_str }.join("\n")
173
+ log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.to_str }.join("\n")
174
174
  log
175
175
  log "Possibly identical"
176
176
  log "-" * 150
@@ -196,6 +196,10 @@ class LooseTightDictionary
196
196
  def must_match_blocking
197
197
  options[:must_match_blocking] || false
198
198
  end
199
+
200
+ def first_blocking_decides
201
+ options[:first_blocking_decides] || false
202
+ end
199
203
 
200
204
  def tighteners
201
205
  @tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
@@ -10,18 +10,18 @@ class LooseTightDictionary
10
10
  attr_reader :regexp
11
11
 
12
12
  def initialize(regexp_or_str)
13
- @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
13
+ @regexp = regexp_or_str.to_regexp
14
14
  end
15
15
 
16
16
  def match?(str)
17
17
  !!(regexp.match(str))
18
18
  end
19
19
 
20
- # If a blocking "encompasses" two strings, that means they both fit into it.
20
+ # If a blocking "joins" two strings, that means they both fit into it.
21
21
  #
22
22
  # Returns false if they certainly don't fit this blocking.
23
23
  # Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
24
- def encompass?(str1, str2)
24
+ def join?(str1, str2)
25
25
  if str2_match_data = regexp.match(str2)
26
26
  if str1_match_data = regexp.match(str1)
27
27
  str2_match_data.captures == str1_match_data.captures
@@ -5,7 +5,7 @@ class LooseTightDictionary
5
5
  attr_reader :regexp
6
6
 
7
7
  def initialize(regexp_or_str)
8
- @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
8
+ @regexp = regexp_or_str.to_regexp
9
9
  end
10
10
 
11
11
  # Two strings are "identical" if they both match this identity and the captures are equal.
@@ -4,8 +4,8 @@ class LooseTightDictionary
4
4
  attr_accessor :tighteners
5
5
  attr_accessor :blockings
6
6
  attr_accessor :identities
7
- attr_accessor :encompassed
8
- attr_accessor :unencompassed
7
+ attr_accessor :joint
8
+ attr_accessor :disjoint
9
9
  attr_accessor :possibly_identical
10
10
  attr_accessor :certainly_different
11
11
  attr_accessor :similarities
@@ -13,7 +13,7 @@ class LooseTightDictionary
13
13
  attr_accessor :score
14
14
 
15
15
  def haystack
16
- encompassed + unencompassed
16
+ joint + disjoint
17
17
  end
18
18
 
19
19
  def free
@@ -4,7 +4,7 @@ class LooseTightDictionary
4
4
  attr_reader :regexp
5
5
 
6
6
  def initialize(regexp_or_str)
7
- @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
7
+ @regexp = regexp_or_str.to_regexp
8
8
  end
9
9
 
10
10
  # A tightener applies when its regexp matches and captures a new (shorter) string
@@ -1,3 +1,3 @@
1
1
  class LooseTightDictionary
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end
@@ -23,5 +23,5 @@ Gem::Specification.new do |s|
23
23
  s.add_development_dependency "remote_table"
24
24
  s.add_dependency 'activesupport', '>=2.3.4'
25
25
  s.add_dependency 'amatch'
26
- s.add_dependency 'to_regexp'
26
+ s.add_dependency 'to_regexp', '>=0.0.3'
27
27
  end
@@ -6,18 +6,18 @@ class TestBlocking < Test::Unit::TestCase
6
6
  assert_equal true, b.match?('2 apples')
7
7
  end
8
8
 
9
- def test_002_encompass_both
9
+ def test_002_join_both
10
10
  b = LooseTightDictionary::Blocking.new %r{apple}
11
- assert_equal true, b.encompass?('apple', '2 apples')
11
+ assert_equal true, b.join?('apple', '2 apples')
12
12
  end
13
13
 
14
- def test_002_doesnt_encompass_both
14
+ def test_002_doesnt_join_both
15
15
  b = LooseTightDictionary::Blocking.new %r{apple}
16
- assert_equal false, b.encompass?('orange', '2 apples')
16
+ assert_equal false, b.join?('orange', '2 apples')
17
17
  end
18
18
 
19
19
  def test_003_no_information
20
20
  b = LooseTightDictionary::Blocking.new %r{apple}
21
- assert_equal nil, b.encompass?('orange', 'orange')
21
+ assert_equal nil, b.join?('orange', 'orange')
22
22
  end
23
23
  end
@@ -78,4 +78,23 @@ class TestLooseTightDictionary < Test::Unit::TestCase
78
78
  assert_equal ['X', 'X22' ], d.find_all('X')
79
79
  assert_equal [], d.find_all('A')
80
80
  end
81
+
82
+ def test_013_first_blocking_decides
83
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
84
+ assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
85
+
86
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
87
+ assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
88
+
89
+ # first_blocking_decides refers to the needle: only the first blocking that matches the needle is consulted
90
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
91
+ assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing ER6')
92
+
93
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
94
+ assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
95
+
96
+ # or equivalently with an identity
97
+ d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
98
+ assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
99
+ end
81
100
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: loose_tight_dictionary
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 21
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 0
10
- version: 0.2.0
9
+ - 1
10
+ version: 0.2.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Seamus Abshere
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-04-27 00:00:00 Z
18
+ date: 2011-04-28 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: shoulda
@@ -83,10 +83,12 @@ dependencies:
83
83
  requirements:
84
84
  - - ">="
85
85
  - !ruby/object:Gem::Version
86
- hash: 3
86
+ hash: 25
87
87
  segments:
88
88
  - 0
89
- version: "0"
89
+ - 0
90
+ - 3
91
+ version: 0.0.3
90
92
  type: :runtime
91
93
  version_requirements: *id005
92
94
  description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.