loose_tight_dictionary 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/loose_tight_dictionary.rb +21 -17
- data/lib/loose_tight_dictionary/blocking.rb +3 -3
- data/lib/loose_tight_dictionary/identity.rb +1 -1
- data/lib/loose_tight_dictionary/result.rb +3 -3
- data/lib/loose_tight_dictionary/tightener.rb +1 -1
- data/lib/loose_tight_dictionary/version.rb +1 -1
- data/loose_tight_dictionary.gemspec +1 -1
- data/test/test_blocking.rb +5 -5
- data/test/test_loose_tight_dictionary.rb +19 -0
- metadata +8 -6
|
@@ -76,10 +76,12 @@ class LooseTightDictionary
|
|
|
76
76
|
end
|
|
77
77
|
end
|
|
78
78
|
|
|
79
|
-
|
|
79
|
+
joint, disjoint = if blockings.any?
|
|
80
80
|
haystack.partition do |straw|
|
|
81
|
-
|
|
82
|
-
blocking.
|
|
81
|
+
if first_blocking_decides
|
|
82
|
+
blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
|
|
83
|
+
else
|
|
84
|
+
blockings.any? { |blocking| blocking.join? needle, straw }
|
|
83
85
|
end
|
|
84
86
|
end
|
|
85
87
|
else
|
|
@@ -87,25 +89,25 @@ class LooseTightDictionary
|
|
|
87
89
|
end
|
|
88
90
|
|
|
89
91
|
# special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
|
|
90
|
-
if
|
|
91
|
-
|
|
92
|
-
|
|
92
|
+
if joint.none?
|
|
93
|
+
joint = disjoint
|
|
94
|
+
disjoint = []
|
|
93
95
|
end
|
|
94
96
|
|
|
95
97
|
if gather_last_result
|
|
96
|
-
last_result.
|
|
97
|
-
last_result.
|
|
98
|
+
last_result.joint = joint
|
|
99
|
+
last_result.disjoint = disjoint
|
|
98
100
|
end
|
|
99
101
|
|
|
100
102
|
possibly_identical, certainly_different = if identities.any?
|
|
101
|
-
|
|
103
|
+
joint.partition do |straw|
|
|
102
104
|
identities.all? do |identity|
|
|
103
105
|
answer = identity.identical? needle, straw
|
|
104
106
|
answer.nil? or answer == true
|
|
105
107
|
end
|
|
106
108
|
end
|
|
107
109
|
else
|
|
108
|
-
[
|
|
110
|
+
[ joint.dup, [] ]
|
|
109
111
|
end
|
|
110
112
|
|
|
111
113
|
if gather_last_result
|
|
@@ -117,9 +119,7 @@ class LooseTightDictionary
|
|
|
117
119
|
return possibly_identical.map { |straw| straw.record }
|
|
118
120
|
end
|
|
119
121
|
|
|
120
|
-
similarities = possibly_identical.map
|
|
121
|
-
needle.similarity straw
|
|
122
|
-
end.sort
|
|
122
|
+
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort
|
|
123
123
|
|
|
124
124
|
best_similarity = similarities[-1]
|
|
125
125
|
straw = best_similarity.wrapper2
|
|
@@ -164,13 +164,13 @@ class LooseTightDictionary
|
|
|
164
164
|
log "-" * 150
|
|
165
165
|
log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
|
|
166
166
|
log
|
|
167
|
-
log "
|
|
167
|
+
log "Joint"
|
|
168
168
|
log "-" * 150
|
|
169
|
-
log last_result.
|
|
169
|
+
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.to_str }.join("\n")
|
|
170
170
|
log
|
|
171
|
-
log "
|
|
171
|
+
log "Disjoint"
|
|
172
172
|
log "-" * 150
|
|
173
|
-
log last_result.
|
|
173
|
+
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.to_str }.join("\n")
|
|
174
174
|
log
|
|
175
175
|
log "Possibly identical"
|
|
176
176
|
log "-" * 150
|
|
@@ -196,6 +196,10 @@ class LooseTightDictionary
|
|
|
196
196
|
def must_match_blocking
|
|
197
197
|
options[:must_match_blocking] || false
|
|
198
198
|
end
|
|
199
|
+
|
|
200
|
+
def first_blocking_decides
|
|
201
|
+
options[:first_blocking_decides] || false
|
|
202
|
+
end
|
|
199
203
|
|
|
200
204
|
def tighteners
|
|
201
205
|
@tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
|
|
@@ -10,18 +10,18 @@ class LooseTightDictionary
|
|
|
10
10
|
attr_reader :regexp
|
|
11
11
|
|
|
12
12
|
def initialize(regexp_or_str)
|
|
13
|
-
@regexp = regexp_or_str.
|
|
13
|
+
@regexp = regexp_or_str.to_regexp
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def match?(str)
|
|
17
17
|
!!(regexp.match(str))
|
|
18
18
|
end
|
|
19
19
|
|
|
20
|
-
# If a blocking "
|
|
20
|
+
# If a blocking "joins" two strings, that means they both fit into it.
|
|
21
21
|
#
|
|
22
22
|
# Returns false if they certainly don't fit this blocking.
|
|
23
23
|
# Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
|
|
24
|
-
def
|
|
24
|
+
def join?(str1, str2)
|
|
25
25
|
if str2_match_data = regexp.match(str2)
|
|
26
26
|
if str1_match_data = regexp.match(str1)
|
|
27
27
|
str2_match_data.captures == str1_match_data.captures
|
|
@@ -5,7 +5,7 @@ class LooseTightDictionary
|
|
|
5
5
|
attr_reader :regexp
|
|
6
6
|
|
|
7
7
|
def initialize(regexp_or_str)
|
|
8
|
-
@regexp = regexp_or_str.
|
|
8
|
+
@regexp = regexp_or_str.to_regexp
|
|
9
9
|
end
|
|
10
10
|
|
|
11
11
|
# Two strings are "identical" if they both match this identity and the captures are equal.
|
|
@@ -4,8 +4,8 @@ class LooseTightDictionary
|
|
|
4
4
|
attr_accessor :tighteners
|
|
5
5
|
attr_accessor :blockings
|
|
6
6
|
attr_accessor :identities
|
|
7
|
-
attr_accessor :
|
|
8
|
-
attr_accessor :
|
|
7
|
+
attr_accessor :joint
|
|
8
|
+
attr_accessor :disjoint
|
|
9
9
|
attr_accessor :possibly_identical
|
|
10
10
|
attr_accessor :certainly_different
|
|
11
11
|
attr_accessor :similarities
|
|
@@ -13,7 +13,7 @@ class LooseTightDictionary
|
|
|
13
13
|
attr_accessor :score
|
|
14
14
|
|
|
15
15
|
def haystack
|
|
16
|
-
|
|
16
|
+
joint + disjoint
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def free
|
|
@@ -4,7 +4,7 @@ class LooseTightDictionary
|
|
|
4
4
|
attr_reader :regexp
|
|
5
5
|
|
|
6
6
|
def initialize(regexp_or_str)
|
|
7
|
-
@regexp = regexp_or_str.
|
|
7
|
+
@regexp = regexp_or_str.to_regexp
|
|
8
8
|
end
|
|
9
9
|
|
|
10
10
|
# A tightener applies when its regexp matches and captures a new (shorter) string
|
data/test/test_blocking.rb
CHANGED
|
@@ -6,18 +6,18 @@ class TestBlocking < Test::Unit::TestCase
|
|
|
6
6
|
assert_equal true, b.match?('2 apples')
|
|
7
7
|
end
|
|
8
8
|
|
|
9
|
-
def
|
|
9
|
+
def test_002_join_both
|
|
10
10
|
b = LooseTightDictionary::Blocking.new %r{apple}
|
|
11
|
-
assert_equal true, b.
|
|
11
|
+
assert_equal true, b.join?('apple', '2 apples')
|
|
12
12
|
end
|
|
13
13
|
|
|
14
|
-
def
|
|
14
|
+
def test_002_doesnt_join_both
|
|
15
15
|
b = LooseTightDictionary::Blocking.new %r{apple}
|
|
16
|
-
assert_equal false, b.
|
|
16
|
+
assert_equal false, b.join?('orange', '2 apples')
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def test_003_no_information
|
|
20
20
|
b = LooseTightDictionary::Blocking.new %r{apple}
|
|
21
|
-
assert_equal nil, b.
|
|
21
|
+
assert_equal nil, b.join?('orange', 'orange')
|
|
22
22
|
end
|
|
23
23
|
end
|
|
@@ -78,4 +78,23 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
|
78
78
|
assert_equal ['X', 'X22' ], d.find_all('X')
|
|
79
79
|
assert_equal [], d.find_all('A')
|
|
80
80
|
end
|
|
81
|
+
|
|
82
|
+
def test_013_first_blocking_decides
|
|
83
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
|
|
84
|
+
assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
|
|
85
|
+
|
|
86
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
|
87
|
+
assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
|
|
88
|
+
|
|
89
|
+
# first_blocking_decides refers to the needle
|
|
90
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
|
91
|
+
assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing ER6')
|
|
92
|
+
|
|
93
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
|
94
|
+
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
|
95
|
+
|
|
96
|
+
# or equivalently with an identity
|
|
97
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
|
98
|
+
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
|
99
|
+
end
|
|
81
100
|
end
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: loose_tight_dictionary
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
hash:
|
|
4
|
+
hash: 21
|
|
5
5
|
prerelease:
|
|
6
6
|
segments:
|
|
7
7
|
- 0
|
|
8
8
|
- 2
|
|
9
|
-
- 0
|
|
10
|
-
version: 0.2.0
|
|
9
|
+
- 1
|
|
10
|
+
version: 0.2.1
|
|
11
11
|
platform: ruby
|
|
12
12
|
authors:
|
|
13
13
|
- Seamus Abshere
|
|
@@ -15,7 +15,7 @@ autorequire:
|
|
|
15
15
|
bindir: bin
|
|
16
16
|
cert_chain: []
|
|
17
17
|
|
|
18
|
-
date: 2011-04-
|
|
18
|
+
date: 2011-04-28 00:00:00 Z
|
|
19
19
|
dependencies:
|
|
20
20
|
- !ruby/object:Gem::Dependency
|
|
21
21
|
name: shoulda
|
|
@@ -83,10 +83,12 @@ dependencies:
|
|
|
83
83
|
requirements:
|
|
84
84
|
- - ">="
|
|
85
85
|
- !ruby/object:Gem::Version
|
|
86
|
-
hash:
|
|
86
|
+
hash: 25
|
|
87
87
|
segments:
|
|
88
88
|
- 0
|
|
89
|
-
|
|
89
|
+
- 0
|
|
90
|
+
- 3
|
|
91
|
+
version: 0.0.3
|
|
90
92
|
type: :runtime
|
|
91
93
|
version_requirements: *id005
|
|
92
94
|
description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.
|