loose_tight_dictionary 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/loose_tight_dictionary.rb +21 -17
- data/lib/loose_tight_dictionary/blocking.rb +3 -3
- data/lib/loose_tight_dictionary/identity.rb +1 -1
- data/lib/loose_tight_dictionary/result.rb +3 -3
- data/lib/loose_tight_dictionary/tightener.rb +1 -1
- data/lib/loose_tight_dictionary/version.rb +1 -1
- data/loose_tight_dictionary.gemspec +1 -1
- data/test/test_blocking.rb +5 -5
- data/test/test_loose_tight_dictionary.rb +19 -0
- metadata +8 -6
@@ -76,10 +76,12 @@ class LooseTightDictionary
|
|
76
76
|
end
|
77
77
|
end
|
78
78
|
|
79
|
-
|
79
|
+
joint, disjoint = if blockings.any?
|
80
80
|
haystack.partition do |straw|
|
81
|
-
|
82
|
-
blocking.
|
81
|
+
if first_blocking_decides
|
82
|
+
blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
|
83
|
+
else
|
84
|
+
blockings.any? { |blocking| blocking.join? needle, straw }
|
83
85
|
end
|
84
86
|
end
|
85
87
|
else
|
@@ -87,25 +89,25 @@ class LooseTightDictionary
|
|
87
89
|
end
|
88
90
|
|
89
91
|
# special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
|
90
|
-
if
|
91
|
-
|
92
|
-
|
92
|
+
if joint.none?
|
93
|
+
joint = disjoint
|
94
|
+
disjoint = []
|
93
95
|
end
|
94
96
|
|
95
97
|
if gather_last_result
|
96
|
-
last_result.
|
97
|
-
last_result.
|
98
|
+
last_result.joint = joint
|
99
|
+
last_result.disjoint = disjoint
|
98
100
|
end
|
99
101
|
|
100
102
|
possibly_identical, certainly_different = if identities.any?
|
101
|
-
|
103
|
+
joint.partition do |straw|
|
102
104
|
identities.all? do |identity|
|
103
105
|
answer = identity.identical? needle, straw
|
104
106
|
answer.nil? or answer == true
|
105
107
|
end
|
106
108
|
end
|
107
109
|
else
|
108
|
-
[
|
110
|
+
[ joint.dup, [] ]
|
109
111
|
end
|
110
112
|
|
111
113
|
if gather_last_result
|
@@ -117,9 +119,7 @@ class LooseTightDictionary
|
|
117
119
|
return possibly_identical.map { |straw| straw.record }
|
118
120
|
end
|
119
121
|
|
120
|
-
similarities = possibly_identical.map
|
121
|
-
needle.similarity straw
|
122
|
-
end.sort
|
122
|
+
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort
|
123
123
|
|
124
124
|
best_similarity = similarities[-1]
|
125
125
|
straw = best_similarity.wrapper2
|
@@ -164,13 +164,13 @@ class LooseTightDictionary
|
|
164
164
|
log "-" * 150
|
165
165
|
log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n")
|
166
166
|
log
|
167
|
-
log "
|
167
|
+
log "Joint"
|
168
168
|
log "-" * 150
|
169
|
-
log last_result.
|
169
|
+
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.to_str }.join("\n")
|
170
170
|
log
|
171
|
-
log "
|
171
|
+
log "Disjoint"
|
172
172
|
log "-" * 150
|
173
|
-
log last_result.
|
173
|
+
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.to_str }.join("\n")
|
174
174
|
log
|
175
175
|
log "Possibly identical"
|
176
176
|
log "-" * 150
|
@@ -196,6 +196,10 @@ class LooseTightDictionary
|
|
196
196
|
def must_match_blocking
|
197
197
|
options[:must_match_blocking] || false
|
198
198
|
end
|
199
|
+
|
200
|
+
def first_blocking_decides
|
201
|
+
options[:first_blocking_decides] || false
|
202
|
+
end
|
199
203
|
|
200
204
|
def tighteners
|
201
205
|
@tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
|
@@ -10,18 +10,18 @@ class LooseTightDictionary
|
|
10
10
|
attr_reader :regexp
|
11
11
|
|
12
12
|
def initialize(regexp_or_str)
|
13
|
-
@regexp = regexp_or_str.
|
13
|
+
@regexp = regexp_or_str.to_regexp
|
14
14
|
end
|
15
15
|
|
16
16
|
def match?(str)
|
17
17
|
!!(regexp.match(str))
|
18
18
|
end
|
19
19
|
|
20
|
-
# If a blocking "
|
20
|
+
# If a blocking "joins" two strings, that means they both fit into it.
|
21
21
|
#
|
22
22
|
# Returns false if they certainly don't fit this blocking.
|
23
23
|
# Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
|
24
|
-
def
|
24
|
+
def join?(str1, str2)
|
25
25
|
if str2_match_data = regexp.match(str2)
|
26
26
|
if str1_match_data = regexp.match(str1)
|
27
27
|
str2_match_data.captures == str1_match_data.captures
|
@@ -5,7 +5,7 @@ class LooseTightDictionary
|
|
5
5
|
attr_reader :regexp
|
6
6
|
|
7
7
|
def initialize(regexp_or_str)
|
8
|
-
@regexp = regexp_or_str.
|
8
|
+
@regexp = regexp_or_str.to_regexp
|
9
9
|
end
|
10
10
|
|
11
11
|
# Two strings are "identical" if they both match this identity and the captures are equal.
|
@@ -4,8 +4,8 @@ class LooseTightDictionary
|
|
4
4
|
attr_accessor :tighteners
|
5
5
|
attr_accessor :blockings
|
6
6
|
attr_accessor :identities
|
7
|
-
attr_accessor :
|
8
|
-
attr_accessor :
|
7
|
+
attr_accessor :joint
|
8
|
+
attr_accessor :disjoint
|
9
9
|
attr_accessor :possibly_identical
|
10
10
|
attr_accessor :certainly_different
|
11
11
|
attr_accessor :similarities
|
@@ -13,7 +13,7 @@ class LooseTightDictionary
|
|
13
13
|
attr_accessor :score
|
14
14
|
|
15
15
|
def haystack
|
16
|
-
|
16
|
+
joint + disjoint
|
17
17
|
end
|
18
18
|
|
19
19
|
def free
|
@@ -4,7 +4,7 @@ class LooseTightDictionary
|
|
4
4
|
attr_reader :regexp
|
5
5
|
|
6
6
|
def initialize(regexp_or_str)
|
7
|
-
@regexp = regexp_or_str.
|
7
|
+
@regexp = regexp_or_str.to_regexp
|
8
8
|
end
|
9
9
|
|
10
10
|
# A tightener applies when its regexp matches and captures a new (shorter) string
|
data/test/test_blocking.rb
CHANGED
@@ -6,18 +6,18 @@ class TestBlocking < Test::Unit::TestCase
|
|
6
6
|
assert_equal true, b.match?('2 apples')
|
7
7
|
end
|
8
8
|
|
9
|
-
def
|
9
|
+
def test_002_join_both
|
10
10
|
b = LooseTightDictionary::Blocking.new %r{apple}
|
11
|
-
assert_equal true, b.
|
11
|
+
assert_equal true, b.join?('apple', '2 apples')
|
12
12
|
end
|
13
13
|
|
14
|
-
def
|
14
|
+
def test_002_doesnt_join_both
|
15
15
|
b = LooseTightDictionary::Blocking.new %r{apple}
|
16
|
-
assert_equal false, b.
|
16
|
+
assert_equal false, b.join?('orange', '2 apples')
|
17
17
|
end
|
18
18
|
|
19
19
|
def test_003_no_information
|
20
20
|
b = LooseTightDictionary::Blocking.new %r{apple}
|
21
|
-
assert_equal nil, b.
|
21
|
+
assert_equal nil, b.join?('orange', 'orange')
|
22
22
|
end
|
23
23
|
end
|
@@ -78,4 +78,23 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
78
78
|
assert_equal ['X', 'X22' ], d.find_all('X')
|
79
79
|
assert_equal [], d.find_all('A')
|
80
80
|
end
|
81
|
+
|
82
|
+
def test_013_first_blocking_decides
|
83
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ]
|
84
|
+
assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing 747')
|
85
|
+
|
86
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
87
|
+
assert_equal [ 'Boeing 747', 'Boeing 747SR' ], d.find_all('Boeing 747')
|
88
|
+
|
89
|
+
# first_blocking_decides refers to the needle
|
90
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true
|
91
|
+
assert_equal [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], d.find_all('Boeing ER6')
|
92
|
+
|
93
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
94
|
+
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
95
|
+
|
96
|
+
# or equivalently with an identity
|
97
|
+
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
98
|
+
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
99
|
+
end
|
81
100
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: loose_tight_dictionary
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Seamus Abshere
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-04-
|
18
|
+
date: 2011-04-28 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: shoulda
|
@@ -83,10 +83,12 @@ dependencies:
|
|
83
83
|
requirements:
|
84
84
|
- - ">="
|
85
85
|
- !ruby/object:Gem::Version
|
86
|
-
hash:
|
86
|
+
hash: 25
|
87
87
|
segments:
|
88
88
|
- 0
|
89
|
-
|
89
|
+
- 0
|
90
|
+
- 3
|
91
|
+
version: 0.0.3
|
90
92
|
type: :runtime
|
91
93
|
version_requirements: *id005
|
92
94
|
description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.
|