loose_tight_dictionary 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +24 -6
- data/examples/bts_aircraft/test_bts_aircraft.rb +0 -5
- data/examples/first_name_matching.rb +1 -1
- data/lib/loose_tight_dictionary/result.rb +1 -0
- data/lib/loose_tight_dictionary/score.rb +66 -20
- data/lib/loose_tight_dictionary/similarity.rb +7 -6
- data/lib/loose_tight_dictionary/stop_word.rb +19 -0
- data/lib/loose_tight_dictionary/version.rb +1 -1
- data/lib/loose_tight_dictionary/wrapper.rb +28 -11
- data/lib/loose_tight_dictionary.rb +48 -48
- data/test/test_loose_tight_dictionary.rb +20 -5
- metadata +21 -20
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= loose_tight_dictionary
|
2
2
|
|
3
|
-
|
3
|
+
Find a needle in a haystack based on string similarity (using the Pair Distance algorithm and Levenshtein distance) and regular expressions.
|
4
4
|
|
5
5
|
== Quickstart
|
6
6
|
|
@@ -11,7 +11,20 @@ Match things based on string similarity (using the Pair Distance algorithm) and
|
|
11
11
|
|
12
12
|
== String similarity matching
|
13
13
|
|
14
|
-
|
14
|
+
Uses the {Dice's Coefficient}[http://en.wikipedia.org/wiki/Dice's_coefficient] algorithm (aka Pair Distance).
|
15
|
+
|
16
|
+
If that judges two strings to be equally similar to a third string, then Levenshtein distance is used. For example, pair distance considers "RATZ" and "CATZ" to be equally similar to "RITZ", so we invoke Levenshtein.
|
17
|
+
|
18
|
+
>> require 'amatch'
|
19
|
+
=> true
|
20
|
+
>> 'RITZ'.pair_distance_similar 'RATZ'
|
21
|
+
=> 0.3333333333333333
|
22
|
+
>> 'RITZ'.pair_distance_similar 'CATZ' # <-- pair distance can't tell the difference, so we fall back to levenshtein...
|
23
|
+
=> 0.3333333333333333
|
24
|
+
>> 'RITZ'.levenshtein_similar 'RATZ'
|
25
|
+
=> 0.75
|
26
|
+
>> 'RITZ'.levenshtein_similar 'CATZ' # <-- which properly shows that RATZ should win
|
27
|
+
=> 0.5
|
15
28
|
|
16
29
|
== Production use
|
17
30
|
|
@@ -36,6 +49,7 @@ You can improve the default matchings with regular expressions.
|
|
36
49
|
* Emphasize important words using <b>blockings</b> and <b>tighteners</b>
|
37
50
|
* Filter out stop words with <b>tighteners</b>
|
38
51
|
* Prevent impossible matches with <b>blockings</b> and <b>identities</b>
|
52
|
+
* Ignore words with <b>stop words</b>
|
39
53
|
|
40
54
|
=== Blockings
|
41
55
|
|
@@ -49,19 +63,23 @@ Adding a tightener like <tt>/(boeing).*(7\d\d)/i</tt> will cause "BOEING COMPANY
|
|
49
63
|
|
50
64
|
Adding an identity like <tt>/(F)\-?(\d50)/</tt> ensures that "Ford F-150" and "Ford F-250" never match.
|
51
65
|
|
66
|
+
=== Stop words
|
67
|
+
|
68
|
+
Adding a stop word like <tt>THE</tt> ensures that it is not taken into account when comparing "THE CAT", "THE DAT", and "THE CATT".
|
69
|
+
|
52
70
|
== Case sensitivity
|
53
71
|
|
54
|
-
Scoring is case-insensitive. Everything is downcased before scoring. This is a change from previous versions.
|
72
|
+
Scoring is case-insensitive. Everything is downcased before scoring. This is a change from previous versions. Your regexps may still be case-sensitive, though.
|
55
73
|
|
56
74
|
== Examples
|
57
75
|
|
58
76
|
Check out the tests.
|
59
77
|
|
60
|
-
== Speed
|
78
|
+
== Speed (and who to thank for the algorithms)
|
61
79
|
|
62
|
-
If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks Flori!
|
80
|
+
If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks {Flori}[https://github.com/flori]!
|
63
81
|
|
64
|
-
Otherwise,
|
82
|
+
Otherwise, pure ruby versions of the string similarity algorithms derived from the {answer to a StackOverflow question}[http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings] and {the text gem}[https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb] are used. Thanks {marzagao}[http://stackoverflow.com/users/10997/marzagao] and {threedaymonk}[https://github.com/threedaymonk]!
|
65
83
|
|
66
84
|
== Authors
|
67
85
|
|
@@ -71,11 +71,6 @@ FINAL_OPTIONS = {
|
|
71
71
|
}
|
72
72
|
|
73
73
|
class TestBtsAircraft < Test::Unit::TestCase
|
74
|
-
should "store the records somewhere" do
|
75
|
-
d = LooseTightDictionary.new HAYSTACK
|
76
|
-
assert d.records.grep(/BOEING 707-100/)
|
77
|
-
end
|
78
|
-
|
79
74
|
should "understand records by using the haystack reader" do
|
80
75
|
d = LooseTightDictionary.new HAYSTACK, FINAL_OPTIONS
|
81
76
|
assert d.haystack.map { |record| record.to_str }.include?('boeing boeing 707-100')
|
@@ -8,7 +8,7 @@ require 'loose_tight_dictionary'
|
|
8
8
|
haystack = [ 'seamus', 'andy', 'ben' ]
|
9
9
|
needles = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT', 'Shamus Heaney' ]
|
10
10
|
|
11
|
-
d = LooseTightDictionary.new haystack
|
11
|
+
d = LooseTightDictionary.new haystack
|
12
12
|
needles.each do |needle|
|
13
13
|
d.explain needle
|
14
14
|
puts
|
@@ -9,40 +9,44 @@ class LooseTightDictionary
|
|
9
9
|
attr_reader :str1, :str2
|
10
10
|
|
11
11
|
def initialize(str1, str2)
|
12
|
-
@str1 = str1
|
13
|
-
@str2 = str2
|
14
|
-
end
|
15
|
-
|
16
|
-
def to_f
|
17
|
-
@to_f ||= dices_coefficient(str1, str2)
|
12
|
+
@str1 = str1.downcase
|
13
|
+
@str2 = str2.downcase
|
18
14
|
end
|
19
15
|
|
20
16
|
def inspect
|
21
|
-
%{#<Score:
|
17
|
+
%{#<Score: dices_coefficient=#{dices_coefficient} levenshtein=#{levenshtein}>}
|
22
18
|
end
|
23
19
|
|
24
20
|
def <=>(other)
|
25
|
-
|
21
|
+
by_dices_coefficient = (dices_coefficient <=> other.dices_coefficient)
|
22
|
+
if by_dices_coefficient == 0
|
23
|
+
levenshtein <=> other.levenshtein
|
24
|
+
else
|
25
|
+
by_dices_coefficient
|
26
|
+
end
|
26
27
|
end
|
27
|
-
|
28
|
-
def
|
29
|
-
|
28
|
+
|
29
|
+
def utf8?
|
30
|
+
return @utf8_query[0] if @utf8_query.is_a?(::Array)
|
31
|
+
@utf8_query = [ (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u') ]
|
32
|
+
@utf8_query[0]
|
30
33
|
end
|
31
34
|
|
32
|
-
private
|
33
|
-
|
34
|
-
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
35
35
|
if defined?(::Amatch)
|
36
|
-
|
37
|
-
|
38
|
-
str2 = str2.downcase
|
36
|
+
|
37
|
+
def dices_coefficient
|
39
38
|
str1.pair_distance_similar str2
|
40
39
|
end
|
40
|
+
|
41
|
+
def levenshtein
|
42
|
+
str1.levenshtein_similar str2
|
43
|
+
end
|
44
|
+
|
41
45
|
else
|
46
|
+
|
42
47
|
SPACE = ' '
|
43
|
-
|
44
|
-
|
45
|
-
str2 = str2.downcase
|
48
|
+
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
49
|
+
def dices_coefficient
|
46
50
|
if str1 == str2
|
47
51
|
return 1.0
|
48
52
|
elsif str1.length == 1 and str2.length == 1
|
@@ -71,6 +75,48 @@ class LooseTightDictionary
|
|
71
75
|
end
|
72
76
|
(2.0 * intersection) / union
|
73
77
|
end
|
78
|
+
|
79
|
+
# extracted/adapted from the text gem version 1.0.2
|
80
|
+
# normalization added for utf-8 strings
|
81
|
+
# lib/text/levenshtein.rb
|
82
|
+
def levenshtein
|
83
|
+
if utf8?
|
84
|
+
unpack_rule = 'U*'
|
85
|
+
else
|
86
|
+
unpack_rule = 'C*'
|
87
|
+
end
|
88
|
+
s = str1.unpack(unpack_rule)
|
89
|
+
t = str2.unpack(unpack_rule)
|
90
|
+
n = s.length
|
91
|
+
m = t.length
|
92
|
+
if n == 0 or m == 0
|
93
|
+
return 0.0
|
94
|
+
end
|
95
|
+
d = (0..m).to_a
|
96
|
+
x = nil
|
97
|
+
(0...n).each do |i|
|
98
|
+
e = i+1
|
99
|
+
(0...m).each do |j|
|
100
|
+
cost = (s[i] == t[j]) ? 0 : 1
|
101
|
+
x = [
|
102
|
+
d[j+1] + 1, # insertion
|
103
|
+
e + 1, # deletion
|
104
|
+
d[j] + cost # substitution
|
105
|
+
].min
|
106
|
+
d[j] = e
|
107
|
+
e = x
|
108
|
+
end
|
109
|
+
d[m] = x
|
110
|
+
end
|
111
|
+
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
112
|
+
# if (b_len > a_len) {
|
113
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
114
|
+
# } else {
|
115
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
116
|
+
# }
|
117
|
+
1.0 - x.to_f / [n, m].max
|
118
|
+
end
|
119
|
+
|
74
120
|
end
|
75
121
|
end
|
76
122
|
end
|
@@ -9,16 +9,17 @@ class LooseTightDictionary
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def <=>(other)
|
12
|
-
|
13
|
-
|
12
|
+
by_score = best_score <=> other.best_score
|
13
|
+
if by_score == 0
|
14
|
+
original_weight <=> other.original_weight
|
14
15
|
else
|
15
|
-
|
16
|
+
by_score
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
19
20
|
# Weight things towards short original strings
|
20
|
-
def
|
21
|
-
@
|
21
|
+
def original_weight
|
22
|
+
@original_weight ||= (1.0 / (wrapper1.render.length * wrapper2.render.length))
|
22
23
|
end
|
23
24
|
|
24
25
|
def best_score
|
@@ -46,7 +47,7 @@ class LooseTightDictionary
|
|
46
47
|
end
|
47
48
|
|
48
49
|
def inspect
|
49
|
-
%{#<Similarity "#{wrapper2.
|
50
|
+
%{#<Similarity "#{wrapper2.render}"=>"#{best_wrapper2_variant}" versus "#{wrapper1.render}"=>"#{best_wrapper1_variant}" original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
|
50
51
|
end
|
51
52
|
end
|
52
53
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class LooseTightDictionary
|
2
|
+
# A stop word is ignored
|
3
|
+
class StopWord
|
4
|
+
attr_reader :regexp
|
5
|
+
|
6
|
+
def initialize(regexp_or_str)
|
7
|
+
@regexp = regexp_or_str.to_regexp
|
8
|
+
end
|
9
|
+
|
10
|
+
# Destructively remove stop words from the string
|
11
|
+
def apply!(str)
|
12
|
+
str.gsub! regexp, ''
|
13
|
+
end
|
14
|
+
|
15
|
+
def inspect
|
16
|
+
"#<StopWord regexp=#{regexp.inspect}>"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -1,22 +1,23 @@
|
|
1
1
|
class LooseTightDictionary
|
2
2
|
# Wrappers are the tokens that are passed around when doing scoring and optimizing.
|
3
3
|
class Wrapper #:nodoc: all
|
4
|
-
attr_reader :
|
4
|
+
attr_reader :loose_tight_dictionary
|
5
5
|
attr_reader :record
|
6
6
|
attr_reader :read
|
7
7
|
|
8
|
-
def initialize(
|
9
|
-
@
|
8
|
+
def initialize(loose_tight_dictionary, record, read = nil)
|
9
|
+
@loose_tight_dictionary = loose_tight_dictionary
|
10
10
|
@record = record
|
11
11
|
@read = read
|
12
12
|
end
|
13
13
|
|
14
14
|
def inspect
|
15
|
-
"#<Wrapper
|
15
|
+
"#<Wrapper render=#{render} variants=#{variants.length}>"
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
19
|
-
@
|
18
|
+
def render
|
19
|
+
return @render if rendered?
|
20
|
+
str = case read
|
20
21
|
when ::Proc
|
21
22
|
read.call record
|
22
23
|
when ::Symbol
|
@@ -29,22 +30,38 @@ class LooseTightDictionary
|
|
29
30
|
record
|
30
31
|
else
|
31
32
|
record[read]
|
32
|
-
end.to_s
|
33
|
+
end.to_s.dup
|
34
|
+
loose_tight_dictionary.stop_words.each do |stop_word|
|
35
|
+
stop_word.apply! str
|
36
|
+
end
|
37
|
+
str.strip!
|
38
|
+
@render = str.freeze
|
39
|
+
@rendered = true
|
40
|
+
@render
|
33
41
|
end
|
34
42
|
|
35
|
-
alias :
|
43
|
+
alias :to_str :render
|
44
|
+
|
45
|
+
WORD_BOUNDARY = %r{\s*\b\s*}
|
46
|
+
def words
|
47
|
+
@words ||= render.split(WORD_BOUNDARY)
|
48
|
+
end
|
36
49
|
|
37
50
|
def similarity(other)
|
38
51
|
Similarity.new self, other
|
39
52
|
end
|
40
53
|
|
41
54
|
def variants
|
42
|
-
@variants ||=
|
43
|
-
if tightener.apply?
|
44
|
-
memo.push tightener.apply(
|
55
|
+
@variants ||= loose_tight_dictionary.tighteners.inject([ render ]) do |memo, tightener|
|
56
|
+
if tightener.apply? render
|
57
|
+
memo.push tightener.apply(render)
|
45
58
|
end
|
46
59
|
memo
|
47
60
|
end.uniq
|
48
61
|
end
|
62
|
+
|
63
|
+
def rendered?
|
64
|
+
@rendered == true
|
65
|
+
end
|
49
66
|
end
|
50
67
|
end
|
@@ -8,6 +8,7 @@ require 'to_regexp'
|
|
8
8
|
# See the README for more information.
|
9
9
|
class LooseTightDictionary
|
10
10
|
autoload :Tightener, 'loose_tight_dictionary/tightener'
|
11
|
+
autoload :StopWord, 'loose_tight_dictionary/stop_word'
|
11
12
|
autoload :Blocking, 'loose_tight_dictionary/blocking'
|
12
13
|
autoload :Identity, 'loose_tight_dictionary/identity'
|
13
14
|
autoload :Result, 'loose_tight_dictionary/result'
|
@@ -16,19 +17,31 @@ class LooseTightDictionary
|
|
16
17
|
autoload :Score, 'loose_tight_dictionary/score'
|
17
18
|
autoload :CachedResult, 'loose_tight_dictionary/cached_result'
|
18
19
|
|
19
|
-
attr_reader :options
|
20
20
|
attr_reader :haystack
|
21
|
-
attr_reader :
|
21
|
+
attr_reader :blockings
|
22
|
+
attr_reader :identities
|
23
|
+
attr_reader :tighteners
|
24
|
+
attr_reader :stop_words
|
25
|
+
attr_reader :first_blocking_decides
|
26
|
+
attr_reader :must_match_blocking
|
27
|
+
attr_reader :must_match_at_least_one_word
|
22
28
|
|
23
29
|
# haystack - a bunch of records
|
24
30
|
# options
|
25
31
|
# * tighteners: regexps (see readme)
|
26
32
|
# * identities: regexps
|
27
33
|
# * blockings: regexps
|
34
|
+
# * stop_words: regexps
|
28
35
|
# * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
|
29
36
|
def initialize(records, options = {})
|
30
|
-
|
31
|
-
@
|
37
|
+
options = options.symbolize_keys
|
38
|
+
@first_blocking_decides = options.fetch :first_blocking_decides, false
|
39
|
+
@must_match_blocking = options.fetch :must_match_blocking, false
|
40
|
+
@must_match_at_least_one_word = options.fetch :must_match_at_least_one_word, false
|
41
|
+
@blockings = options.fetch(:blockings, []).map { |regexp_or_str| Blocking.new regexp_or_str }
|
42
|
+
@identities = options.fetch(:identities, []).map { |regexp_or_str| Identity.new regexp_or_str }
|
43
|
+
@tighteners = options.fetch(:tighteners, []).map { |regexp_or_str| Tightener.new regexp_or_str }
|
44
|
+
@stop_words = options.fetch(:stop_words, []).map { |regexp_or_str| StopWord.new regexp_or_str }
|
32
45
|
read = options[:read] || options[:haystack_reader]
|
33
46
|
@haystack = records.map { |record| Wrapper.new self, record, read }
|
34
47
|
end
|
@@ -37,10 +50,6 @@ class LooseTightDictionary
|
|
37
50
|
@last_result || raise(::RuntimeError, "[loose_tight_dictionary] You can't access the last result until you've run a find with :gather_last_result => true")
|
38
51
|
end
|
39
52
|
|
40
|
-
def log(str = '') #:nodoc:
|
41
|
-
(options[:log] || $stderr).puts str unless options[:log] == false
|
42
|
-
end
|
43
|
-
|
44
53
|
def find_all(needle, options = {})
|
45
54
|
options = options.symbolize_keys.merge(:find_all => true)
|
46
55
|
find needle, options
|
@@ -50,11 +59,13 @@ class LooseTightDictionary
|
|
50
59
|
raise ::RuntimeError, "[loose_tight_dictionary] Dictionary has already been freed, can't perform more finds" if freed?
|
51
60
|
|
52
61
|
options = options.symbolize_keys
|
53
|
-
|
62
|
+
gather_last_result = options.fetch(:gather_last_result, false)
|
63
|
+
is_find_all = options.fetch(:find_all, false)
|
64
|
+
|
65
|
+
if gather_last_result
|
54
66
|
free_last_result
|
55
67
|
@last_result = Result.new
|
56
68
|
end
|
57
|
-
find_all = options.fetch(:find_all, false)
|
58
69
|
|
59
70
|
if gather_last_result
|
60
71
|
last_result.tighteners = tighteners
|
@@ -69,15 +80,27 @@ class LooseTightDictionary
|
|
69
80
|
end
|
70
81
|
|
71
82
|
if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
|
72
|
-
if
|
83
|
+
if is_find_all
|
73
84
|
return []
|
74
85
|
else
|
75
86
|
return nil
|
76
87
|
end
|
77
88
|
end
|
78
89
|
|
90
|
+
candidates = if must_match_at_least_one_word
|
91
|
+
haystack.select do |straw|
|
92
|
+
needle.words.any? { |w| straw.render.include? w }
|
93
|
+
end
|
94
|
+
else
|
95
|
+
haystack
|
96
|
+
end
|
97
|
+
|
98
|
+
if gather_last_result
|
99
|
+
last_result.candidates = candidates
|
100
|
+
end
|
101
|
+
|
79
102
|
joint, disjoint = if blockings.any?
|
80
|
-
|
103
|
+
candidates.partition do |straw|
|
81
104
|
if first_blocking_decides
|
82
105
|
blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
|
83
106
|
else
|
@@ -85,7 +108,7 @@ class LooseTightDictionary
|
|
85
108
|
end
|
86
109
|
end
|
87
110
|
else
|
88
|
-
[
|
111
|
+
[ candidates.dup, [] ]
|
89
112
|
end
|
90
113
|
|
91
114
|
# special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
|
@@ -115,7 +138,7 @@ class LooseTightDictionary
|
|
115
138
|
last_result.certainly_different = certainly_different
|
116
139
|
end
|
117
140
|
|
118
|
-
if
|
141
|
+
if is_find_all
|
119
142
|
return possibly_identical.map { |straw| straw.record }
|
120
143
|
end
|
121
144
|
|
@@ -125,12 +148,11 @@ class LooseTightDictionary
|
|
125
148
|
last_result.similarities = similarities
|
126
149
|
end
|
127
150
|
|
128
|
-
|
129
|
-
if best_similarity = similarities[-1] and best_similarity.best_score.to_f > 0
|
151
|
+
if best_similarity = similarities[-1] and best_similarity.best_score.dices_coefficient > 0
|
130
152
|
record = best_similarity.wrapper2.record
|
131
153
|
if gather_last_result
|
132
154
|
last_result.record = record
|
133
|
-
last_result.score = best_similarity.best_score.
|
155
|
+
last_result.score = best_similarity.best_score.dices_coefficient
|
134
156
|
end
|
135
157
|
record
|
136
158
|
end
|
@@ -148,11 +170,11 @@ class LooseTightDictionary
|
|
148
170
|
log
|
149
171
|
log "Needle"
|
150
172
|
log "-" * 150
|
151
|
-
log last_result.needle.
|
173
|
+
log last_result.needle.render
|
152
174
|
log
|
153
175
|
log "Haystack"
|
154
176
|
log "-" * 150
|
155
|
-
log last_result.haystack.map { |record| record.
|
177
|
+
log last_result.haystack.map { |record| record.render }.join("\n")
|
156
178
|
log
|
157
179
|
log "Tighteners"
|
158
180
|
log "-" * 150
|
@@ -168,19 +190,19 @@ class LooseTightDictionary
|
|
168
190
|
log
|
169
191
|
log "Joint"
|
170
192
|
log "-" * 150
|
171
|
-
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.
|
193
|
+
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.render }.join("\n")
|
172
194
|
log
|
173
195
|
log "Disjoint"
|
174
196
|
log "-" * 150
|
175
|
-
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.
|
197
|
+
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.render }.join("\n")
|
176
198
|
log
|
177
199
|
log "Possibly identical"
|
178
200
|
log "-" * 150
|
179
|
-
log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.
|
201
|
+
log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.render }.join("\n")
|
180
202
|
log
|
181
203
|
log "Certainly different"
|
182
204
|
log "-" * 150
|
183
|
-
log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.
|
205
|
+
log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.render }.join("\n")
|
184
206
|
log
|
185
207
|
log "Similarities"
|
186
208
|
log "-" * 150
|
@@ -190,33 +212,11 @@ class LooseTightDictionary
|
|
190
212
|
log "-" * 150
|
191
213
|
log record.inspect
|
192
214
|
end
|
193
|
-
|
194
|
-
def must_match_blocking
|
195
|
-
options.fetch :must_match_blocking, false
|
196
|
-
end
|
197
215
|
|
198
|
-
def
|
199
|
-
|
200
|
-
end
|
201
|
-
|
202
|
-
def tighteners
|
203
|
-
@tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
|
204
|
-
Tightener.new regexp_or_str
|
205
|
-
end
|
206
|
-
end
|
207
|
-
|
208
|
-
def identities
|
209
|
-
@identities ||= (options[:identities] || []).map do |regexp_or_str|
|
210
|
-
Identity.new regexp_or_str
|
211
|
-
end
|
212
|
-
end
|
213
|
-
|
214
|
-
def blockings
|
215
|
-
@blockings ||= (options[:blockings] || []).map do |regexp_or_str|
|
216
|
-
Blocking.new regexp_or_str
|
217
|
-
end
|
216
|
+
def log(str = '') #:nodoc:
|
217
|
+
$stderr.puts str
|
218
218
|
end
|
219
|
-
|
219
|
+
|
220
220
|
def freed?
|
221
221
|
@freed == true
|
222
222
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
1
2
|
require 'helper'
|
2
3
|
|
3
4
|
class TestLooseTightDictionary < Test::Unit::TestCase
|
@@ -11,8 +12,9 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
11
12
|
# end
|
12
13
|
|
13
14
|
def test_001_find
|
14
|
-
d = LooseTightDictionary.new %w{
|
15
|
-
assert_equal '
|
15
|
+
d = LooseTightDictionary.new %w{ RATZ CATZ }
|
16
|
+
assert_equal 'RATZ', d.find('RITZ')
|
17
|
+
assert_equal 'RATZ', d.find('RíTZ')
|
16
18
|
|
17
19
|
d = LooseTightDictionary.new [ 'X' ]
|
18
20
|
assert_equal 'X', d.find('X')
|
@@ -46,7 +48,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
46
48
|
d = LooseTightDictionary.new ['BOEING 737-100/200', 'BOEING 737-900'], :tighteners => tighteners
|
47
49
|
assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
|
48
50
|
end
|
49
|
-
|
51
|
+
|
50
52
|
def test_008_false_positive_without_identity
|
51
53
|
d = LooseTightDictionary.new %w{ foo bar }
|
52
54
|
assert_equal 'bar', d.find('baz')
|
@@ -63,7 +65,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
63
65
|
assert_equal 'X', d.find('X')
|
64
66
|
assert_equal nil, d.find('A')
|
65
67
|
end
|
66
|
-
|
68
|
+
|
67
69
|
# TODO this is not very helpful
|
68
70
|
def test_0095_must_match_blocking
|
69
71
|
d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
@@ -98,7 +100,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
98
100
|
|
99
101
|
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
100
102
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
101
|
-
|
103
|
+
|
102
104
|
# or equivalently with an identity
|
103
105
|
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
104
106
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
@@ -153,4 +155,17 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
153
155
|
def test_018_no_result_if_best_score_is_zero
|
154
156
|
assert_equal nil, LooseTightDictionary.new(['a']).find('b')
|
155
157
|
end
|
158
|
+
|
159
|
+
def test_019_must_match_at_least_one_word
|
160
|
+
d = LooseTightDictionary.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
161
|
+
assert_equal nil, d.find('RITZ')
|
162
|
+
end
|
163
|
+
|
164
|
+
def test_020_stop_words
|
165
|
+
d = LooseTightDictionary.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
|
166
|
+
assert_equal 'B HTL', d.find('A HTL')
|
167
|
+
|
168
|
+
d = LooseTightDictionary.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
|
169
|
+
assert_equal 'A HOTEL', d.find('A HTL')
|
170
|
+
end
|
156
171
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: loose_tight_dictionary
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-12-
|
12
|
+
date: 2011-12-06 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &2185313400 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2185313400
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: remote_table
|
27
|
-
requirement: &
|
27
|
+
requirement: &2185284260 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2185284260
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activerecord
|
38
|
-
requirement: &
|
38
|
+
requirement: &2185283700 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2185283700
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: mysql
|
49
|
-
requirement: &
|
49
|
+
requirement: &2185283260 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2185283260
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cohort_scope
|
60
|
-
requirement: &
|
60
|
+
requirement: &2185282760 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2185282760
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: weighted_average
|
71
|
-
requirement: &
|
71
|
+
requirement: &2185282340 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2185282340
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rake
|
82
|
-
requirement: &
|
82
|
+
requirement: &2185281880 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2185281880
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: activesupport
|
93
|
-
requirement: &
|
93
|
+
requirement: &2185281260 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '3'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2185281260
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: to_regexp
|
104
|
-
requirement: &
|
104
|
+
requirement: &2185280640 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: 0.0.3
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2185280640
|
113
113
|
description: Create dictionaries that link rows between two tables using loose matching
|
114
114
|
(string similarity) by default and tight matching (regexp) by request.
|
115
115
|
email:
|
@@ -150,6 +150,7 @@ files:
|
|
150
150
|
- lib/loose_tight_dictionary/result.rb
|
151
151
|
- lib/loose_tight_dictionary/score.rb
|
152
152
|
- lib/loose_tight_dictionary/similarity.rb
|
153
|
+
- lib/loose_tight_dictionary/stop_word.rb
|
153
154
|
- lib/loose_tight_dictionary/tightener.rb
|
154
155
|
- lib/loose_tight_dictionary/version.rb
|
155
156
|
- lib/loose_tight_dictionary/wrapper.rb
|