loose_tight_dictionary 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +24 -6
- data/examples/bts_aircraft/test_bts_aircraft.rb +0 -5
- data/examples/first_name_matching.rb +1 -1
- data/lib/loose_tight_dictionary/result.rb +1 -0
- data/lib/loose_tight_dictionary/score.rb +66 -20
- data/lib/loose_tight_dictionary/similarity.rb +7 -6
- data/lib/loose_tight_dictionary/stop_word.rb +19 -0
- data/lib/loose_tight_dictionary/version.rb +1 -1
- data/lib/loose_tight_dictionary/wrapper.rb +28 -11
- data/lib/loose_tight_dictionary.rb +48 -48
- data/test/test_loose_tight_dictionary.rb +20 -5
- metadata +21 -20
data/README.rdoc
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
= loose_tight_dictionary
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Find a needle in a haystack based on string similarity (using the Pair Distance algorithm and Levenshtein distance) and regular expressions.
|
|
4
4
|
|
|
5
5
|
== Quickstart
|
|
6
6
|
|
|
@@ -11,7 +11,20 @@ Match things based on string similarity (using the Pair Distance algorithm) and
|
|
|
11
11
|
|
|
12
12
|
== String similarity matching
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
Uses {Dice's Coefficient}[http://en.wikipedia.org/wiki/Dice's_coefficient] algorithm (aka Pair Distance).
|
|
15
|
+
|
|
16
|
+
If that judges two strings to be equally similar to a third string, then Levenshtein distance is used. For example, pair distance considers "RATZ" and "CATZ" to be equally similar to "RITZ", so we invoke Levenshtein.
|
|
17
|
+
|
|
18
|
+
>> require 'amatch'
|
|
19
|
+
=> true
|
|
20
|
+
>> 'RITZ'.pair_distance_similar 'RATZ'
|
|
21
|
+
=> 0.3333333333333333
|
|
22
|
+
>> 'RITZ'.pair_distance_similar 'CATZ' # <-- pair distance can't tell the difference, so we fall back to levenshtein...
|
|
23
|
+
=> 0.3333333333333333
|
|
24
|
+
>> 'RITZ'.levenshtein_similar 'RATZ'
|
|
25
|
+
=> 0.75
|
|
26
|
+
>> 'RITZ'.levenshtein_similar 'CATZ' # <-- which properly shows that RATZ should win
|
|
27
|
+
=> 0.5
|
|
15
28
|
|
|
16
29
|
== Production use
|
|
17
30
|
|
|
@@ -36,6 +49,7 @@ You can improve the default matchings with regular expressions.
|
|
|
36
49
|
* Emphasize important words using <b>blockings</b> and <b>tighteners</b>
|
|
37
50
|
* Filter out stop words with <b>tighteners</b>
|
|
38
51
|
* Prevent impossible matches with <b>blockings</b> and <b>identities</b>
|
|
52
|
+
* Ignore words with <b>stop words</b>
|
|
39
53
|
|
|
40
54
|
=== Blockings
|
|
41
55
|
|
|
@@ -49,19 +63,23 @@ Adding a tightener like <tt>/(boeing).*(7\d\d)/i</tt> will cause "BOEING COMPANY
|
|
|
49
63
|
|
|
50
64
|
Adding an identity like <tt>/(F)\-?(\d50)/</tt> ensures that "Ford F-150" and "Ford F-250" never match.
|
|
51
65
|
|
|
66
|
+
=== Stop words
|
|
67
|
+
|
|
68
|
+
Adding a stop word like <tt>THE</tt> ensures that it is not taken into account when comparing "THE CAT", "THE DAT", and "THE CATT".
|
|
69
|
+
|
|
52
70
|
== Case sensitivity
|
|
53
71
|
|
|
54
|
-
Scoring is case-insensitive. Everything is downcased before scoring. This is a change from previous versions.
|
|
72
|
+
Scoring is case-insensitive. Everything is downcased before scoring. This is a change from previous versions. Your regexps may still be case-sensitive, though.
|
|
55
73
|
|
|
56
74
|
== Examples
|
|
57
75
|
|
|
58
76
|
Check out the tests.
|
|
59
77
|
|
|
60
|
-
== Speed
|
|
78
|
+
== Speed (and who to thank for the algorithms)
|
|
61
79
|
|
|
62
|
-
If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks Flori!
|
|
80
|
+
If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks {Flori}[https://github.com/flori]!
|
|
63
81
|
|
|
64
|
-
Otherwise,
|
|
82
|
+
Otherwise, pure ruby versions of the string similarity algorithms derived from the {answer to a StackOverflow question}[http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings] and {the text gem}[https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb] are used. Thanks {marzagao}[http://stackoverflow.com/users/10997/marzagao] and {threedaymonk}[https://github.com/threedaymonk]!
|
|
65
83
|
|
|
66
84
|
== Authors
|
|
67
85
|
|
|
@@ -71,11 +71,6 @@ FINAL_OPTIONS = {
|
|
|
71
71
|
}
|
|
72
72
|
|
|
73
73
|
class TestBtsAircraft < Test::Unit::TestCase
|
|
74
|
-
should "store the records somewhere" do
|
|
75
|
-
d = LooseTightDictionary.new HAYSTACK
|
|
76
|
-
assert d.records.grep(/BOEING 707-100/)
|
|
77
|
-
end
|
|
78
|
-
|
|
79
74
|
should "understand records by using the haystack reader" do
|
|
80
75
|
d = LooseTightDictionary.new HAYSTACK, FINAL_OPTIONS
|
|
81
76
|
assert d.haystack.map { |record| record.to_str }.include?('boeing boeing 707-100')
|
|
@@ -8,7 +8,7 @@ require 'loose_tight_dictionary'
|
|
|
8
8
|
haystack = [ 'seamus', 'andy', 'ben' ]
|
|
9
9
|
needles = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT', 'Shamus Heaney' ]
|
|
10
10
|
|
|
11
|
-
d = LooseTightDictionary.new haystack
|
|
11
|
+
d = LooseTightDictionary.new haystack
|
|
12
12
|
needles.each do |needle|
|
|
13
13
|
d.explain needle
|
|
14
14
|
puts
|
|
@@ -9,40 +9,44 @@ class LooseTightDictionary
|
|
|
9
9
|
attr_reader :str1, :str2
|
|
10
10
|
|
|
11
11
|
def initialize(str1, str2)
|
|
12
|
-
@str1 = str1
|
|
13
|
-
@str2 = str2
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
def to_f
|
|
17
|
-
@to_f ||= dices_coefficient(str1, str2)
|
|
12
|
+
@str1 = str1.downcase
|
|
13
|
+
@str2 = str2.downcase
|
|
18
14
|
end
|
|
19
15
|
|
|
20
16
|
def inspect
|
|
21
|
-
%{#<Score:
|
|
17
|
+
%{#<Score: dices_coefficient=#{dices_coefficient} levenshtein=#{levenshtein}>}
|
|
22
18
|
end
|
|
23
19
|
|
|
24
20
|
def <=>(other)
|
|
25
|
-
|
|
21
|
+
by_dices_coefficient = (dices_coefficient <=> other.dices_coefficient)
|
|
22
|
+
if by_dices_coefficient == 0
|
|
23
|
+
levenshtein <=> other.levenshtein
|
|
24
|
+
else
|
|
25
|
+
by_dices_coefficient
|
|
26
|
+
end
|
|
26
27
|
end
|
|
27
|
-
|
|
28
|
-
def
|
|
29
|
-
|
|
28
|
+
|
|
29
|
+
def utf8?
|
|
30
|
+
return @utf8_query[0] if @utf8_query.is_a?(::Array)
|
|
31
|
+
@utf8_query = [ (defined?(::Encoding) ? str1.encoding.to_s : $KCODE).downcase.start_with?('u') ]
|
|
32
|
+
@utf8_query[0]
|
|
30
33
|
end
|
|
31
34
|
|
|
32
|
-
private
|
|
33
|
-
|
|
34
|
-
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
|
35
35
|
if defined?(::Amatch)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
str2 = str2.downcase
|
|
36
|
+
|
|
37
|
+
def dices_coefficient
|
|
39
38
|
str1.pair_distance_similar str2
|
|
40
39
|
end
|
|
40
|
+
|
|
41
|
+
def levenshtein
|
|
42
|
+
str1.levenshtein_similar str2
|
|
43
|
+
end
|
|
44
|
+
|
|
41
45
|
else
|
|
46
|
+
|
|
42
47
|
SPACE = ' '
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
str2 = str2.downcase
|
|
48
|
+
# http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
|
|
49
|
+
def dices_coefficient
|
|
46
50
|
if str1 == str2
|
|
47
51
|
return 1.0
|
|
48
52
|
elsif str1.length == 1 and str2.length == 1
|
|
@@ -71,6 +75,48 @@ class LooseTightDictionary
|
|
|
71
75
|
end
|
|
72
76
|
(2.0 * intersection) / union
|
|
73
77
|
end
|
|
78
|
+
|
|
79
|
+
# extracted/adapted from the text gem version 1.0.2
|
|
80
|
+
# normalization added for utf-8 strings
|
|
81
|
+
# lib/text/levenshtein.rb
|
|
82
|
+
def levenshtein
|
|
83
|
+
if utf8?
|
|
84
|
+
unpack_rule = 'U*'
|
|
85
|
+
else
|
|
86
|
+
unpack_rule = 'C*'
|
|
87
|
+
end
|
|
88
|
+
s = str1.unpack(unpack_rule)
|
|
89
|
+
t = str2.unpack(unpack_rule)
|
|
90
|
+
n = s.length
|
|
91
|
+
m = t.length
|
|
92
|
+
if n == 0 or m == 0
|
|
93
|
+
return 0.0
|
|
94
|
+
end
|
|
95
|
+
d = (0..m).to_a
|
|
96
|
+
x = nil
|
|
97
|
+
(0...n).each do |i|
|
|
98
|
+
e = i+1
|
|
99
|
+
(0...m).each do |j|
|
|
100
|
+
cost = (s[i] == t[j]) ? 0 : 1
|
|
101
|
+
x = [
|
|
102
|
+
d[j+1] + 1, # insertion
|
|
103
|
+
e + 1, # deletion
|
|
104
|
+
d[j] + cost # substitution
|
|
105
|
+
].min
|
|
106
|
+
d[j] = e
|
|
107
|
+
e = x
|
|
108
|
+
end
|
|
109
|
+
d[m] = x
|
|
110
|
+
end
|
|
111
|
+
# normalization logic from https://github.com/flori/amatch/blob/master/ext/amatch_ext.c#L301
|
|
112
|
+
# if (b_len > a_len) {
|
|
113
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
|
114
|
+
# } else {
|
|
115
|
+
# result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
|
116
|
+
# }
|
|
117
|
+
1.0 - x.to_f / [n, m].max
|
|
118
|
+
end
|
|
119
|
+
|
|
74
120
|
end
|
|
75
121
|
end
|
|
76
122
|
end
|
|
@@ -9,16 +9,17 @@ class LooseTightDictionary
|
|
|
9
9
|
end
|
|
10
10
|
|
|
11
11
|
def <=>(other)
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
by_score = best_score <=> other.best_score
|
|
13
|
+
if by_score == 0
|
|
14
|
+
original_weight <=> other.original_weight
|
|
14
15
|
else
|
|
15
|
-
|
|
16
|
+
by_score
|
|
16
17
|
end
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
# Weight things towards short original strings
|
|
20
|
-
def
|
|
21
|
-
@
|
|
21
|
+
def original_weight
|
|
22
|
+
@original_weight ||= (1.0 / (wrapper1.render.length * wrapper2.render.length))
|
|
22
23
|
end
|
|
23
24
|
|
|
24
25
|
def best_score
|
|
@@ -46,7 +47,7 @@ class LooseTightDictionary
|
|
|
46
47
|
end
|
|
47
48
|
|
|
48
49
|
def inspect
|
|
49
|
-
%{#<Similarity "#{wrapper2.
|
|
50
|
+
%{#<Similarity "#{wrapper2.render}"=>"#{best_wrapper2_variant}" versus "#{wrapper1.render}"=>"#{best_wrapper1_variant}" original_weight=#{"%0.5f" % original_weight} best_score=#{best_score.inspect}>}
|
|
50
51
|
end
|
|
51
52
|
end
|
|
52
53
|
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
class LooseTightDictionary
|
|
2
|
+
# A stop word is ignored
|
|
3
|
+
class StopWord
|
|
4
|
+
attr_reader :regexp
|
|
5
|
+
|
|
6
|
+
def initialize(regexp_or_str)
|
|
7
|
+
@regexp = regexp_or_str.to_regexp
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
# Destructively remove stop words from the string
|
|
11
|
+
def apply!(str)
|
|
12
|
+
str.gsub! regexp, ''
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def inspect
|
|
16
|
+
"#<StopWord regexp=#{regexp.inspect}>"
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -1,22 +1,23 @@
|
|
|
1
1
|
class LooseTightDictionary
|
|
2
2
|
# Wrappers are the tokens that are passed around when doing scoring and optimizing.
|
|
3
3
|
class Wrapper #:nodoc: all
|
|
4
|
-
attr_reader :
|
|
4
|
+
attr_reader :loose_tight_dictionary
|
|
5
5
|
attr_reader :record
|
|
6
6
|
attr_reader :read
|
|
7
7
|
|
|
8
|
-
def initialize(
|
|
9
|
-
@
|
|
8
|
+
def initialize(loose_tight_dictionary, record, read = nil)
|
|
9
|
+
@loose_tight_dictionary = loose_tight_dictionary
|
|
10
10
|
@record = record
|
|
11
11
|
@read = read
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
def inspect
|
|
15
|
-
"#<Wrapper
|
|
15
|
+
"#<Wrapper render=#{render} variants=#{variants.length}>"
|
|
16
16
|
end
|
|
17
17
|
|
|
18
|
-
def
|
|
19
|
-
@
|
|
18
|
+
def render
|
|
19
|
+
return @render if rendered?
|
|
20
|
+
str = case read
|
|
20
21
|
when ::Proc
|
|
21
22
|
read.call record
|
|
22
23
|
when ::Symbol
|
|
@@ -29,22 +30,38 @@ class LooseTightDictionary
|
|
|
29
30
|
record
|
|
30
31
|
else
|
|
31
32
|
record[read]
|
|
32
|
-
end.to_s
|
|
33
|
+
end.to_s.dup
|
|
34
|
+
loose_tight_dictionary.stop_words.each do |stop_word|
|
|
35
|
+
stop_word.apply! str
|
|
36
|
+
end
|
|
37
|
+
str.strip!
|
|
38
|
+
@render = str.freeze
|
|
39
|
+
@rendered = true
|
|
40
|
+
@render
|
|
33
41
|
end
|
|
34
42
|
|
|
35
|
-
alias :
|
|
43
|
+
alias :to_str :render
|
|
44
|
+
|
|
45
|
+
WORD_BOUNDARY = %r{\s*\b\s*}
|
|
46
|
+
def words
|
|
47
|
+
@words ||= render.split(WORD_BOUNDARY)
|
|
48
|
+
end
|
|
36
49
|
|
|
37
50
|
def similarity(other)
|
|
38
51
|
Similarity.new self, other
|
|
39
52
|
end
|
|
40
53
|
|
|
41
54
|
def variants
|
|
42
|
-
@variants ||=
|
|
43
|
-
if tightener.apply?
|
|
44
|
-
memo.push tightener.apply(
|
|
55
|
+
@variants ||= loose_tight_dictionary.tighteners.inject([ render ]) do |memo, tightener|
|
|
56
|
+
if tightener.apply? render
|
|
57
|
+
memo.push tightener.apply(render)
|
|
45
58
|
end
|
|
46
59
|
memo
|
|
47
60
|
end.uniq
|
|
48
61
|
end
|
|
62
|
+
|
|
63
|
+
def rendered?
|
|
64
|
+
@rendered == true
|
|
65
|
+
end
|
|
49
66
|
end
|
|
50
67
|
end
|
|
@@ -8,6 +8,7 @@ require 'to_regexp'
|
|
|
8
8
|
# See the README for more information.
|
|
9
9
|
class LooseTightDictionary
|
|
10
10
|
autoload :Tightener, 'loose_tight_dictionary/tightener'
|
|
11
|
+
autoload :StopWord, 'loose_tight_dictionary/stop_word'
|
|
11
12
|
autoload :Blocking, 'loose_tight_dictionary/blocking'
|
|
12
13
|
autoload :Identity, 'loose_tight_dictionary/identity'
|
|
13
14
|
autoload :Result, 'loose_tight_dictionary/result'
|
|
@@ -16,19 +17,31 @@ class LooseTightDictionary
|
|
|
16
17
|
autoload :Score, 'loose_tight_dictionary/score'
|
|
17
18
|
autoload :CachedResult, 'loose_tight_dictionary/cached_result'
|
|
18
19
|
|
|
19
|
-
attr_reader :options
|
|
20
20
|
attr_reader :haystack
|
|
21
|
-
attr_reader :
|
|
21
|
+
attr_reader :blockings
|
|
22
|
+
attr_reader :identities
|
|
23
|
+
attr_reader :tighteners
|
|
24
|
+
attr_reader :stop_words
|
|
25
|
+
attr_reader :first_blocking_decides
|
|
26
|
+
attr_reader :must_match_blocking
|
|
27
|
+
attr_reader :must_match_at_least_one_word
|
|
22
28
|
|
|
23
29
|
# haystack - a bunch of records
|
|
24
30
|
# options
|
|
25
31
|
# * tighteners: regexps (see readme)
|
|
26
32
|
# * identities: regexps
|
|
27
33
|
# * blockings: regexps
|
|
34
|
+
# * stop_words: regexps
|
|
28
35
|
# * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
|
|
29
36
|
def initialize(records, options = {})
|
|
30
|
-
|
|
31
|
-
@
|
|
37
|
+
options = options.symbolize_keys
|
|
38
|
+
@first_blocking_decides = options.fetch :first_blocking_decides, false
|
|
39
|
+
@must_match_blocking = options.fetch :must_match_blocking, false
|
|
40
|
+
@must_match_at_least_one_word = options.fetch :must_match_at_least_one_word, false
|
|
41
|
+
@blockings = options.fetch(:blockings, []).map { |regexp_or_str| Blocking.new regexp_or_str }
|
|
42
|
+
@identities = options.fetch(:identities, []).map { |regexp_or_str| Identity.new regexp_or_str }
|
|
43
|
+
@tighteners = options.fetch(:tighteners, []).map { |regexp_or_str| Tightener.new regexp_or_str }
|
|
44
|
+
@stop_words = options.fetch(:stop_words, []).map { |regexp_or_str| StopWord.new regexp_or_str }
|
|
32
45
|
read = options[:read] || options[:haystack_reader]
|
|
33
46
|
@haystack = records.map { |record| Wrapper.new self, record, read }
|
|
34
47
|
end
|
|
@@ -37,10 +50,6 @@ class LooseTightDictionary
|
|
|
37
50
|
@last_result || raise(::RuntimeError, "[loose_tight_dictionary] You can't access the last result until you've run a find with :gather_last_result => true")
|
|
38
51
|
end
|
|
39
52
|
|
|
40
|
-
def log(str = '') #:nodoc:
|
|
41
|
-
(options[:log] || $stderr).puts str unless options[:log] == false
|
|
42
|
-
end
|
|
43
|
-
|
|
44
53
|
def find_all(needle, options = {})
|
|
45
54
|
options = options.symbolize_keys.merge(:find_all => true)
|
|
46
55
|
find needle, options
|
|
@@ -50,11 +59,13 @@ class LooseTightDictionary
|
|
|
50
59
|
raise ::RuntimeError, "[loose_tight_dictionary] Dictionary has already been freed, can't perform more finds" if freed?
|
|
51
60
|
|
|
52
61
|
options = options.symbolize_keys
|
|
53
|
-
|
|
62
|
+
gather_last_result = options.fetch(:gather_last_result, false)
|
|
63
|
+
is_find_all = options.fetch(:find_all, false)
|
|
64
|
+
|
|
65
|
+
if gather_last_result
|
|
54
66
|
free_last_result
|
|
55
67
|
@last_result = Result.new
|
|
56
68
|
end
|
|
57
|
-
find_all = options.fetch(:find_all, false)
|
|
58
69
|
|
|
59
70
|
if gather_last_result
|
|
60
71
|
last_result.tighteners = tighteners
|
|
@@ -69,15 +80,27 @@ class LooseTightDictionary
|
|
|
69
80
|
end
|
|
70
81
|
|
|
71
82
|
if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
|
|
72
|
-
if
|
|
83
|
+
if is_find_all
|
|
73
84
|
return []
|
|
74
85
|
else
|
|
75
86
|
return nil
|
|
76
87
|
end
|
|
77
88
|
end
|
|
78
89
|
|
|
90
|
+
candidates = if must_match_at_least_one_word
|
|
91
|
+
haystack.select do |straw|
|
|
92
|
+
needle.words.any? { |w| straw.render.include? w }
|
|
93
|
+
end
|
|
94
|
+
else
|
|
95
|
+
haystack
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
if gather_last_result
|
|
99
|
+
last_result.candidates = candidates
|
|
100
|
+
end
|
|
101
|
+
|
|
79
102
|
joint, disjoint = if blockings.any?
|
|
80
|
-
|
|
103
|
+
candidates.partition do |straw|
|
|
81
104
|
if first_blocking_decides
|
|
82
105
|
blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw
|
|
83
106
|
else
|
|
@@ -85,7 +108,7 @@ class LooseTightDictionary
|
|
|
85
108
|
end
|
|
86
109
|
end
|
|
87
110
|
else
|
|
88
|
-
[
|
|
111
|
+
[ candidates.dup, [] ]
|
|
89
112
|
end
|
|
90
113
|
|
|
91
114
|
# special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
|
|
@@ -115,7 +138,7 @@ class LooseTightDictionary
|
|
|
115
138
|
last_result.certainly_different = certainly_different
|
|
116
139
|
end
|
|
117
140
|
|
|
118
|
-
if
|
|
141
|
+
if is_find_all
|
|
119
142
|
return possibly_identical.map { |straw| straw.record }
|
|
120
143
|
end
|
|
121
144
|
|
|
@@ -125,12 +148,11 @@ class LooseTightDictionary
|
|
|
125
148
|
last_result.similarities = similarities
|
|
126
149
|
end
|
|
127
150
|
|
|
128
|
-
|
|
129
|
-
if best_similarity = similarities[-1] and best_similarity.best_score.to_f > 0
|
|
151
|
+
if best_similarity = similarities[-1] and best_similarity.best_score.dices_coefficient > 0
|
|
130
152
|
record = best_similarity.wrapper2.record
|
|
131
153
|
if gather_last_result
|
|
132
154
|
last_result.record = record
|
|
133
|
-
last_result.score = best_similarity.best_score.
|
|
155
|
+
last_result.score = best_similarity.best_score.dices_coefficient
|
|
134
156
|
end
|
|
135
157
|
record
|
|
136
158
|
end
|
|
@@ -148,11 +170,11 @@ class LooseTightDictionary
|
|
|
148
170
|
log
|
|
149
171
|
log "Needle"
|
|
150
172
|
log "-" * 150
|
|
151
|
-
log last_result.needle.
|
|
173
|
+
log last_result.needle.render
|
|
152
174
|
log
|
|
153
175
|
log "Haystack"
|
|
154
176
|
log "-" * 150
|
|
155
|
-
log last_result.haystack.map { |record| record.
|
|
177
|
+
log last_result.haystack.map { |record| record.render }.join("\n")
|
|
156
178
|
log
|
|
157
179
|
log "Tighteners"
|
|
158
180
|
log "-" * 150
|
|
@@ -168,19 +190,19 @@ class LooseTightDictionary
|
|
|
168
190
|
log
|
|
169
191
|
log "Joint"
|
|
170
192
|
log "-" * 150
|
|
171
|
-
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.
|
|
193
|
+
log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.render }.join("\n")
|
|
172
194
|
log
|
|
173
195
|
log "Disjoint"
|
|
174
196
|
log "-" * 150
|
|
175
|
-
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.
|
|
197
|
+
log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.render }.join("\n")
|
|
176
198
|
log
|
|
177
199
|
log "Possibly identical"
|
|
178
200
|
log "-" * 150
|
|
179
|
-
log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.
|
|
201
|
+
log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.render }.join("\n")
|
|
180
202
|
log
|
|
181
203
|
log "Certainly different"
|
|
182
204
|
log "-" * 150
|
|
183
|
-
log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.
|
|
205
|
+
log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.render }.join("\n")
|
|
184
206
|
log
|
|
185
207
|
log "Similarities"
|
|
186
208
|
log "-" * 150
|
|
@@ -190,33 +212,11 @@ class LooseTightDictionary
|
|
|
190
212
|
log "-" * 150
|
|
191
213
|
log record.inspect
|
|
192
214
|
end
|
|
193
|
-
|
|
194
|
-
def must_match_blocking
|
|
195
|
-
options.fetch :must_match_blocking, false
|
|
196
|
-
end
|
|
197
215
|
|
|
198
|
-
def
|
|
199
|
-
|
|
200
|
-
end
|
|
201
|
-
|
|
202
|
-
def tighteners
|
|
203
|
-
@tighteners ||= (options[:tighteners] || []).map do |regexp_or_str|
|
|
204
|
-
Tightener.new regexp_or_str
|
|
205
|
-
end
|
|
206
|
-
end
|
|
207
|
-
|
|
208
|
-
def identities
|
|
209
|
-
@identities ||= (options[:identities] || []).map do |regexp_or_str|
|
|
210
|
-
Identity.new regexp_or_str
|
|
211
|
-
end
|
|
212
|
-
end
|
|
213
|
-
|
|
214
|
-
def blockings
|
|
215
|
-
@blockings ||= (options[:blockings] || []).map do |regexp_or_str|
|
|
216
|
-
Blocking.new regexp_or_str
|
|
217
|
-
end
|
|
216
|
+
def log(str = '') #:nodoc:
|
|
217
|
+
$stderr.puts str
|
|
218
218
|
end
|
|
219
|
-
|
|
219
|
+
|
|
220
220
|
def freed?
|
|
221
221
|
@freed == true
|
|
222
222
|
end
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
1
2
|
require 'helper'
|
|
2
3
|
|
|
3
4
|
class TestLooseTightDictionary < Test::Unit::TestCase
|
|
@@ -11,8 +12,9 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
|
11
12
|
# end
|
|
12
13
|
|
|
13
14
|
def test_001_find
|
|
14
|
-
d = LooseTightDictionary.new %w{
|
|
15
|
-
assert_equal '
|
|
15
|
+
d = LooseTightDictionary.new %w{ RATZ CATZ }
|
|
16
|
+
assert_equal 'RATZ', d.find('RITZ')
|
|
17
|
+
assert_equal 'RATZ', d.find('RíTZ')
|
|
16
18
|
|
|
17
19
|
d = LooseTightDictionary.new [ 'X' ]
|
|
18
20
|
assert_equal 'X', d.find('X')
|
|
@@ -46,7 +48,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
|
46
48
|
d = LooseTightDictionary.new ['BOEING 737-100/200', 'BOEING 737-900'], :tighteners => tighteners
|
|
47
49
|
assert_equal 'BOEING 737-100/200', d.find('BOEING 737100 number 900')
|
|
48
50
|
end
|
|
49
|
-
|
|
51
|
+
|
|
50
52
|
def test_008_false_positive_without_identity
|
|
51
53
|
d = LooseTightDictionary.new %w{ foo bar }
|
|
52
54
|
assert_equal 'bar', d.find('baz')
|
|
@@ -63,7 +65,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
|
63
65
|
assert_equal 'X', d.find('X')
|
|
64
66
|
assert_equal nil, d.find('A')
|
|
65
67
|
end
|
|
66
|
-
|
|
68
|
+
|
|
67
69
|
# TODO this is not very helpful
|
|
68
70
|
def test_0095_must_match_blocking
|
|
69
71
|
d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
|
@@ -98,7 +100,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
|
98
100
|
|
|
99
101
|
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing (7|E)/i, /boeing/i ], :first_blocking_decides => true
|
|
100
102
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
|
101
|
-
|
|
103
|
+
|
|
102
104
|
# or equivalently with an identity
|
|
103
105
|
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
|
104
106
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
|
@@ -153,4 +155,17 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
|
153
155
|
def test_018_no_result_if_best_score_is_zero
|
|
154
156
|
assert_equal nil, LooseTightDictionary.new(['a']).find('b')
|
|
155
157
|
end
|
|
158
|
+
|
|
159
|
+
def test_019_must_match_at_least_one_word
|
|
160
|
+
d = LooseTightDictionary.new %w{ RATZ CATZ }, :must_match_at_least_one_word => true
|
|
161
|
+
assert_equal nil, d.find('RITZ')
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
def test_020_stop_words
|
|
165
|
+
d = LooseTightDictionary.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true
|
|
166
|
+
assert_equal 'B HTL', d.find('A HTL')
|
|
167
|
+
|
|
168
|
+
d = LooseTightDictionary.new [ 'A HOTEL', 'B HTL' ], :must_match_at_least_one_word => true, :stop_words => [ %r{HO?TE?L} ]
|
|
169
|
+
assert_equal 'A HOTEL', d.find('A HTL')
|
|
170
|
+
end
|
|
156
171
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: loose_tight_dictionary
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.
|
|
4
|
+
version: 1.0.3
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,11 +9,11 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2011-12-
|
|
12
|
+
date: 2011-12-06 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: shoulda
|
|
16
|
-
requirement: &
|
|
16
|
+
requirement: &2185313400 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ! '>='
|
|
@@ -21,10 +21,10 @@ dependencies:
|
|
|
21
21
|
version: '0'
|
|
22
22
|
type: :development
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *2185313400
|
|
25
25
|
- !ruby/object:Gem::Dependency
|
|
26
26
|
name: remote_table
|
|
27
|
-
requirement: &
|
|
27
|
+
requirement: &2185284260 !ruby/object:Gem::Requirement
|
|
28
28
|
none: false
|
|
29
29
|
requirements:
|
|
30
30
|
- - ! '>='
|
|
@@ -32,10 +32,10 @@ dependencies:
|
|
|
32
32
|
version: '0'
|
|
33
33
|
type: :development
|
|
34
34
|
prerelease: false
|
|
35
|
-
version_requirements: *
|
|
35
|
+
version_requirements: *2185284260
|
|
36
36
|
- !ruby/object:Gem::Dependency
|
|
37
37
|
name: activerecord
|
|
38
|
-
requirement: &
|
|
38
|
+
requirement: &2185283700 !ruby/object:Gem::Requirement
|
|
39
39
|
none: false
|
|
40
40
|
requirements:
|
|
41
41
|
- - ! '>='
|
|
@@ -43,10 +43,10 @@ dependencies:
|
|
|
43
43
|
version: '3'
|
|
44
44
|
type: :development
|
|
45
45
|
prerelease: false
|
|
46
|
-
version_requirements: *
|
|
46
|
+
version_requirements: *2185283700
|
|
47
47
|
- !ruby/object:Gem::Dependency
|
|
48
48
|
name: mysql
|
|
49
|
-
requirement: &
|
|
49
|
+
requirement: &2185283260 !ruby/object:Gem::Requirement
|
|
50
50
|
none: false
|
|
51
51
|
requirements:
|
|
52
52
|
- - ! '>='
|
|
@@ -54,10 +54,10 @@ dependencies:
|
|
|
54
54
|
version: '0'
|
|
55
55
|
type: :development
|
|
56
56
|
prerelease: false
|
|
57
|
-
version_requirements: *
|
|
57
|
+
version_requirements: *2185283260
|
|
58
58
|
- !ruby/object:Gem::Dependency
|
|
59
59
|
name: cohort_scope
|
|
60
|
-
requirement: &
|
|
60
|
+
requirement: &2185282760 !ruby/object:Gem::Requirement
|
|
61
61
|
none: false
|
|
62
62
|
requirements:
|
|
63
63
|
- - ! '>='
|
|
@@ -65,10 +65,10 @@ dependencies:
|
|
|
65
65
|
version: '0'
|
|
66
66
|
type: :development
|
|
67
67
|
prerelease: false
|
|
68
|
-
version_requirements: *
|
|
68
|
+
version_requirements: *2185282760
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
70
|
name: weighted_average
|
|
71
|
-
requirement: &
|
|
71
|
+
requirement: &2185282340 !ruby/object:Gem::Requirement
|
|
72
72
|
none: false
|
|
73
73
|
requirements:
|
|
74
74
|
- - ! '>='
|
|
@@ -76,10 +76,10 @@ dependencies:
|
|
|
76
76
|
version: '0'
|
|
77
77
|
type: :development
|
|
78
78
|
prerelease: false
|
|
79
|
-
version_requirements: *
|
|
79
|
+
version_requirements: *2185282340
|
|
80
80
|
- !ruby/object:Gem::Dependency
|
|
81
81
|
name: rake
|
|
82
|
-
requirement: &
|
|
82
|
+
requirement: &2185281880 !ruby/object:Gem::Requirement
|
|
83
83
|
none: false
|
|
84
84
|
requirements:
|
|
85
85
|
- - ! '>='
|
|
@@ -87,10 +87,10 @@ dependencies:
|
|
|
87
87
|
version: '0'
|
|
88
88
|
type: :development
|
|
89
89
|
prerelease: false
|
|
90
|
-
version_requirements: *
|
|
90
|
+
version_requirements: *2185281880
|
|
91
91
|
- !ruby/object:Gem::Dependency
|
|
92
92
|
name: activesupport
|
|
93
|
-
requirement: &
|
|
93
|
+
requirement: &2185281260 !ruby/object:Gem::Requirement
|
|
94
94
|
none: false
|
|
95
95
|
requirements:
|
|
96
96
|
- - ! '>='
|
|
@@ -98,10 +98,10 @@ dependencies:
|
|
|
98
98
|
version: '3'
|
|
99
99
|
type: :runtime
|
|
100
100
|
prerelease: false
|
|
101
|
-
version_requirements: *
|
|
101
|
+
version_requirements: *2185281260
|
|
102
102
|
- !ruby/object:Gem::Dependency
|
|
103
103
|
name: to_regexp
|
|
104
|
-
requirement: &
|
|
104
|
+
requirement: &2185280640 !ruby/object:Gem::Requirement
|
|
105
105
|
none: false
|
|
106
106
|
requirements:
|
|
107
107
|
- - ! '>='
|
|
@@ -109,7 +109,7 @@ dependencies:
|
|
|
109
109
|
version: 0.0.3
|
|
110
110
|
type: :runtime
|
|
111
111
|
prerelease: false
|
|
112
|
-
version_requirements: *
|
|
112
|
+
version_requirements: *2185280640
|
|
113
113
|
description: Create dictionaries that link rows between two tables using loose matching
|
|
114
114
|
(string similarity) by default and tight matching (regexp) by request.
|
|
115
115
|
email:
|
|
@@ -150,6 +150,7 @@ files:
|
|
|
150
150
|
- lib/loose_tight_dictionary/result.rb
|
|
151
151
|
- lib/loose_tight_dictionary/score.rb
|
|
152
152
|
- lib/loose_tight_dictionary/similarity.rb
|
|
153
|
+
- lib/loose_tight_dictionary/stop_word.rb
|
|
153
154
|
- lib/loose_tight_dictionary/tightener.rb
|
|
154
155
|
- lib/loose_tight_dictionary/version.rb
|
|
155
156
|
- lib/loose_tight_dictionary/wrapper.rb
|