loose_tight_dictionary 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +16 -4
- data/benchmark/memory.rb +1 -1
- data/examples/bts_aircraft/test_bts_aircraft.rb +1 -1
- data/lib/loose_tight_dictionary.rb +20 -19
- data/lib/loose_tight_dictionary/score.rb +5 -0
- data/lib/loose_tight_dictionary/version.rb +1 -1
- data/lib/loose_tight_dictionary/wrapper.rb +19 -6
- data/test/test_cache.rb +1 -1
- data/test/test_loose_tight_dictionary.rb +69 -13
- metadata +19 -19
data/README.rdoc
CHANGED
@@ -17,11 +17,17 @@ Exclusively uses {Dice's Coefficient}[http://en.wikipedia.org/wiki/Dice's_coeffi
|
|
17
17
|
|
18
18
|
Over 2 years in {Brighter Planet's environmental impact API}[http://impact.brighterplanet.com] and {reference data service}[http://data.brighterplanet.com].
|
19
19
|
|
20
|
-
==
|
20
|
+
== Haystacks and how to read them
|
21
|
+
|
22
|
+
The (admittedly imperfect) metaphor is "look for a needle in a haystack"
|
21
23
|
|
22
|
-
|
24
|
+
* needle - the search term
|
25
|
+
* haystack - the records you are searching (<b>your result will be an object from here</b>)
|
23
26
|
|
24
|
-
|
27
|
+
So, what if your needle is a string like <tt>youruguay</tt> and your haystack is full of <tt>Country</tt> objects like <tt><Country name:"Uruguay"></tt>?
|
28
|
+
|
29
|
+
>> LooseTightDictionary.new(countries, :read => :name).find('youruguay')
|
30
|
+
=> <Country name:"Uruguay">
|
25
31
|
|
26
32
|
== Regular expressions
|
27
33
|
|
@@ -49,7 +55,13 @@ Scoring is case-insensitive. Everything is downcased before scoring. This is a c
|
|
49
55
|
|
50
56
|
== Examples
|
51
57
|
|
52
|
-
|
58
|
+
Check out the tests.
|
59
|
+
|
60
|
+
== Speed
|
61
|
+
|
62
|
+
If you add the amatch[http://flori.github.com/amatch/] gem to your Gemfile, it will use that, which is much faster (but {segfaults have been seen in the wild}[https://github.com/flori/amatch/issues/3]). Thanks Flori!
|
63
|
+
|
64
|
+
Otherwise, a pure ruby version derived from the {answer to a StackOverflow question}[http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings] is used. Thanks {marzagao}[http://stackoverflow.com/users/10997/marzagao]!
|
53
65
|
|
54
66
|
== Authors
|
55
67
|
|
data/benchmark/memory.rb
CHANGED
@@ -37,7 +37,7 @@ TIGHTENERS = RemoteTable.new(:url => "file://#{File.expand_path("../../examples/
|
|
37
37
|
IDENTITIES = RemoteTable.new(:url => "file://#{File.expand_path("../../examples/bts_aircraft/identities.csv", __FILE__)}", :headers => :first_row).map { |row| row['regexp'] }
|
38
38
|
|
39
39
|
FINAL_OPTIONS = {
|
40
|
-
:
|
40
|
+
:read => HAYSTACK_READER,
|
41
41
|
:must_match_blocking => MUST_MATCH_BLOCKING,
|
42
42
|
:tighteners => TIGHTENERS,
|
43
43
|
:identities => IDENTITIES,
|
@@ -63,7 +63,7 @@ NEGATIVES = RemoteTable.new :url => "file://#{File.expand_path("../negatives.csv
|
|
63
63
|
# Section 3
|
64
64
|
|
65
65
|
FINAL_OPTIONS = {
|
66
|
-
:
|
66
|
+
:read => HAYSTACK_READER,
|
67
67
|
:must_match_blocking => MUST_MATCH_BLOCKING,
|
68
68
|
:tighteners => TIGHTENERS,
|
69
69
|
:identities => IDENTITIES,
|
@@ -6,7 +6,7 @@ require 'active_support/version'
|
|
6
6
|
active_support/core_ext/object
|
7
7
|
}.each do |active_support_3_requirement|
|
8
8
|
require active_support_3_requirement
|
9
|
-
end if ::ActiveSupport::VERSION::MAJOR
|
9
|
+
end if ::ActiveSupport::VERSION::MAJOR >= 3
|
10
10
|
require 'to_regexp'
|
11
11
|
|
12
12
|
# See the README for more information.
|
@@ -20,24 +20,25 @@ class LooseTightDictionary
|
|
20
20
|
autoload :Score, 'loose_tight_dictionary/score'
|
21
21
|
autoload :CachedResult, 'loose_tight_dictionary/cached_result'
|
22
22
|
|
23
|
-
class Freed < RuntimeError; end
|
24
|
-
|
25
23
|
attr_reader :options
|
26
24
|
attr_reader :haystack
|
27
25
|
attr_reader :records
|
28
26
|
|
29
27
|
# haystack - a bunch of records
|
30
28
|
# options
|
31
|
-
# * tighteners: regexps
|
32
|
-
# * identities: regexps
|
29
|
+
# * tighteners: regexps (see readme)
|
30
|
+
# * identities: regexps
|
31
|
+
# * blockings: regexps
|
32
|
+
# * read: how to interpret each entry in the 'haystack', either a Proc or a symbol
|
33
33
|
def initialize(records, options = {})
|
34
34
|
@options = options.symbolize_keys
|
35
35
|
@records = records
|
36
|
-
|
36
|
+
read = options[:read] || options[:haystack_reader]
|
37
|
+
@haystack = records.map { |record| Wrapper.new self, record, read }
|
37
38
|
end
|
38
39
|
|
39
40
|
def last_result
|
40
|
-
@last_result
|
41
|
+
@last_result || raise(::RuntimeError, "[loose_tight_dictionary] You can't access the last result until you've run a find with :gather_last_result => true")
|
41
42
|
end
|
42
43
|
|
43
44
|
def log(str = '') #:nodoc:
|
@@ -50,11 +51,13 @@ class LooseTightDictionary
|
|
50
51
|
end
|
51
52
|
|
52
53
|
def find(needle, options = {})
|
53
|
-
raise
|
54
|
-
free_last_result
|
54
|
+
raise ::RuntimeError, "[loose_tight_dictionary] Dictionary has already been freed, can't perform more finds" if freed?
|
55
55
|
|
56
56
|
options = options.symbolize_keys
|
57
|
-
gather_last_result = options.fetch(:gather_last_result,
|
57
|
+
if gather_last_result = options.fetch(:gather_last_result, false)
|
58
|
+
free_last_result
|
59
|
+
@last_result = Result.new
|
60
|
+
end
|
58
61
|
find_all = options.fetch(:find_all, false)
|
59
62
|
|
60
63
|
if gather_last_result
|
@@ -63,7 +66,7 @@ class LooseTightDictionary
|
|
63
66
|
last_result.blockings = blockings
|
64
67
|
end
|
65
68
|
|
66
|
-
needle = Wrapper.new
|
69
|
+
needle = Wrapper.new self, needle
|
67
70
|
|
68
71
|
if gather_last_result
|
69
72
|
last_result.needle = needle
|
@@ -126,7 +129,9 @@ class LooseTightDictionary
|
|
126
129
|
last_result.similarities = similarities
|
127
130
|
end
|
128
131
|
|
129
|
-
|
132
|
+
best_similarity = similarities[-1]
|
133
|
+
if best_similarity.best_score.to_f > 0
|
134
|
+
record = best_similarity.wrapper2.record
|
130
135
|
if gather_last_result
|
131
136
|
last_result.record = record
|
132
137
|
last_result.score = best_similarity.best_score.to_f
|
@@ -140,7 +145,7 @@ class LooseTightDictionary
|
|
140
145
|
# d = LooseTightDictionary.new ['737', '747', '757' ]
|
141
146
|
# d.explain 'boeing 737-100'
|
142
147
|
def explain(needle)
|
143
|
-
record = find needle
|
148
|
+
record = find needle, :gather_last_result => true
|
144
149
|
log "#" * 150
|
145
150
|
log "# Match #{needle.inspect} => #{record.inspect}"
|
146
151
|
log "#" * 150
|
@@ -190,16 +195,12 @@ class LooseTightDictionary
|
|
190
195
|
log record.inspect
|
191
196
|
end
|
192
197
|
|
193
|
-
def haystack_reader
|
194
|
-
options[:haystack_reader]
|
195
|
-
end
|
196
|
-
|
197
198
|
def must_match_blocking
|
198
|
-
options
|
199
|
+
options.fetch :must_match_blocking, false
|
199
200
|
end
|
200
201
|
|
201
202
|
def first_blocking_decides
|
202
|
-
options
|
203
|
+
options.fetch :first_blocking_decides, false
|
203
204
|
end
|
204
205
|
|
205
206
|
def tighteners
|
@@ -43,6 +43,11 @@ class LooseTightDictionary
|
|
43
43
|
def dices_coefficient(str1, str2)
|
44
44
|
str1 = str1.downcase
|
45
45
|
str2 = str2.downcase
|
46
|
+
if str1 == str2
|
47
|
+
return 1.0
|
48
|
+
elsif str1.length == 1 and str2.length == 1
|
49
|
+
return 0.0
|
50
|
+
end
|
46
51
|
pairs1 = (0..str1.length-2).map do |i|
|
47
52
|
str1[i,2]
|
48
53
|
end.reject do |pair|
|
@@ -3,12 +3,12 @@ class LooseTightDictionary
|
|
3
3
|
class Wrapper #:nodoc: all
|
4
4
|
attr_reader :parent
|
5
5
|
attr_reader :record
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :read
|
7
7
|
|
8
|
-
def initialize(
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
def initialize(parent, record, read = nil)
|
9
|
+
@parent = parent
|
10
|
+
@record = record
|
11
|
+
@read = read
|
12
12
|
end
|
13
13
|
|
14
14
|
def inspect
|
@@ -16,7 +16,20 @@ class LooseTightDictionary
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def to_str
|
19
|
-
@to_str ||=
|
19
|
+
@to_str ||= case read
|
20
|
+
when ::Proc
|
21
|
+
read.call record
|
22
|
+
when ::Symbol
|
23
|
+
if record.respond_to?(read)
|
24
|
+
record.send read
|
25
|
+
else
|
26
|
+
record[read]
|
27
|
+
end
|
28
|
+
when ::NilClass
|
29
|
+
record
|
30
|
+
else
|
31
|
+
record[read]
|
32
|
+
end.to_s
|
20
33
|
end
|
21
34
|
|
22
35
|
alias :to_s :to_str
|
data/test/test_cache.rb
CHANGED
@@ -33,7 +33,7 @@ class Aircraft < ActiveRecord::Base
|
|
33
33
|
end
|
34
34
|
|
35
35
|
def self.loose_tight_dictionary
|
36
|
-
@loose_tight_dictionary ||= LooseTightDictionary.new all, :
|
36
|
+
@loose_tight_dictionary ||= LooseTightDictionary.new all, :read => ::Proc.new { |straw| straw.aircraft_description }
|
37
37
|
end
|
38
38
|
|
39
39
|
def self.create_table
|
@@ -13,17 +13,23 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
13
13
|
def test_001_find
|
14
14
|
d = LooseTightDictionary.new %w{ NISSAN HONDA }
|
15
15
|
assert_equal 'NISSAN', d.find('MISSAM')
|
16
|
+
|
17
|
+
d = LooseTightDictionary.new [ 'X' ]
|
18
|
+
assert_equal 'X', d.find('X')
|
19
|
+
assert_equal nil, d.find('A')
|
16
20
|
end
|
17
21
|
|
18
|
-
def
|
22
|
+
def test_002_dont_gather_last_result_by_default
|
19
23
|
d = LooseTightDictionary.new %w{ NISSAN HONDA }
|
20
|
-
|
21
|
-
|
24
|
+
d.find('MISSAM')
|
25
|
+
assert_raises(::RuntimeError, /gather_last_result/) do
|
26
|
+
d.last_result
|
27
|
+
end
|
22
28
|
end
|
23
29
|
|
24
30
|
def test_003_last_result
|
25
31
|
d = LooseTightDictionary.new %w{ NISSAN HONDA }
|
26
|
-
d.find 'MISSAM'
|
32
|
+
d.find 'MISSAM', :gather_last_result => true
|
27
33
|
assert_equal 0.6, d.last_result.score
|
28
34
|
assert_equal 'NISSAN', d.last_result.record
|
29
35
|
end
|
@@ -48,18 +54,18 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
48
54
|
|
49
55
|
def test_008_identify_false_positive
|
50
56
|
d = LooseTightDictionary.new %w{ foo bar }, :identities => [ /ba(.)/ ]
|
51
|
-
assert_equal
|
57
|
+
assert_equal nil, d.find('baz')
|
52
58
|
end
|
53
59
|
|
54
|
-
|
55
|
-
|
56
|
-
assert_equal 'X', d.find('X')
|
57
|
-
assert_equal 'X', d.find('A')
|
58
|
-
|
60
|
+
# TODO this is not very helpful
|
61
|
+
def test_009_blocking
|
59
62
|
d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
60
63
|
assert_equal 'X', d.find('X')
|
61
|
-
assert_equal
|
62
|
-
|
64
|
+
assert_equal nil, d.find('A')
|
65
|
+
end
|
66
|
+
|
67
|
+
# TODO this is not very helpful
|
68
|
+
def test_0095_must_match_blocking
|
63
69
|
d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
64
70
|
assert_equal 'X', d.find('X')
|
65
71
|
assert_equal nil, d.find('A')
|
@@ -68,7 +74,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
68
74
|
def test_011_free
|
69
75
|
d = LooseTightDictionary.new %w{ NISSAN HONDA }
|
70
76
|
d.free
|
71
|
-
assert_raises(
|
77
|
+
assert_raises(::RuntimeError, /free/) do
|
72
78
|
d.find('foobar')
|
73
79
|
end
|
74
80
|
end
|
@@ -97,4 +103,54 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
97
103
|
d = LooseTightDictionary.new [ 'Boeing 747', 'Boeing 747SR', 'Boeing ER6' ], :blockings => [ /(boeing \d{3})/i, /boeing/i ], :first_blocking_decides => true, :identities => [ /boeing (7|E)/i ]
|
98
104
|
assert_equal [ 'Boeing ER6' ], d.find_all('Boeing ER6')
|
99
105
|
end
|
106
|
+
|
107
|
+
MyStruct = Struct.new(:one, :two)
|
108
|
+
def test_014_symbol_read_sends_method
|
109
|
+
ab = MyStruct.new('a', 'b')
|
110
|
+
ba = MyStruct.new('b', 'a')
|
111
|
+
haystack = [ab, ba]
|
112
|
+
by_first = LooseTightDictionary.new haystack, :read => :one
|
113
|
+
by_last = LooseTightDictionary.new haystack, :read => :two
|
114
|
+
assert_equal ab, by_first.find('a')
|
115
|
+
assert_equal ab, by_last.find('b')
|
116
|
+
assert_equal ba, by_first.find('b')
|
117
|
+
assert_equal ba, by_last.find('a')
|
118
|
+
end
|
119
|
+
|
120
|
+
def test_015_symbol_read_reads_array
|
121
|
+
ab = ['a', 'b']
|
122
|
+
ba = ['b', 'a']
|
123
|
+
haystack = [ab, ba]
|
124
|
+
by_first = LooseTightDictionary.new haystack, :read => 0
|
125
|
+
by_last = LooseTightDictionary.new haystack, :read => 1
|
126
|
+
assert_equal ab, by_first.find('a')
|
127
|
+
assert_equal ab, by_last.find('b')
|
128
|
+
assert_equal ba, by_first.find('b')
|
129
|
+
assert_equal ba, by_last.find('a')
|
130
|
+
end
|
131
|
+
|
132
|
+
def test_016_symbol_read_reads_hash
|
133
|
+
ab = { :one => 'a', :two => 'b' }
|
134
|
+
ba = { :one => 'b', :two => 'a' }
|
135
|
+
haystack = [ab, ba]
|
136
|
+
by_first = LooseTightDictionary.new haystack, :read => :one
|
137
|
+
by_last = LooseTightDictionary.new haystack, :read => :two
|
138
|
+
assert_equal ab, by_first.find('a')
|
139
|
+
assert_equal ab, by_last.find('b')
|
140
|
+
assert_equal ba, by_first.find('b')
|
141
|
+
assert_equal ba, by_last.find('a')
|
142
|
+
end
|
143
|
+
|
144
|
+
def test_017_understands_haystack_reader_option
|
145
|
+
ab = ['a', 'b']
|
146
|
+
ba = ['b', 'a']
|
147
|
+
haystack = [ab, ba]
|
148
|
+
by_first = LooseTightDictionary.new haystack, :haystack_reader => 0
|
149
|
+
assert_equal ab, by_first.find('a')
|
150
|
+
assert_equal ba, by_first.find('b')
|
151
|
+
end
|
152
|
+
|
153
|
+
def test_018_no_result_if_best_score_is_zero
|
154
|
+
assert_equal nil, LooseTightDictionary.new(['a']).find('b')
|
155
|
+
end
|
100
156
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: loose_tight_dictionary
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-12-03 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &2164915940 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2164915940
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: remote_table
|
27
|
-
requirement: &
|
27
|
+
requirement: &2164983440 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2164983440
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activerecord
|
38
|
-
requirement: &
|
38
|
+
requirement: &2165077220 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2165077220
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: mysql
|
49
|
-
requirement: &
|
49
|
+
requirement: &2165156580 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2165156580
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cohort_scope
|
60
|
-
requirement: &
|
60
|
+
requirement: &2165225600 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2165225600
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: weighted_average
|
71
|
-
requirement: &
|
71
|
+
requirement: &2165400220 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2165400220
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rake
|
82
|
-
requirement: &
|
82
|
+
requirement: &2165637020 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2165637020
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: activesupport
|
93
|
-
requirement: &
|
93
|
+
requirement: &2165774580 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '3'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2165774580
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: to_regexp
|
104
|
-
requirement: &
|
104
|
+
requirement: &2165808840 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: 0.0.3
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2165808840
|
113
113
|
description: Create dictionaries that link rows between two tables using loose matching
|
114
114
|
(string similarity) by default and tight matching (regexp) by request.
|
115
115
|
email:
|