loose_tight_dictionary 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +40 -1
- data/VERSION +1 -1
- data/examples/first_name_matching.rb +23 -0
- data/lib/loose_tight_dictionary.rb +32 -9
- data/test/test_loose_tight_dictionary.rb +22 -0
- metadata +5 -3
data/README.rdoc
CHANGED
@@ -1,6 +1,45 @@
|
|
1
1
|
= loose_tight_dictionary
|
2
2
|
|
3
|
-
|
3
|
+
Match things based on string similarity (using the Pair Distance algorithm) and regular expressions.
|
4
|
+
|
5
|
+
= Quickstart
|
6
|
+
|
7
|
+
>> right_records = [ 'seamus', 'andy', 'ben' ]
|
8
|
+
=> [...]
|
9
|
+
>> left_record = 'Shamus Heaney'
|
10
|
+
=> [...]
|
11
|
+
>> d = LooseTightDictionary.new right_records
|
12
|
+
=> [...]
|
13
|
+
>> puts d.left_to_right left_record
|
14
|
+
=> 'seamus'
|
15
|
+
|
16
|
+
Try running the included example file:
|
17
|
+
|
18
|
+
$ ruby examples/first_name_matching.rb
|
19
|
+
Left side (input)
|
20
|
+
====================
|
21
|
+
Mr. Seamus
|
22
|
+
Sr. Andy
|
23
|
+
Master BenT
|
24
|
+
|
25
|
+
Right side (output)
|
26
|
+
====================
|
27
|
+
seamus
|
28
|
+
andy
|
29
|
+
ben
|
30
|
+
|
31
|
+
Results
|
32
|
+
====================
|
33
|
+
Left record (input) Right record (output) Prefix used (if any) Score
|
34
|
+
Mr. Seamus seamus NULL 0.666666666666667
|
35
|
+
Sr. Andy andy NULL 0.5
|
36
|
+
Master BenT ben NULL 0.2
|
37
|
+
|
38
|
+
= Improving dictionaries
|
39
|
+
|
40
|
+
Similarity matching will only get you so far.
|
41
|
+
|
42
|
+
TODO: regex usage
|
4
43
|
|
5
44
|
== Note on Patches/Pull Requests
|
6
45
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.5
|
1
|
+
0.0.6
|
data/examples/first_name_matching.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
# require 'loose_tight_dictionary'
|
4
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
|
5
|
+
right_side = [ 'seamus', 'andy', 'ben' ]
|
6
|
+
left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
|
7
|
+
|
8
|
+
puts "Left side (input)"
|
9
|
+
puts "=" * 20
|
10
|
+
puts left_side
|
11
|
+
puts
|
12
|
+
|
13
|
+
puts "Right side (output)"
|
14
|
+
puts "=" * 20
|
15
|
+
puts right_side
|
16
|
+
puts
|
17
|
+
|
18
|
+
puts "Results"
|
19
|
+
puts "=" * 20
|
20
|
+
d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
|
21
|
+
d.check left_side
|
22
|
+
|
23
|
+
puts d.left_to_right 'Shamus Heaney'
|
data/lib/loose_tight_dictionary.rb
CHANGED
@@ -39,10 +39,11 @@ class LooseTightDictionary
|
|
39
39
|
include Amatch
|
40
40
|
|
41
41
|
attr_reader :right_records
|
42
|
-
attr_reader :logger
|
43
|
-
attr_reader :tee
|
44
42
|
attr_reader :case_sensitive
|
45
|
-
|
43
|
+
|
44
|
+
attr_accessor :logger
|
45
|
+
attr_accessor :tee
|
46
|
+
attr_accessor :tee_format
|
46
47
|
attr_accessor :positives
|
47
48
|
attr_accessor :negatives
|
48
49
|
attr_accessor :left_reader
|
@@ -59,6 +60,7 @@ class LooseTightDictionary
|
|
59
60
|
@negatives = options[:negatives]
|
60
61
|
@logger = options[:logger]
|
61
62
|
@tee = options[:tee]
|
63
|
+
@tee_format = options[:tee_format] || :fixed_width
|
62
64
|
@case_sensitive = options[:case_sensitive] || false
|
63
65
|
end
|
64
66
|
|
@@ -106,15 +108,24 @@ class LooseTightDictionary
|
|
106
108
|
end
|
107
109
|
|
108
110
|
def check(left_records)
|
109
|
-
|
110
|
-
|
111
|
-
|
111
|
+
header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]
|
112
|
+
case tee_format
|
113
|
+
when :csv
|
114
|
+
tee.andand.puts header.flatten.to_csv
|
115
|
+
when :fixed_width
|
116
|
+
tee.andand.puts header.map { |i| i.to_s.ljust(30) }.join
|
112
117
|
end
|
118
|
+
|
113
119
|
left_records.each do |left_record|
|
114
120
|
begin
|
115
121
|
right_record = left_to_right left_record
|
116
122
|
ensure
|
117
|
-
|
123
|
+
case tee_format
|
124
|
+
when :csv
|
125
|
+
tee.andand.puts $ltd_1.flatten.to_csv
|
126
|
+
when :fixed_width
|
127
|
+
tee.andand.puts $ltd_1.map { |i| i.to_s.ljust(30) }.join if $ltd_1
|
128
|
+
end
|
118
129
|
end
|
119
130
|
end
|
120
131
|
end
|
@@ -292,12 +303,24 @@ class LooseTightDictionary
|
|
292
303
|
|
293
304
|
def read_left(left_record)
|
294
305
|
return if left_record.nil?
|
295
|
-
|
306
|
+
if left_reader
|
307
|
+
left_reader.call(left_record)
|
308
|
+
elsif left_record.is_a?(String)
|
309
|
+
left_record
|
310
|
+
else
|
311
|
+
left_record[0]
|
312
|
+
end
|
296
313
|
end
|
297
314
|
|
298
315
|
def read_right(right_record)
|
299
316
|
return if right_record.nil?
|
300
|
-
|
317
|
+
if right_reader
|
318
|
+
right_reader.call(right_record)
|
319
|
+
elsif right_record.is_a?(String)
|
320
|
+
right_record
|
321
|
+
else
|
322
|
+
right_record[0]
|
323
|
+
end
|
301
324
|
end
|
302
325
|
|
303
326
|
# Thanks William James!
|
data/test/test_loose_tight_dictionary.rb
CHANGED
@@ -68,6 +68,28 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
68
68
|
end
|
69
69
|
|
70
70
|
if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
|
71
|
+
# the example from the readme, considerably uglier here
|
72
|
+
should "check a simple table" do
|
73
|
+
@right = [ 'seamus', 'andy', 'ben' ]
|
74
|
+
@positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
|
75
|
+
left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
|
76
|
+
|
77
|
+
assert_nothing_raised do
|
78
|
+
ltd.check left
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
should "treat a String as a full record if passed through" do
|
83
|
+
dash = 'DHC8-400'
|
84
|
+
b747 = 'B747200/300'
|
85
|
+
dc9 = 'DC-9-10'
|
86
|
+
right_records = [ dash, b747, dc9 ]
|
87
|
+
simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
|
88
|
+
assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
|
89
|
+
assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
|
90
|
+
assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
|
91
|
+
end
|
92
|
+
|
71
93
|
should "call it a mismatch if you hit a blank positive" do
|
72
94
|
@positives.push [@a_left[0], '']
|
73
95
|
assert_raises(LooseTightDictionary::Mismatch) do
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 5
|
9
|
-
version: 0.0.5
|
8
|
+
- 6
|
9
|
+
version: 0.0.6
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Seamus Abshere
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-05-
|
17
|
+
date: 2010-05-13 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -115,6 +115,7 @@ files:
|
|
115
115
|
- README.rdoc
|
116
116
|
- Rakefile
|
117
117
|
- VERSION
|
118
|
+
- examples/first_name_matching.rb
|
118
119
|
- examples/icao-bts.rb
|
119
120
|
- examples/icao-bts.xls
|
120
121
|
- lib/loose_tight_dictionary.rb
|
@@ -153,4 +154,5 @@ summary: Allows iterative development of dictionaries for big data sets.
|
|
153
154
|
test_files:
|
154
155
|
- test/helper.rb
|
155
156
|
- test/test_loose_tight_dictionary.rb
|
157
|
+
- examples/first_name_matching.rb
|
156
158
|
- examples/icao-bts.rb
|