loose_tight_dictionary 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +40 -1
- data/VERSION +1 -1
- data/examples/first_name_matching.rb +23 -0
- data/lib/loose_tight_dictionary.rb +32 -9
- data/test/test_loose_tight_dictionary.rb +22 -0
- metadata +5 -3
data/README.rdoc
CHANGED
|
@@ -1,6 +1,45 @@
|
|
|
1
1
|
= loose_tight_dictionary
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Match things based on string similarity (using the Pair Distance algorithm) and regular expressions.
|
|
4
|
+
|
|
5
|
+
= Quickstart
|
|
6
|
+
|
|
7
|
+
>> right_records = [ 'seamus', 'andy', 'ben' ]
|
|
8
|
+
=> [...]
|
|
9
|
+
>> left_record = 'Shamus Heaney'
|
|
10
|
+
=> [...]
|
|
11
|
+
>> d = LooseTightDictionary.new right_records
|
|
12
|
+
=> [...]
|
|
13
|
+
>> puts d.left_to_right left_record
|
|
14
|
+
=> 'seamus'
|
|
15
|
+
|
|
16
|
+
Try running the included example file:
|
|
17
|
+
|
|
18
|
+
$ ruby examples/first_name_matching.rb
|
|
19
|
+
Left side (input)
|
|
20
|
+
====================
|
|
21
|
+
Mr. Seamus
|
|
22
|
+
Sr. Andy
|
|
23
|
+
Master BenT
|
|
24
|
+
|
|
25
|
+
Right side (output)
|
|
26
|
+
====================
|
|
27
|
+
seamus
|
|
28
|
+
andy
|
|
29
|
+
ben
|
|
30
|
+
|
|
31
|
+
Results
|
|
32
|
+
====================
|
|
33
|
+
Left record (input) Right record (output) Prefix used (if any) Score
|
|
34
|
+
Mr. Seamus seamus NULL 0.666666666666667
|
|
35
|
+
Sr. Andy andy NULL 0.5
|
|
36
|
+
Master BenT ben NULL 0.2
|
|
37
|
+
|
|
38
|
+
= Improving dictionaries
|
|
39
|
+
|
|
40
|
+
Similarity matching will only get you so far.
|
|
41
|
+
|
|
42
|
+
TODO: regex usage
|
|
4
43
|
|
|
5
44
|
== Note on Patches/Pull Requests
|
|
6
45
|
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.0.5
|
|
1
|
+
0.0.6
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
# require 'loose_tight_dictionary'
|
|
4
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
|
|
5
|
+
right_side = [ 'seamus', 'andy', 'ben' ]
|
|
6
|
+
left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
|
|
7
|
+
|
|
8
|
+
puts "Left side (input)"
|
|
9
|
+
puts "=" * 20
|
|
10
|
+
puts left_side
|
|
11
|
+
puts
|
|
12
|
+
|
|
13
|
+
puts "Right side (output)"
|
|
14
|
+
puts "=" * 20
|
|
15
|
+
puts right_side
|
|
16
|
+
puts
|
|
17
|
+
|
|
18
|
+
puts "Results"
|
|
19
|
+
puts "=" * 20
|
|
20
|
+
d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
|
|
21
|
+
d.check left_side
|
|
22
|
+
|
|
23
|
+
puts d.left_to_right 'Shamus Heaney'
|
|
@@ -39,10 +39,11 @@ class LooseTightDictionary
|
|
|
39
39
|
include Amatch
|
|
40
40
|
|
|
41
41
|
attr_reader :right_records
|
|
42
|
-
attr_reader :logger
|
|
43
|
-
attr_reader :tee
|
|
44
42
|
attr_reader :case_sensitive
|
|
45
|
-
|
|
43
|
+
|
|
44
|
+
attr_accessor :logger
|
|
45
|
+
attr_accessor :tee
|
|
46
|
+
attr_accessor :tee_format
|
|
46
47
|
attr_accessor :positives
|
|
47
48
|
attr_accessor :negatives
|
|
48
49
|
attr_accessor :left_reader
|
|
@@ -59,6 +60,7 @@ class LooseTightDictionary
|
|
|
59
60
|
@negatives = options[:negatives]
|
|
60
61
|
@logger = options[:logger]
|
|
61
62
|
@tee = options[:tee]
|
|
63
|
+
@tee_format = options[:tee_format] || :fixed_width
|
|
62
64
|
@case_sensitive = options[:case_sensitive] || false
|
|
63
65
|
end
|
|
64
66
|
|
|
@@ -106,15 +108,24 @@ class LooseTightDictionary
|
|
|
106
108
|
end
|
|
107
109
|
|
|
108
110
|
def check(left_records)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
111
|
+
header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]
|
|
112
|
+
case tee_format
|
|
113
|
+
when :csv
|
|
114
|
+
tee.andand.puts header.flatten.to_csv
|
|
115
|
+
when :fixed_width
|
|
116
|
+
tee.andand.puts header.map { |i| i.to_s.ljust(30) }.join
|
|
112
117
|
end
|
|
118
|
+
|
|
113
119
|
left_records.each do |left_record|
|
|
114
120
|
begin
|
|
115
121
|
right_record = left_to_right left_record
|
|
116
122
|
ensure
|
|
117
|
-
|
|
123
|
+
case tee_format
|
|
124
|
+
when :csv
|
|
125
|
+
tee.andand.puts $ltd_1.flatten.to_csv
|
|
126
|
+
when :fixed_width
|
|
127
|
+
tee.andand.puts $ltd_1.map { |i| i.to_s.ljust(30) }.join if $ltd_1
|
|
128
|
+
end
|
|
118
129
|
end
|
|
119
130
|
end
|
|
120
131
|
end
|
|
@@ -292,12 +303,24 @@ class LooseTightDictionary
|
|
|
292
303
|
|
|
293
304
|
def read_left(left_record)
|
|
294
305
|
return if left_record.nil?
|
|
295
|
-
|
|
306
|
+
if left_reader
|
|
307
|
+
left_reader.call(left_record)
|
|
308
|
+
elsif left_record.is_a?(String)
|
|
309
|
+
left_record
|
|
310
|
+
else
|
|
311
|
+
left_record[0]
|
|
312
|
+
end
|
|
296
313
|
end
|
|
297
314
|
|
|
298
315
|
def read_right(right_record)
|
|
299
316
|
return if right_record.nil?
|
|
300
|
-
|
|
317
|
+
if right_reader
|
|
318
|
+
right_reader.call(right_record)
|
|
319
|
+
elsif right_record.is_a?(String)
|
|
320
|
+
right_record
|
|
321
|
+
else
|
|
322
|
+
right_record[0]
|
|
323
|
+
end
|
|
301
324
|
end
|
|
302
325
|
|
|
303
326
|
# Thanks William James!
|
|
@@ -68,6 +68,28 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
|
68
68
|
end
|
|
69
69
|
|
|
70
70
|
if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
|
|
71
|
+
# the example from the readme, considerably uglier here
|
|
72
|
+
should "check a simple table" do
|
|
73
|
+
@right = [ 'seamus', 'andy', 'ben' ]
|
|
74
|
+
@positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
|
|
75
|
+
left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
|
|
76
|
+
|
|
77
|
+
assert_nothing_raised do
|
|
78
|
+
ltd.check left
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
should "treat a String as a full record if passed through" do
|
|
83
|
+
dash = 'DHC8-400'
|
|
84
|
+
b747 = 'B747200/300'
|
|
85
|
+
dc9 = 'DC-9-10'
|
|
86
|
+
right_records = [ dash, b747, dc9 ]
|
|
87
|
+
simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
|
|
88
|
+
assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
|
|
89
|
+
assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
|
|
90
|
+
assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
|
|
91
|
+
end
|
|
92
|
+
|
|
71
93
|
should "call it a mismatch if you hit a blank positive" do
|
|
72
94
|
@positives.push [@a_left[0], '']
|
|
73
95
|
assert_raises(LooseTightDictionary::Mismatch) do
|
metadata
CHANGED
|
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
|
5
5
|
segments:
|
|
6
6
|
- 0
|
|
7
7
|
- 0
|
|
8
|
-
- 5
|
|
9
|
-
version: 0.0.5
|
|
8
|
+
- 6
|
|
9
|
+
version: 0.0.6
|
|
10
10
|
platform: ruby
|
|
11
11
|
authors:
|
|
12
12
|
- Seamus Abshere
|
|
@@ -14,7 +14,7 @@ autorequire:
|
|
|
14
14
|
bindir: bin
|
|
15
15
|
cert_chain: []
|
|
16
16
|
|
|
17
|
-
date: 2010-05-
|
|
17
|
+
date: 2010-05-13 00:00:00 -04:00
|
|
18
18
|
default_executable:
|
|
19
19
|
dependencies:
|
|
20
20
|
- !ruby/object:Gem::Dependency
|
|
@@ -115,6 +115,7 @@ files:
|
|
|
115
115
|
- README.rdoc
|
|
116
116
|
- Rakefile
|
|
117
117
|
- VERSION
|
|
118
|
+
- examples/first_name_matching.rb
|
|
118
119
|
- examples/icao-bts.rb
|
|
119
120
|
- examples/icao-bts.xls
|
|
120
121
|
- lib/loose_tight_dictionary.rb
|
|
@@ -153,4 +154,5 @@ summary: Allows iterative development of dictionaries for big data sets.
|
|
|
153
154
|
test_files:
|
|
154
155
|
- test/helper.rb
|
|
155
156
|
- test/test_loose_tight_dictionary.rb
|
|
157
|
+
- examples/first_name_matching.rb
|
|
156
158
|
- examples/icao-bts.rb
|