loose_tight_dictionary 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -1,6 +1,45 @@
1
1
  = loose_tight_dictionary
2
2
 
3
- Description goes here.
3
+ Match things based on string similarity (using the Pair Distance algorithm) and regular expressions.
4
+
5
+ = Quickstart
6
+
7
+ >> right_records = [ 'seamus', 'andy', 'ben' ]
8
+ => [...]
9
+ >> left_record = 'Shamus Heaney'
10
+ => [...]
11
+ >> d = LooseTightDictionary.new right_records
12
+ => [...]
13
+ >> d.left_to_right left_record
14
+ => 'seamus'
15
+
16
+ Try running the included example file:
17
+
18
+ $ ruby examples/first_name_matching.rb
19
+ Left side (input)
20
+ ====================
21
+ Mr. Seamus
22
+ Sr. Andy
23
+ Master BenT
24
+
25
+ Right side (output)
26
+ ====================
27
+ seamus
28
+ andy
29
+ ben
30
+
31
+ Results
32
+ ====================
33
+ Left record (input) Right record (output) Prefix used (if any) Score
34
+ Mr. Seamus seamus NULL 0.666666666666667
35
+ Sr. Andy andy NULL 0.5
36
+ Master BenT ben NULL 0.2
37
+
38
+ = Improving dictionaries
39
+
40
+ Similarity matching will only get you so far.
41
+
42
+ TODO: regex usage
4
43
 
5
44
  == Note on Patches/Pull Requests
6
45
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.5
1
+ 0.0.6
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ # require 'loose_tight_dictionary'
4
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
5
+ right_side = [ 'seamus', 'andy', 'ben' ]
6
+ left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
7
+
8
+ puts "Left side (input)"
9
+ puts "=" * 20
10
+ puts left_side
11
+ puts
12
+
13
+ puts "Right side (output)"
14
+ puts "=" * 20
15
+ puts right_side
16
+ puts
17
+
18
+ puts "Results"
19
+ puts "=" * 20
20
+ d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
21
+ d.check left_side
22
+
23
+ puts d.left_to_right 'Shamus Heaney'
@@ -39,10 +39,11 @@ class LooseTightDictionary
39
39
  include Amatch
40
40
 
41
41
  attr_reader :right_records
42
- attr_reader :logger
43
- attr_reader :tee
44
42
  attr_reader :case_sensitive
45
-
43
+
44
+ attr_accessor :logger
45
+ attr_accessor :tee
46
+ attr_accessor :tee_format
46
47
  attr_accessor :positives
47
48
  attr_accessor :negatives
48
49
  attr_accessor :left_reader
@@ -59,6 +60,7 @@ class LooseTightDictionary
59
60
  @negatives = options[:negatives]
60
61
  @logger = options[:logger]
61
62
  @tee = options[:tee]
63
+ @tee_format = options[:tee_format] || :fixed_width
62
64
  @case_sensitive = options[:case_sensitive] || false
63
65
  end
64
66
 
@@ -106,15 +108,24 @@ class LooseTightDictionary
106
108
  end
107
109
 
108
110
  def check(left_records)
109
- unless positives.present? or negatives.present?
110
- logger.andand.info "You didn't define any positives or negatives, so running check doesn't do anything"
111
- return
111
+ header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]
112
+ case tee_format
113
+ when :csv
114
+ tee.andand.puts header.flatten.to_csv
115
+ when :fixed_width
116
+ tee.andand.puts header.map { |i| i.to_s.ljust(30) }.join
112
117
  end
118
+
113
119
  left_records.each do |left_record|
114
120
  begin
115
121
  right_record = left_to_right left_record
116
122
  ensure
117
- tee.andand.puts [ read_left(left_record), read_right($ltd_0), $ltd_1 ].flatten.to_csv
123
+ case tee_format
124
+ when :csv
125
+ tee.andand.puts $ltd_1.flatten.to_csv
126
+ when :fixed_width
127
+ tee.andand.puts $ltd_1.map { |i| i.to_s.ljust(30) }.join if $ltd_1
128
+ end
118
129
  end
119
130
  end
120
131
  end
@@ -292,12 +303,24 @@ class LooseTightDictionary
292
303
 
293
304
  def read_left(left_record)
294
305
  return if left_record.nil?
295
- left_reader ? left_reader.call(left_record) : left_record[0]
306
+ if left_reader
307
+ left_reader.call(left_record)
308
+ elsif left_record.is_a?(String)
309
+ left_record
310
+ else
311
+ left_record[0]
312
+ end
296
313
  end
297
314
 
298
315
  def read_right(right_record)
299
316
  return if right_record.nil?
300
- right_reader ? right_reader.call(right_record) : right_record[0]
317
+ if right_reader
318
+ right_reader.call(right_record)
319
+ elsif right_record.is_a?(String)
320
+ right_record
321
+ else
322
+ right_record[0]
323
+ end
301
324
  end
302
325
 
303
326
  # Thanks William James!
@@ -68,6 +68,28 @@ class TestLooseTightDictionary < Test::Unit::TestCase
68
68
  end
69
69
 
70
70
  if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
71
+ # the example from the readme, considerably uglier here
72
+ should "check a simple table" do
73
+ @right = [ 'seamus', 'andy', 'ben' ]
74
+ @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
75
+ left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
76
+
77
+ assert_nothing_raised do
78
+ ltd.check left
79
+ end
80
+ end
81
+
82
+ should "treat a String as a full record if passed through" do
83
+ dash = 'DHC8-400'
84
+ b747 = 'B747200/300'
85
+ dc9 = 'DC-9-10'
86
+ right_records = [ dash, b747, dc9 ]
87
+ simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
88
+ assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
89
+ assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
90
+ assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
91
+ end
92
+
71
93
  should "call it a mismatch if you hit a blank positive" do
72
94
  @positives.push [@a_left[0], '']
73
95
  assert_raises(LooseTightDictionary::Mismatch) do
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 5
9
- version: 0.0.5
8
+ - 6
9
+ version: 0.0.6
10
10
  platform: ruby
11
11
  authors:
12
12
  - Seamus Abshere
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-05-03 00:00:00 -04:00
17
+ date: 2010-05-13 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -115,6 +115,7 @@ files:
115
115
  - README.rdoc
116
116
  - Rakefile
117
117
  - VERSION
118
+ - examples/first_name_matching.rb
118
119
  - examples/icao-bts.rb
119
120
  - examples/icao-bts.xls
120
121
  - lib/loose_tight_dictionary.rb
@@ -153,4 +154,5 @@ summary: Allows iterative development of dictionaries for big data sets.
153
154
  test_files:
154
155
  - test/helper.rb
155
156
  - test/test_loose_tight_dictionary.rb
157
+ - examples/first_name_matching.rb
156
158
  - examples/icao-bts.rb