loose_tight_dictionary 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -1,6 +1,45 @@
1
1
  = loose_tight_dictionary
2
2
 
3
- Description goes here.
3
+ Match things based on string similarity (using the Pair Distance algorithm) and regular expressions.
4
+
5
+ = Quickstart
6
+
7
+ >> right_records = [ 'seamus', 'andy', 'ben' ]
8
+ => [...]
9
+ >> left_record = 'Shamus Heaney'
10
+ => [...]
11
+ >> d = LooseTightDictionary.new right_records
12
+ => [...]
13
+ >> puts d.left_to_right left_record
14
+ => 'seamus'
15
+
16
+ Try running the included example file:
17
+
18
+ $ ruby examples/first_name_matching.rb
19
+ Left side (input)
20
+ ====================
21
+ Mr. Seamus
22
+ Sr. Andy
23
+ Master BenT
24
+
25
+ Right side (output)
26
+ ====================
27
+ seamus
28
+ andy
29
+ ben
30
+
31
+ Results
32
+ ====================
33
+ Left record (input) Right record (output) Prefix used (if any) Score
34
+ Mr. Seamus seamus NULL 0.666666666666667
35
+ Sr. Andy andy NULL 0.5
36
+ Master BenT ben NULL 0.2
37
+
38
+ = Improving dictionaries
39
+
40
+ Similarity matching will only get you so far.
41
+
42
+ TODO: regex usage
4
43
 
5
44
  == Note on Patches/Pull Requests
6
45
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.5
1
+ 0.0.6
data/examples/first_name_matching.rb ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ # require 'loose_tight_dictionary'
4
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'loose_tight_dictionary.rb'))
5
+ right_side = [ 'seamus', 'andy', 'ben' ]
6
+ left_side = [ 'Mr. Seamus', 'Sr. Andy', 'Master BenT' ]
7
+
8
+ puts "Left side (input)"
9
+ puts "=" * 20
10
+ puts left_side
11
+ puts
12
+
13
+ puts "Right side (output)"
14
+ puts "=" * 20
15
+ puts right_side
16
+ puts
17
+
18
+ puts "Results"
19
+ puts "=" * 20
20
+ d = LooseTightDictionary.new right_side, :tee => STDOUT, :tee_format => :fixed_width
21
+ d.check left_side
22
+
23
+ puts d.left_to_right 'Shamus Heaney'
data/lib/loose_tight_dictionary.rb CHANGED
@@ -39,10 +39,11 @@ class LooseTightDictionary
39
39
  include Amatch
40
40
 
41
41
  attr_reader :right_records
42
- attr_reader :logger
43
- attr_reader :tee
44
42
  attr_reader :case_sensitive
45
-
43
+
44
+ attr_accessor :logger
45
+ attr_accessor :tee
46
+ attr_accessor :tee_format
46
47
  attr_accessor :positives
47
48
  attr_accessor :negatives
48
49
  attr_accessor :left_reader
@@ -59,6 +60,7 @@ class LooseTightDictionary
59
60
  @negatives = options[:negatives]
60
61
  @logger = options[:logger]
61
62
  @tee = options[:tee]
63
+ @tee_format = options[:tee_format] || :fixed_width
62
64
  @case_sensitive = options[:case_sensitive] || false
63
65
  end
64
66
 
@@ -106,15 +108,24 @@ class LooseTightDictionary
106
108
  end
107
109
 
108
110
  def check(left_records)
109
- unless positives.present? or negatives.present?
110
- logger.andand.info "You didn't define any positives or negatives, so running check doesn't do anything"
111
- return
111
+ header = [ 'Left record (input)', 'Right record (output)', 'Prefix used (if any)', 'Score' ]
112
+ case tee_format
113
+ when :csv
114
+ tee.andand.puts header.flatten.to_csv
115
+ when :fixed_width
116
+ tee.andand.puts header.map { |i| i.to_s.ljust(30) }.join
112
117
  end
118
+
113
119
  left_records.each do |left_record|
114
120
  begin
115
121
  right_record = left_to_right left_record
116
122
  ensure
117
- tee.andand.puts [ read_left(left_record), read_right($ltd_0), $ltd_1 ].flatten.to_csv
123
+ case tee_format
124
+ when :csv
125
+ tee.andand.puts $ltd_1.flatten.to_csv
126
+ when :fixed_width
127
+ tee.andand.puts $ltd_1.map { |i| i.to_s.ljust(30) }.join if $ltd_1
128
+ end
118
129
  end
119
130
  end
120
131
  end
@@ -292,12 +303,24 @@ class LooseTightDictionary
292
303
 
293
304
  def read_left(left_record)
294
305
  return if left_record.nil?
295
- left_reader ? left_reader.call(left_record) : left_record[0]
306
+ if left_reader
307
+ left_reader.call(left_record)
308
+ elsif left_record.is_a?(String)
309
+ left_record
310
+ else
311
+ left_record[0]
312
+ end
296
313
  end
297
314
 
298
315
  def read_right(right_record)
299
316
  return if right_record.nil?
300
- right_reader ? right_reader.call(right_record) : right_record[0]
317
+ if right_reader
318
+ right_reader.call(right_record)
319
+ elsif right_record.is_a?(String)
320
+ right_record
321
+ else
322
+ right_record[0]
323
+ end
301
324
  end
302
325
 
303
326
  # Thanks William James!
data/test/test_loose_tight_dictionary.rb CHANGED
@@ -68,6 +68,28 @@ class TestLooseTightDictionary < Test::Unit::TestCase
68
68
  end
69
69
 
70
70
  if ENV['OLD'] == 'true' or ENV['ALL'] == 'true'
71
+ # the example from the readme, considerably uglier here
72
+ should "check a simple table" do
73
+ @right = [ 'seamus', 'andy', 'ben' ]
74
+ @positives = [ [ 'seamus', 'Mr. Seamus Abshere' ] ]
75
+ left = [ 'Mr. Seamus Abshere', 'Sr. Andy Rossmeissl', 'Master BenT' ]
76
+
77
+ assert_nothing_raised do
78
+ ltd.check left
79
+ end
80
+ end
81
+
82
+ should "treat a String as a full record if passed through" do
83
+ dash = 'DHC8-400'
84
+ b747 = 'B747200/300'
85
+ dc9 = 'DC-9-10'
86
+ right_records = [ dash, b747, dc9 ]
87
+ simple_ltd = LooseTightDictionary.new right_records, :logger => $logger, :tee => $tee
88
+ assert_equal dash, simple_ltd.left_to_right('DeHavilland Dash-8 DHC-400')
89
+ assert_equal b747, simple_ltd.left_to_right('Boeing 747-300')
90
+ assert_equal dc9, simple_ltd.left_to_right('McDonnell Douglas MD81/DC-9')
91
+ end
92
+
71
93
  should "call it a mismatch if you hit a blank positive" do
72
94
  @positives.push [@a_left[0], '']
73
95
  assert_raises(LooseTightDictionary::Mismatch) do
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 5
9
- version: 0.0.5
8
+ - 6
9
+ version: 0.0.6
10
10
  platform: ruby
11
11
  authors:
12
12
  - Seamus Abshere
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-05-03 00:00:00 -04:00
17
+ date: 2010-05-13 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -115,6 +115,7 @@ files:
115
115
  - README.rdoc
116
116
  - Rakefile
117
117
  - VERSION
118
+ - examples/first_name_matching.rb
118
119
  - examples/icao-bts.rb
119
120
  - examples/icao-bts.xls
120
121
  - lib/loose_tight_dictionary.rb
@@ -153,4 +154,5 @@ summary: Allows iterative development of dictionaries for big data sets.
153
154
  test_files:
154
155
  - test/helper.rb
155
156
  - test/test_loose_tight_dictionary.rb
157
+ - examples/first_name_matching.rb
156
158
  - examples/icao-bts.rb