mericson-people 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Matthew Ericson
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,7 @@
1
+ = people
2
+
3
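+ people is a Ruby library that parses free-form personal names into title, first, middle, last and suffix components, and can optionally split couples such as "John and Mary Smith".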
4
+
5
+ == Copyright
6
+
7
+ Copyright (c) 2009 Matthew Ericson. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,56 @@
1
# Build, test, coverage and documentation tasks for the people gem.
require 'rubygems'
require 'rake'
# YAML is used below to read VERSION.yml; require it explicitly rather than
# relying on another library having loaded it.
require 'yaml'

# Gem packaging tasks (optional: only defined when Jeweler is installed).
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "people"
    gem.summary = %Q{Matts Name Parser}
    gem.email = "mericson@ericson.net"
    gem.homepage = "http://github.com/mericson/people"
    gem.authors = ["Matthew Ericson"]

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# Unit tests: every test/**/*_test.rb file, with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage (optional: falls back to a task that explains how to install RCov).
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

# API documentation. Newer toolchains ship the task as RDoc::Task in
# 'rdoc/task'; older Rake releases only provide Rake::RDocTask in
# 'rake/rdoctask'. Both define the same rdoc / clobber_rdoc / rerdoc tasks.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # The title carries the version from VERSION.yml when that file is present.
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "people #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
56
+
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ :minor: 1
3
+ :patch: 0
4
+ :major: 0
data/lib/people.rb ADDED
@@ -0,0 +1,492 @@
1
module People

  # Splits a free-form personal name into title, first, middle, last and
  # suffix components.
  #
  # Options (all optional, passed to +new+ as a Hash):
  #   :case_mode - 'proper' (default) re-cases every component with #proper,
  #                'upper' upcases them, anything else leaves them untouched.
  #   :couples   - when true, names joined by "and" / "&" are parsed as two
  #                people sharing a last name (default false).
  #   :strip_mr, :strip_mrs - accepted and stored, but not consulted by any
  #                method in this class.
  #
  #   parser = People::NameParser.new
  #   parser.parse( "Ericson, Matthew E" )
  #   # => { :first => "Matthew", :middle => "E", :last => "Ericson", ... }
  class NameParser

    # Surname particles that may precede the final word of a last name.
    # Kept in a single-quoted literal so that the escaped dot in "St\."
    # reaches the regexp engine intact.
    LAST_NAME_PREFIXES = 'Mc|Mac|Des|Dell[ae]|Del|De La|De Los|Da|Di|Du|La|Le|Lo|St\.|Den|Von|Van|Von Der|Van De[nr]'

    # Result keys whose values are re-cased according to :case_mode.
    CASE_PARTS = [ :title, :first, :middle, :last, :suffix,
                   :first2, :middle2, :title2, :suffix2 ]

    # opts - Hash of options (see the class documentation).
    def initialize( opts={} )

      # Characters allowed inside a single name word (regexp class body).
      @name_chars = "A-Za-z0-9\\-\\'"
      @nc = @name_chars

      @opts = {
        :strip_mr   => true,
        :strip_mrs  => false,
        :case_mode  => 'proper',
        :couples    => false
      }.merge( opts )

      ## constants

      # Regexp fragments, tried in order; each ends with the separating space
      # so that it only matches a whole leading word.
      @titles = [ 'Mr\.? and Mrs\.? ',
                  'Mrs\.? ',
                  'M/s\.? ',
                  'Ms\.? ',
                  'Miss\.? ',
                  'Mme\.? ',
                  'Mr\.? ',
                  'Messrs ',
                  'Mister ',
                  'Mast(\.|er)? ',
                  'Ms?gr\.? ',
                  'Sir ',
                  'Lord ',
                  'Lady ',
                  'Madam(e)? ',
                  'Dame ',

                  # Medical
                  'Dr\.? ',
                  'Doctor ',
                  'Sister ',
                  'Matron ',

                  # Legal
                  'Judge ',
                  'Justice ',

                  # Police
                  'Det\.? ',
                  'Insp\.? ',

                  # Military
                  'Brig(adier)? ',
                  'Capt(\.|ain)? ',
                  'Commander ',
                  'Commodore ',
                  'Cdr\.? ',
                  'Colonel ',
                  'Gen(\.|eral)? ',
                  'Field Marshall ',
                  'Fl\.? Off\.? ',
                  'Flight Officer ',
                  'Flt Lt ',
                  'Flight Lieutenant ',
                  'Pte\.? ',
                  'Private ',
                  'Sgt\.? ',
                  'Sargent ',
                  'Air Commander ',
                  'Air Commodore ',
                  'Air Marshall ',
                  'Lieutenant Colonel ',
                  'Lt\.? Col\.? ',
                  'Lt\.? Gen\.? ',
                  'Lt\.? Cdr\.? ',
                  'Lieutenant ',
                  '(Lt|Leut|Lieut)\.? ',
                  'Major General ',
                  'Maj\.? Gen\.? ',
                  'Major ',
                  'Maj\.? ',

                  # Religious
                  'Rabbi ',
                  'Brother ',
                  'Father ',
                  'Chaplain ',
                  'Pastor ',
                  'Bishop ',
                  'Mother Superior ',
                  'Mother ',
                  'Most Rever[ea]nd ',
                  'Very Rever[ea]nd ',
                  'Mt\.? Revd\.? ',
                  'V\.? Revd?\.? ',
                  'Rever[ea]nd ',
                  'Revd?\.? ',

                  # Other
                  'Prof(\.|essor)? ',
                  'Ald(\.|erman)? '
                ]

      # Regexp fragments, tried in order; compound suffixes come first so
      # that "Jr. Esq." is not split after "Jr.".
      @suffixes = [
                   'Jn?r\.?,? Esq\.?',
                   'Sn?r\.?,? Esq\.?',
                   'I{1,3},? Esq\.?',

                   'Jn?r\.?,? M\.?D\.?',
                   'Sn?r\.?,? M\.?D\.?',
                   'I{1,3},? M\.?D\.?',

                   'Sn?r\.?',              # Senior
                   'Jn?r\.?',              # Junior

                   'Esq(\.|uire)?',
                   'Esquire\.?',
                   'Attorney at Law\.?',
                   'Attorney-at-Law\.?',

                   'Ph\.?d\.?',
                   'C\.?P\.?A\.?',

                   'XI{1,3}',              # 11th, 12th, 13th
                   'X',                    # 10th
                   'IV',                   # 4th
                   'VI{1,3}',              # 6th, 7th, 8th
                   'V',                    # 5th
                   'IX',                   # 9th
                   'I{1,3}\.?',            # 1st, 2nd, 3rd
                   'M\.?D\.?',             # M.D.
                   'D\.?M\.?D\.?'          # D.M.D.
                  ]

      # Last name: either everything after the ";" marker that #parse inserts
      # when flipping "Last, First", or an optional particle plus one word
      # (@last_name_p) / one or more words (@mult_name_p).
      @last_name_p = "((;.+)|(((#{LAST_NAME_PREFIXES}) )?([#{@nc}]+)))"
      @mult_name_p = "((;.+)|(((#{LAST_NAME_PREFIXES}) )?([#{@nc} ]+)))"

      # Compile every pattern once instead of on each call.
      @title_patterns = @titles.map do |t|
        Regexp.new( "^(#{t})(.+)", Regexp::IGNORECASE )
      end
      @suffix_patterns = @suffixes.map do |s|
        Regexp.new( "(.+) (#{s})$", Regexp::IGNORECASE )
      end
      @trailing_suffix_patterns = @suffixes.map do |s|
        Regexp.new( "(.+), (#{s})$", Regexp::IGNORECASE )
      end
      @full_name_rules  = build_name_rules( @last_name_p, @mult_name_p )
      @first_only_rules = build_name_rules( '', '' )

      # Counters: names passed to #parse, and names successfully parsed.
      @seen = 0
      @parsed = 0

    end

    # Parses +name+ and returns a Hash with the keys :orig, :clean, :title,
    # :first, :middle, :last, :suffix, :parsed, :parse_type, :multiple and,
    # for couples, :title2, :first2, :middle2, :suffix2, :parsed2 and
    # :parse_type2. Components that were not found are empty strings and
    # :parsed is false when no name pattern matched.
    #
    # The argument is never modified; nil is treated as an empty name.
    def parse( name )

      @seen += 1

      original = name.to_s
      out = blank_result
      out[:orig] = original.dup

      # Work on a copy: #clean, #get_title and #get_suffix edit in place.
      name = clean( original.dup )

      # "Smith, Jr." -> "Smith Jr." so the comma is not taken for a
      # "Last, First" separator below.
      @trailing_suffix_patterns.each do |sfx_p|
        name.gsub!( sfx_p, '\1 \2' )
      end

      name.gsub!( /Mr\.? \& Mrs\.?/i, 'Mr. and Mrs.' )

      # Flip last and first if the name contains a comma; ";" marks the
      # start of the last name for the name patterns.
      name.gsub!( /;/, '' )
      name.gsub!( /(.+),(.+)/, '\2 ;\1' )

      name.gsub!( /,/, '' )
      name.strip!

      name.gsub!( / +and +/i, ' & ' ) if @opts[:couples]

      if @opts[:couples] && name.match( /\&/ )

        if parse_couple( name, out )
          out[:multiple] = true
        else
          # Either half failed: report the whole name as unparsed.
          out = blank_result
          out[:orig] = original.dup
        end

      else

        out[:title]  = get_title( name )
        out[:suffix] = get_suffix( name )

        parts = get_name_parts( name )

        out[:parsed]     = parts[0]
        out[:parse_type] = parts[1]
        out[:first]      = parts[2]
        out[:middle]     = parts[3]
        out[:last]       = parts[4]

      end

      case @opts[:case_mode]
      when 'proper'
        CASE_PARTS.each { |part| out[part] = proper( out[part] ) }
      when 'upper'
        CASE_PARTS.each { |part| out[part] = out[part].upcase }
      end

      @parsed += 1 if out[:parsed]

      # For a single person this is the name with title and suffix removed
      # (they were cut out in place above); for couples it is the whole
      # normalised string.
      out[:clean] = name

      out

    end

    # Normalises +s+ IN PLACE and returns it: whitespace runs become a single
    # space, characters outside letters, digits and - ' . & / , are dropped,
    # and leading/trailing spaces are removed.
    def clean( s )

      # Turn tabs/newlines into spaces first; otherwise the character filter
      # below would delete them and glue neighbouring words together.
      s.gsub!( /\s+/, ' ' )
      # remove illegal characters
      s.gsub!( /[^A-Za-z0-9\-\'\.&\/ \,]/, '' )
      # remove repeating spaces (removing characters can leave two adjacent)
      s.gsub!( / +/, ' ' )
      s.strip!
      s

    end

    # Removes a leading title from +name+ IN PLACE and returns it, or returns
    # "" (leaving +name+ alone) when no title matches. The returned string is
    # exactly what the pattern matched, including its trailing space.
    def get_title( name )

      @title_patterns.each do |title_p|
        m = title_p.match( name )
        next unless m

        title = m[1]
        # m[-1] is the trailing (.+) group whatever groups the title has.
        name.replace( m[-1].strip )
        return title
      end

      ''
    end

    # Removes a trailing suffix from +name+ IN PLACE and returns it, or
    # returns "" (leaving +name+ alone) when no suffix matches.
    def get_suffix( name )

      @suffix_patterns.each do |sfx_p|
        m = sfx_p.match( name )
        next unless m

        suffix = m[2]
        name.replace( m[1].strip )
        return suffix
      end

      ''
    end

    # Matches +name+ (title and suffix already removed) against the known
    # name shapes and returns [ parsed, parse_type, first, middle, last ].
    # parse_type is the 1-11 number of the matching shape; when nothing
    # matches the result is [ false, "", "", "", "" ].
    #
    # With +no_last_name+ true the last-name part of every shape is empty,
    # so +name+ must end in a space (used for the first half of a couple).
    def get_name_parts( name, no_last_name = false )

      rules = no_last_name ? @first_only_rules : @full_name_rules

      rules.each do |parse_type, pattern, middle_groups, last_group|
        m = pattern.match( name )
        next unless m

        middle = middle_groups.map { |g| m[g] }.join( ' ' )
        last   = m[last_group].gsub( /;/, '' )
        return [ true, parse_type, m[1], middle, last ]
      end

      [ false, '', '', '', '' ]

    end

    # Returns a re-cased copy of +name+: each word capitalised, words made
    # only of the letters i, x and v upcased as roman numerals (so "iii"
    # becomes "III", and a name such as "Vi" becomes "VI"), and Mac/Mc
    # surnames given an inner capital.
    def proper( name )

      fixed = name.downcase

      # Now uppercase first letter of every word. By checking on word boundaries,
      # we will account for apostrophes (D'Angelo) and hyphenated names
      fixed.gsub!( /\b(\w+)/ ) do |word|
        word =~ /\A[ixv]+\z/i ? word.upcase : word.capitalize
      end

      # Name case Macs and Mcs
      # Exclude names with 1-2 letters after prefix like Mack, Macky, Mace
      # Exclude names ending in a,c,i,o,z or j, typically Polish or Italian

      if fixed.match( /\bMac[a-z]{2,}[^acijoz]\b/i )

        fixed.gsub!( /\b(Mac)([a-z]+)/i ) do
          Regexp.last_match( 1 ) + Regexp.last_match( 2 ).capitalize
        end

        # Now correct for "Mac" exceptions
        fixed.gsub!( /MacHin/i,  'Machin' )
        fixed.gsub!( /MacHlin/i, 'Machlin' )
        fixed.gsub!( /MacHar/i,  'Machar' )
        fixed.gsub!( /MacKle/i,  'Mackle' )
        fixed.gsub!( /MacKlin/i, 'Macklin' )
        fixed.gsub!( /MacKie/i,  'Mackie' )

        # Portuguese
        fixed.gsub!( /MacHado/i, 'Machado' )

        # Lithuanian
        fixed.gsub!( /MacEvicius/i, 'Macevicius' )
        fixed.gsub!( /MacIulis/i,   'Maciulis' )
        fixed.gsub!( /MacIas/i,     'Macias' )

      elsif fixed.match( /\bMc/i )
        fixed.gsub!( /\b(Mc)([a-z]+)/i ) do
          Regexp.last_match( 1 ) + Regexp.last_match( 2 ).capitalize
        end

      end

      # Exceptions (only 'Mac' name ending in 'o' ?)
      fixed.gsub!( /Macmurdo/i, 'MacMurdo' )

      fixed

    end

    private

    # A result Hash with every documented key set to its "not found" value.
    def blank_result
      {
        :title       => '',
        :first       => '',
        :middle      => '',
        :last        => '',
        :suffix      => '',

        :title2      => '',
        :first2      => '',
        :middle2     => '',
        :suffix2     => '',

        :clean       => '',

        :parsed      => false,
        :parse_type  => '',

        :parsed2     => false,
        :parse_type2 => '',

        :multiple    => false
      }
    end

    # Fills +out+ from a "A & B Last" string. The second person carries the
    # shared last name; the first is matched without one. Returns true only
    # when both halves parsed.
    def parse_couple( name, out )
      a, b = name.split( / *& */ )
      # "John &" or a lone "&" leaves one side missing.
      return false if a.nil? || b.nil?

      out[:title2]  = get_title( b )
      out[:suffix2] = get_suffix( b )

      parts = get_name_parts( b.strip )

      out[:parsed2]     = parts[0]
      out[:parse_type2] = parts[1]
      out[:first2]      = parts[2]
      out[:middle2]     = parts[3]
      out[:last]        = parts[4]

      out[:title]  = get_title( a )
      out[:suffix] = get_suffix( a )

      # The first-name-only patterns expect the space that would normally
      # separate the last name.
      parts = get_name_parts( a.strip + ' ', true )

      out[:parsed]     = parts[0]
      out[:parse_type] = parts[1]
      out[:first]      = parts[2]
      out[:middle]     = parts[3]

      out[:parsed] && out[:parsed2]
    end

    # Builds the ordered rule table used by #get_name_parts. Each rule is
    # [ parse_type, pattern, middle capture numbers, last-name capture
    # number ]; the first name is always capture 1.
    def build_name_rules( last_p, mult_p )
      initial = '([A-Za-z])'
      word    = "([#{@nc}]+)"

      [
        # M ERICSON
        [ 1,  /^#{initial}\.? (#{last_p})$/i,                           [],       2 ],
        # M E ERICSON
        [ 2,  /^#{initial}\.? #{initial}\.? (#{last_p})$/i,             [ 2 ],    3 ],
        # M.E. ERICSON
        [ 3,  /^#{initial}\.#{initial}\. (#{last_p})$/i,                [ 2 ],    3 ],
        # M E E ERICSON
        [ 4,  /^#{initial}\.? #{initial}\.? #{initial}\.? (#{last_p})$/i, [ 2, 3 ], 4 ],
        # M EDWARD ERICSON
        [ 5,  /^#{initial}\.? #{word} (#{last_p})$/i,                   [ 2 ],    3 ],
        # MATTHEW E ERICSON
        [ 6,  /^#{word} #{initial}\.? (#{last_p})$/i,                   [ 2 ],    3 ],
        # MATTHEW E E ERICSON
        [ 7,  /^#{word} #{initial}\.? #{initial}\.? (#{last_p})$/i,     [ 2, 3 ], 4 ],
        # MATTHEW E.E. ERICSON
        [ 8,  /^#{word} ([A-Za-z]\.[A-Za-z]\.) (#{last_p})$/i,          [ 2 ],    3 ],
        # MATTHEW ERICSON
        [ 9,  /^#{word} (#{last_p})$/i,                                 [],       2 ],
        # MATTHEW EDWARD ERICSON
        [ 10, /^#{word} #{word} (#{last_p})$/i,                         [ 2 ],    3 ],
        # MATTHEW E. SHEIE ERICSON (multi-word last name)
        [ 11, /^#{word} #{initial}\.? (#{mult_p})$/i,                   [ 2 ],    3 ]
      ]
    end

  end


end
492
+
@@ -0,0 +1,7 @@
1
require 'test_helper'

# Basic behaviour of People::NameParser with its default options
# (proper-case output, single person per name).
class PeopleTest < Test::Unit::TestCase

  context "A NameParser with default options" do
    setup do
      @parser = People::NameParser.new
    end

    should "split a first and last name" do
      result = @parser.parse( "Matthew Ericson" )

      assert result[:parsed]
      assert_equal "Matthew", result[:first]
      assert_equal "",        result[:middle]
      assert_equal "Ericson", result[:last]
    end

    should "pick up a middle initial" do
      result = @parser.parse( "Matthew E Ericson" )

      assert result[:parsed]
      assert_equal "Matthew", result[:first]
      assert_equal "E",       result[:middle]
      assert_equal "Ericson", result[:last]
    end

    should "understand the Last, First form" do
      result = @parser.parse( "Ericson, Matthew" )

      assert result[:parsed]
      assert_equal "Matthew", result[:first]
      assert_equal "Ericson", result[:last]
    end

    should "proper-case an upper-case Mc surname" do
      result = @parser.parse( "MATTHEW MCDONALD" )

      assert_equal "Matthew",  result[:first]
      assert_equal "McDonald", result[:last]
    end
  end

end
@@ -0,0 +1,10 @@
1
# Shared setup for the test suite: loads the test libraries, puts the gem's
# lib directory and this directory on the load path, then loads the gem.
require 'rubygems'
require 'test/unit'
require 'shoulda'

test_dir = File.dirname(__FILE__)

# Each unshift prepends, so the test directory ends up ahead of lib.
[ File.join(test_dir, '..', 'lib'), test_dir ].each do |dir|
  $LOAD_PATH.unshift(dir)
end

require 'people'

# Reopened so that suite-wide helpers have a place to live.
class Test::Unit::TestCase
end
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mericson-people
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthew Ericson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-14 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: mericson@ericson.net
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.rdoc
25
+ files:
26
+ - LICENSE
27
+ - README.rdoc
28
+ - Rakefile
29
+ - VERSION.yml
30
+ - lib/people.rb
31
+ - test/people_test.rb
32
+ - test/test_helper.rb
33
+ has_rdoc: true
34
+ homepage: http://github.com/mericson/people
35
+ post_install_message:
36
+ rdoc_options:
37
+ - --charset=UTF-8
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
52
+ requirements: []
53
+
54
+ rubyforge_project:
55
+ rubygems_version: 1.2.0
56
+ signing_key:
57
+ specification_version: 2
58
+ summary: Matts Name Parser
59
+ test_files:
60
+ - test/people_test.rb
61
+ - test/test_helper.rb