dwc_agent 1.4.8 → 1.5.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a14318e8eacabb639a9df951afd01ed96f97ea3a1f9a47149633d7ba111ee16a
4
- data.tar.gz: c2f6d20c4fa91489ce76fcf2569d228eb8912f31b904008b5b90388e14857aaa
3
+ metadata.gz: 87cfd6b6ab9ee83156f9503e48669691d486f143f64a623f93a40f064d2dc7f7
4
+ data.tar.gz: 7f40287aa6bf7e7b90408d72594fd9ab5498401c5ee402da6f2a78163f011a9a
5
5
  SHA512:
6
- metadata.gz: 6def3a412971675055d57b094a00c15c649f6f108ec69c22463dd537d7f90f710148a5eeb8dca57d7036b8598892fdae12eb07cb5107770bbbeb1f23a8f84e52
7
- data.tar.gz: d5daf2a9bbcd1a52d4a9a16ccfe8ad962fd77906189355651d7130f1c1830d4b65d616e41a29f13e45c5bd7db8d7917a1ae485b5ec3b3f3f004896f321779fba
6
+ metadata.gz: 4cb91ffc7530bd1a93fbbc70c716ad96d49cab139bf95507e3a688566ce997cb71510b13013bb07cb52dffe2ce7419d294392028eedd28f1caba8b032eb98c9a
7
+ data.tar.gz: ae226c63106a2b41899b6d4c85c92aa372d8d1cbce8a90023cf9b8d1a4c153a28bd33a9eb8064c492fa79d5d4229ec8979d5ca4f751d97fc1ec09132b0e46ca3
@@ -16,7 +16,7 @@ module DwcAgent
16
16
  # @param parsed_namae [Object] the namae object
17
17
  # @return [Hash] the given, family hash
18
18
  def clean(parsed_namae)
19
- blank_name = { given: nil, family: nil, particle: nil }
19
+ blank_name = { title: nil, appellation: nil, given: nil, particle: nil, family: nil, suffix: nil }
20
20
 
21
21
  if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
22
22
  return blank_name
@@ -84,6 +84,9 @@ module DwcAgent
84
84
  family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
85
85
  given = parsed_namae.given.strip rescue nil
86
86
  particle = parsed_namae.particle.strip rescue nil
87
+ appellation = parsed_namae.appellation.strip rescue nil
88
+ suffix = parsed_names.suffix.strip rescue nil
89
+ title = parsed_names.title.strip rescue nil
87
90
 
88
91
  if !given.nil? && given.match(/[A-Z]\.[A-Za-z]{2,}/)
89
92
  given = given.gsub(".", ". ").strip
@@ -123,7 +126,7 @@ module DwcAgent
123
126
  return blank_name
124
127
  end
125
128
 
126
- { given: given, family: family, particle: particle }
129
+ { title: nil, appellation: nil, given: given, particle: particle, family: family, suffix: nil }
127
130
  end
128
131
 
129
132
  end
@@ -14,7 +14,7 @@ module DwcAgent
14
14
  \b[,;]?\s*(?i:unkn?own)\b|
15
15
  \b[,;]?\s*(?i:n/a)\b|
16
16
  \b[,;]?\s*(?i:ann?onymous)\b|
17
- \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
17
+ \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?)\)?\b|
18
18
  \b[,;]?\s*(?i:importer|gift)\:?\b|
19
19
  \b[,;]?\s*(?i:string)\b|
20
20
  \b[,;]?\s*(?i:person\s*string)\b|
@@ -49,11 +49,13 @@ module DwcAgent
49
49
  \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
50
50
  \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
51
51
  \b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
52
+ \b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|China|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|France|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Georgia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Israel|Italy|Jamaica|Japan|Jersey|Jordan|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Montserrat|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New 
Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Portugal|Puerto Rico|Qatar|Réunion|Romania|Russian Federation|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)|
52
53
  (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
53
54
  \b\s*(?i:maybe)\s*\b|
54
55
  \b\s*(?i:prob)\.\s*\b|
55
56
  \(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
56
57
  \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
58
+ (?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
57
59
  (?i:fide)\:?\s*\b|
58
60
  (?i:game\s+dept)\.?\s*\b|
59
61
  (?i:see\s+notes?\s*(inside)?)|
@@ -137,7 +139,6 @@ module DwcAgent
137
139
  '|' => ' | ',
138
140
  'ǀ' => ' | ',
139
141
  '∣' => ' | ',
140
- '|' => ' | ',
141
142
  '│' => ' | ',
142
143
  '(' => ' ',
143
144
  ')' => ' ',
@@ -158,12 +159,12 @@ module DwcAgent
158
159
  }
159
160
 
160
161
  PHRASE_SUBS = {
161
- 'dr\.' => 'Dr. ',
162
- 'mr\.' => 'Mr. ',
163
- 'mrs\.' => 'Mrs. ',
164
162
  'prof\.' => 'Prof. ',
165
163
  '\, ph\.d\.' => ' Ph.D.',
166
- '\, bro\.' => ' Bro.'
164
+ '\, bro\.' => ' Bro.',
165
+ ' jr\.\,' => ' Jr.;',
166
+ ' jr\,' => ' Jr.;',
167
+ '\-jr' => ' Jr.'
167
168
  }
168
169
 
169
170
  COMPLEX_SEPARATORS = %r{
@@ -202,7 +203,7 @@ module DwcAgent
202
203
  (?i:geographic)|
203
204
  (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
204
205
  (?i:univ\.)|
205
- (?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
206
+ (?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker|gamekeeper)|
206
207
  (?i:non\s+pr(é|e)cis(é|e))|
207
208
  (?i:no\s+consta)|
208
209
  (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
@@ -267,6 +268,10 @@ module DwcAgent
267
268
  "has not"
268
269
  ]
269
270
 
270
- TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
271
+ TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
272
+
273
+ APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
274
+
275
+ SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/
271
276
 
272
277
  end
@@ -11,14 +11,16 @@ module DwcAgent
11
11
  options = {
12
12
  prefer_comma_as_separator: true,
13
13
  separator: SPLIT_BY,
14
- title: TITLE
14
+ title: TITLE,
15
+ appellation: APPELLATION,
16
+ suffix: SUFFIX
15
17
  }
16
18
  @namae = Namae::Parser.new(options)
17
19
 
18
20
  @strip_out_regex = Regexp.new STRIP_OUT.to_s
19
21
  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
20
22
  @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
21
- @phrase_subs_regex = Regexp.new((PHRASE_SUBS.keys.join('|')).to_s, Regexp::IGNORECASE)
23
+ @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.join('|').to_s, Regexp::IGNORECASE
22
24
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
23
25
  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
24
26
  end
@@ -2,8 +2,8 @@ module DwcAgent
2
2
  class Version
3
3
 
4
4
  MAJOR = 1
5
- MINOR = 4
6
- PATCH = 8
5
+ MINOR = 5
6
+ PATCH = 0
7
7
  BUILD = nil
8
8
 
9
9
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.8
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-05-19 00:00:00.000000000 Z
11
+ date: 2020-08-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae
@@ -98,7 +98,7 @@ files:
98
98
  - lib/dwc_agent/similarity.rb
99
99
  - lib/dwc_agent/utility.rb
100
100
  - lib/dwc_agent/version.rb
101
- homepage: https://github.com/dshorthouse/dwc_agent
101
+ homepage: https://github.com/bionomia/dwc_agent
102
102
  licenses:
103
103
  - MIT
104
104
  metadata: {}