dwc_agent 1.5.0.2 → 1.5.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1da23de86c03d0926470ba28781b793d5dc4af411d0e99fda7388168d3ee33af
4
- data.tar.gz: 5a5feafd10724e9d9745da67edd9a177c62a4bc942e18aeec75c2357b559718b
3
+ metadata.gz: a63a84b3c095994d4b5053ddb389cfc1e7e0375f51b6cbb2668742bc4381a0da
4
+ data.tar.gz: 4f4cc7668f64196458bc126e8dc81753afd92ee77ce729f241f96fa5df39315c
5
5
  SHA512:
6
- metadata.gz: bf2f99ecf700de1c93db82f210083ddff4fc4b58c6185b387f6197444e3a7a84e088ae6b2dd5cb696a3f534b32bbec1ac11c73a2a15205655edaf6fe5ef2db44
7
- data.tar.gz: 65ece7814dd700b63a59762967487919a7f570f27a2aca717f3cabe736200b90a6c1bc060f285f642826643d467ef337e18ad99fb4b3f3f3a5ee7fb4ce232ef8
6
+ metadata.gz: da8f649cbc9d4ddf82b66e044e0f9e6a6354d5f613ecc2bf1e5cf900a37868178e3056f9c84311050738ec3b3dd7b7466df2ea25b5cebd3595821c35f634eb29
7
+ data.tar.gz: e52cec918b39c1aceb010e55ba0d5feaa7891234368fbdb56abd48c36c0945a5de6db5e89ee1371ee8240c3030caf4bae268257b0a780036df54f9521c3a6f1e
@@ -18,10 +18,6 @@ module DwcAgent
18
18
  def clean(parsed_namae)
19
19
  blank_name = { title: nil, appellation: nil, given: nil, particle: nil, family: nil, suffix: nil }
20
20
 
21
- if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
22
- return blank_name
23
- end
24
-
25
21
  if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
26
22
  return blank_name
27
23
  end
@@ -30,7 +26,7 @@ module DwcAgent
30
26
  return blank_name
31
27
  end
32
28
 
33
- if parsed_namae.given && parsed_namae.given.length > 25
29
+ if parsed_namae.given && parsed_namae.given.length > 35
34
30
  return blank_name
35
31
  end
36
32
 
@@ -79,6 +75,10 @@ module DwcAgent
79
75
  parsed_namae.given = NameCase(parsed_namae.given)
80
76
  end
81
77
 
78
+ if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
79
+ return blank_name
80
+ end
81
+
82
82
  parsed_namae.normalize_initials
83
83
 
84
84
  family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
@@ -3,18 +3,20 @@ module DwcAgent
3
3
  ^[\[{(]|
4
4
  [\]})]\??$|
5
5
  (?i:acc\s?\#)|
6
+ [,;]?\s*(?i:1st|2nd|3rd|[4-9]th)|
6
7
  \s*?\d+\.\d+|
7
8
  \b\d+\(?(?i:[[:alpha:]])\)?\b|
8
9
  \b[,;]?\s*(?i:et\.?\s+al|&\s+al)\.?|
10
+ \b[,;]?\s*(?i:etal)\.?|
9
11
  \b\s+(bis|ter)(\b|\z)|
10
12
  \bu\.\s*a\.|
11
- \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
13
+ \b[,;]?\s*(?i:and|&)?\s*(?i:others|party)\s*\b|
12
14
  \b[,;]?\s*(?i:etc)\.?|
13
15
  \b[,;]?\s*(?i:on)\b|
14
16
  \b[,;]?\s*(?i:unkn?own)\b|
15
17
  \b[,;]?\s*(?i:n/a)\b|
16
18
  \b[,;]?\s*(?i:ann?onymous)\b|
17
- \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?)\)?\b|
19
+ \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?|presumably)\)?\b|
18
20
  \b[,;]?\s*(?i:importer|gift)\:?\b|
19
21
  \b[,;]?\s*(?i:string)\b|
20
22
  \b[,;]?\s*(?i:person\s*string)\b|
@@ -53,29 +55,32 @@ module DwcAgent
53
55
  (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
54
56
  \b\s*(?i:maybe)\s*\b|
55
57
  \b\s*(?i:prob)\.\s*\b|
56
- \(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
58
+ \(?[,]?\s*?(?i:(local)?\s?collectors?|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
57
59
  \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
58
60
  (?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
59
61
  (?i:fide)\:?\s*\b|
62
+ (?i:first\s+name\s+unknown)|
60
63
  (?i:game\s+dept)\.?\s*\b|
61
64
  (?i:see\s+notes?\s*(inside)?)|
62
65
  (?i:see\s+letter\s+enclosed)|
63
66
  (?i:(by)?\s+correspondance)|
64
- (?i:pers\.?\s+comm\.?)|
67
+ (?i:pers\.?\s*comm\.?)|
65
68
  (?i:crossed\s+out)|
66
69
  \(?(?i:source)\(?|
67
70
  (?i:according\s+to)|
68
71
  (?i:lanuv)\d+|
72
+ \b\s*name\b|
73
+ \b\s*lost\b|
69
74
  (?i:nswobs)|
70
75
  ORCID|
71
76
  MRI(\s|-)PAS|
72
77
  urn\:qm\.qld\.gov\.au\:collector|
73
78
  (?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
74
- (?i:Field\s+Museum\s+of\s+Natural\s+History)|
75
- (?i:American\s+Museum\s+of\s+Natural\s+History)|
79
+ (?i:field\s+museum\s+of\s+natural\s+history)|
80
+ (?i:american\s+museum\s+of\s+natural\s+history)|
76
81
  (?i:The\s+Paleontological\s+Research\s+Institution)|
77
82
  (?i:museums?\s+victoria)|
78
- \b\s*(?i:United\s+States|Russia)\s*\b|
83
+ \b\s*(?i:united\s+states|russia)\s*\b|
79
84
  (?i:revised|photograph|fruits\s+only)|
80
85
  -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
81
86
  -?\s*(?i:synonym(y|ie))|
@@ -83,11 +88,14 @@ module DwcAgent
83
88
  \b(?i:to\s+(sub)?spp?)\.?|
84
89
  (?i:nom\.?\s+rev\.?)|
85
90
  FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|ACAD|USGS|NAWQA|
91
+ \b,?\s*(?i:para|topo|syn)?(?i:type)|
86
92
  AFSC\/POLISH\s+SORTING\s+CTR\.?|
87
93
  (?i:university|museum|exhibits?)|
88
94
  (?i:uqam)|
89
95
  (?i:sem\s+(colec?tor|data))|
90
96
  (?i:no\s+coll\.?(ector)?)|
97
+ (?i:not?)\s+(?i:name|date|details?|specific)?\s*?(?i:given|name|date|noted)|
98
+ (?i:non?)\s+(?i:specificato)|
91
99
  \b[,;]\s+\d+\z|
92
100
  ["!@?]|
93
101
  [,]?\d+|
@@ -113,19 +121,22 @@ module DwcAgent
113
121
  [–|ǀ∣|│&+\/;:]|
114
122
  \s+-\s+|
115
123
  \s+a\.\s+|
116
- \b(e|y|i|en|et|or|per|for)\s*\b|
124
+ \b(con|e|y|i|en|et|or|per|for)\s*\b|
117
125
  \b(?i:and|with)\s*\b|
118
126
  \b(?i:annotated(\s+by)?)\s*\b|
119
127
  \b(?i:coll\.)\s*\b|
120
128
  \b(?i:communicate?d(\s+to)?)\s*\b|
121
129
  \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
130
+ \b(?i:confirmada)(\s+por)?\s*\b|
122
131
  \b(?i:checked?(\s+by)?)\s*\b|
123
132
  \b(?i:det\.?(\s+by)?)\s*\b|
124
133
  \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
125
134
  \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
126
135
  \b(?i:in?dentified(\s+by)?)\s*\b|
127
136
  \b(?i:in\s+part(\s+by)?)\s*\b|
137
+ \b(?i:och)\s*\b|
128
138
  \b(?i:prep\.?\s+(?i:by)?)\s*\b|
139
+ \b(?i:purchased?)(\s+by)?\s*\b|
129
140
  \b(?i:redet\.?(\s+by?)?)\s*\b|
130
141
  \b(?i:reidentified(\s+by)?)\s*\b|
131
142
  \b(?i:stet)\s*\b|
@@ -155,16 +166,24 @@ module DwcAgent
155
166
  '}' => '',
156
167
  '@' => '',
157
168
  '%' => '',
158
- '\\' => ''
169
+ '\\' => '',
170
+ '´' => '\'',
171
+ '+' => ' | '
159
172
  }
160
173
 
161
174
  PHRASE_SUBS = {
162
- 'prof\.' => 'Prof. ',
163
- '\, ph\.d\.' => ' Ph.D.',
164
- '\, bro\.' => ' Bro.',
165
- ' jr\.\,' => ' Jr.;',
166
- ' jr\,' => ' Jr.;',
167
- '\-jr' => ' Jr.'
175
+ ', ph.d.' => ' Ph.D.',
176
+ ', Ph.D.' => ' Ph.D.',
177
+ ', bro.' => ' Bro.',
178
+ ', Jr.,' => ' Jr.;',
179
+ ', Jr.' => ' Jr.',
180
+ ',Jr.' => ' Jr.',
181
+ ', Sr.' => ' Sr.',
182
+ ',Sr.' => ' Sr.',
183
+ ' jr.,' => ' Jr.;',
184
+ ' jr,' => ' Jr.;',
185
+ '-jr' => ' Jr.',
186
+ '-Jr' => ' Jr.'
168
187
  }
169
188
 
170
189
  COMPLEX_SEPARATORS = %r{
@@ -229,7 +248,7 @@ module DwcAgent
229
248
  (?i:though)|
230
249
  (?i:texas\s+instruments?)\s*?(for)?|
231
250
  (?:tropical)|
232
- (?i:toward|seen at)|
251
+ (?i:toward|seen\s+at)|
233
252
  (?i:unidentified|unspecified|unk?nown?|unnamed|unread|unmistak|no agent)|
234
253
  (?i:urn\:)|
235
254
  (?i:usda|ucla)|
@@ -238,29 +257,47 @@ module DwcAgent
238
257
  }x
239
258
 
240
259
  FAMILY_BLACKLIST = [
260
+ "ap",
241
261
  "da",
262
+ "de",
242
263
  "de'",
243
264
  "del",
244
265
  "der",
266
+ "di",
267
+ "do",
268
+ "dos",
245
269
  "du",
246
270
  "el",
271
+ "le",
272
+ "la",
247
273
  "van",
248
274
  "von",
249
275
  "the",
250
276
  "of",
251
277
  "adjustment",
278
+ "annotator",
252
279
  "available",
253
280
  "arachnology",
254
281
  "catalogue",
255
282
  "curators",
256
283
  "data",
284
+ "details",
285
+ "determiner",
257
286
  "determination",
258
287
  "dissection",
259
288
  "entered",
289
+ "erased",
290
+ "indecipherable",
260
291
  "nomenclatural",
261
292
  "orig",
262
293
  "registration",
263
- "science"
294
+ "science",
295
+ "wg",
296
+ "wm",
297
+ "wn",
298
+ "zw",
299
+ "zz",
300
+ "z-"
264
301
  ]
265
302
 
266
303
  GIVEN_BLACKLIST = [
@@ -268,7 +305,7 @@ module DwcAgent
268
305
  "has not"
269
306
  ]
270
307
 
271
- TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
308
+ TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|major|capt|cmdr|lt|sgt|cpl|pvt|proff?|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
272
309
 
273
310
  APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
274
311
 
@@ -20,7 +20,7 @@ module DwcAgent
20
20
  @strip_out_regex = Regexp.new STRIP_OUT.to_s
21
21
  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
22
22
  @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
23
- @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.join('|').to_s, Regexp::IGNORECASE
23
+ @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
24
24
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
25
25
  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
26
26
  end
@@ -4,7 +4,7 @@ module DwcAgent
4
4
  MAJOR = 1
5
5
  MINOR = 5
6
6
  PATCH = 0
7
- BUILD = 2
7
+ BUILD = 7
8
8
 
9
9
  def self.version
10
10
  [MAJOR, MINOR, PATCH, BUILD].compact.join('.').freeze
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.0.2
4
+ version: 1.5.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-09 00:00:00.000000000 Z
11
+ date: 2020-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae
@@ -102,7 +102,7 @@ homepage: https://github.com/bionomia/dwc_agent
102
102
  licenses:
103
103
  - MIT
104
104
  metadata: {}
105
- post_install_message:
105
+ post_install_message:
106
106
  rdoc_options:
107
107
  - "--encoding"
108
108
  - UTF-8
@@ -119,8 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
119
  - !ruby/object:Gem::Version
120
120
  version: '0'
121
121
  requirements: []
122
- rubygems_version: 3.0.6
123
- signing_key:
122
+ rubygems_version: 3.1.2
123
+ signing_key:
124
124
  specification_version: 4
125
125
  summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
126
126
  test_files: []