dwc_agent 1.4.2 → 1.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 75ac9f77b1b4d5761881cafca45e66c2745eb47bb501ad9fab84cad6324b41ee
4
- data.tar.gz: 16057726f330cb033fe7e3af0af58dd4fbb2f6f9817a56c3e86c88da1fbccf61
3
+ metadata.gz: 2975bee8cb8675fa7b6e50bc45f90d4d21855a2a22fe72999220b80af4e9d54d
4
+ data.tar.gz: 06b764667fe3235983492f2182eb2dc90d4ad768382e1f1f5eb7384144180a86
5
5
  SHA512:
6
- metadata.gz: f628d3bfec046b54338eb08ec18cc5352dff7560e5e87e2c949378e66bcb44df84af637ea7c8b4d559504945b5682b6f019f2bde56d31552a1c4537902b2a71e
7
- data.tar.gz: 444b15c0d391e12d8ccd6823f8ae97688d9b20ed075323795e22acd7834add4e5068fcc2d4fc3729e36aa3bdc11f29f4a6f8af8c7fc1cfc34d1f294159212328
6
+ metadata.gz: 7d026fd7ffc15101bd5f994263c1950115fee1d5ee9a5af89f46ae1bfd5aa7e761216b77dea5a4b225503ed11ae889bc8853aafab98d700532b60723cfeefec1
7
+ data.tar.gz: fd2d7986eb1ea1456800f03c6bf78b31ef33e50e743e1c316b873334fa16767e23afa2c463bf3445e23573620b86746ea61e67c890b16d6b8b2673e01f8a6a84
@@ -1,10 +1,11 @@
1
1
  module DwcAgent
2
2
  STRIP_OUT = %r{
3
3
  ^[\[{(]|
4
- [\]})]$|
4
+ [\]})]\??$|
5
+ (?i:acc\s?\#)|
5
6
  \s*?\d+\.\d+|
6
7
  \b\d+\(?(?i:[[:alpha:]])\)?\b|
7
- \b[,;]?\s*(?i:et\.?\s+al)\.?|
8
+ \b[,;]?\s*(?i:et\.?\s+al|&\s+al)\.?|
8
9
  \b\s+(bis|ter)(\b|\z)|
9
10
  \bu\.\s*a\.|
10
11
  \b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
@@ -13,9 +14,8 @@ module DwcAgent
13
14
  \b[,;]?\s*(?i:unkn?own)\b|
14
15
  \b[,;]?\s*(?i:n/a)\b|
15
16
  \b[,;]?\s*(?i:ann?onymous)\b|
16
- \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|illegible|scripsit)\)?\b|
17
+ \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
17
18
  \b[,;]?\s*(?i:importer|gift)\:?\b|
18
- \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
19
19
  \b[,;]?\s*(?i:string)\b|
20
20
  \b[,;]?\s*(?i:person\s*string)\b|
21
21
  \b[,;]?\s*(?i:colls)\.(\b|\z)|
@@ -48,10 +48,12 @@ module DwcAgent
48
48
  \d+\s+(?i:Oct|Octob(er|re))\.?\b|
49
49
  \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
50
50
  \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
51
+ \b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
51
52
  (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
52
53
  \b\s*(?i:maybe)\s*\b|
53
54
  \b\s*(?i:prob)\.\s*\b|
54
55
  \(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
56
+ \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
55
57
  (?i:fide)\:?\s*\b|
56
58
  (?i:game\s+dept)\.?\s*\b|
57
59
  (?i:see\s+notes?\s*(inside)?)|
@@ -66,22 +68,24 @@ module DwcAgent
66
68
  ORCID|
67
69
  MRI(\s|-)PAS|
68
70
  urn\:qm\.qld\.gov\.au\:collector|
69
- (?i:University\s+of\s+California)\,?\s+?(?i:Berkeley)?|
71
+ (?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
70
72
  (?i:Field\s+Museum\s+of\s+Natural\s+History)|
71
73
  (?i:American\s+Museum\s+of\s+Natural\s+History)|
72
74
  (?i:The\s+Paleontological\s+Research\s+Institution)|
73
- (?i:museum\s+victoria)|
75
+ (?i:museums?\s+victoria)|
76
+ \b\s*(?i:United\s+States|Russia)\s*\b|
74
77
  (?i:revised|photograph|fruits\s+only)|
75
78
  -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
76
79
  -?\s*(?i:synonym(y|ie))|
77
80
  \b\s*\(?(?i:(fe)?male)\)?\s*\b|
78
81
  \b(?i:to\s+(sub)?spp?)\.?|
79
82
  (?i:nom\.?\s+rev\.?)|
80
- FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
83
+ FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|ACAD|
81
84
  AFSC\/POLISH\s+SORTING\s+CTR\.?|
82
85
  (?i:university|museum|exhibits?)|
83
86
  (?i:uqam)|
84
87
  (?i:sem\s+(colec?tor|data))|
88
+ (?i:no\s+coll\.?(ector)?)|
85
89
  \b[,;]\s+\d+\z|
86
90
  ["!@?]|
87
91
  [,]?\d+|
@@ -133,8 +137,6 @@ module DwcAgent
133
137
  '|' => ' | ',
134
138
  '(' => ' ',
135
139
  ')' => ' ',
136
- '[' => ' ',
137
- ']' => ' ',
138
140
  '?' => '',
139
141
  '!' => '',
140
142
  '=' => '',
@@ -152,10 +154,12 @@ module DwcAgent
152
154
  }
153
155
 
154
156
  PHRASE_SUBS = {
155
- 'Dr\.' => 'Dr. ',
156
- 'Mr\.' => 'Mr. ',
157
- 'Mrs\.' => 'Mrs. ',
158
- 'Prof\.' => 'Prof. '
157
+ 'dr\.' => 'Dr. ',
158
+ 'mr\.' => 'Mr. ',
159
+ 'mrs\.' => 'Mrs. ',
160
+ 'prof\.' => 'Prof. ',
161
+ '\, ph\.d\.' => ' Ph.D.',
162
+ '\, bro\.' => ' Bro.'
159
163
  }
160
164
 
161
165
  COMPLEX_SEPARATORS = %r{
@@ -171,6 +175,7 @@ module DwcAgent
171
175
  (?i:average)|
172
176
  (?i:believe|unclear|ill?egible|none|suggested|(dis)?agrees?)|approach|
173
177
  (?i:barcod)|
178
+ (?i:BgWd)|
174
179
  (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
175
180
  (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
176
181
  (?i:carex|salix)|
@@ -193,13 +198,15 @@ module DwcAgent
193
198
  (?i:geographic)|
194
199
  (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
195
200
  (?i:univ\.)|
196
- (?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
201
+ (?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
197
202
  (?i:non\s+pr(é|e)cis(é|e))|
203
+ (?i:no\s+consta)|
198
204
  (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
199
205
  (?i:not?\s+(entered|stated))|
200
206
  (?i:nomenclatur(e|al)\s+adjustment)|
201
207
  (?i:not\s+available)|
202
208
  (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
209
+ (?i:popa\s+observers?)|
203
210
  (?i:recreation|culture)|
204
211
  (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
205
212
  (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
@@ -207,7 +214,7 @@ module DwcAgent
207
214
  (?i:sequence\s+data)|
208
215
  (?i:size|large|colou?r)\s+|
209
216
  (?i:skeleton)|
210
- (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|exchange|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
217
+ (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
211
218
  (?i:submersible)|
212
219
  (?i:synonymy?)|(topo|syn|holo)type|
213
220
  (?i:systematic|perspective)|
@@ -226,20 +233,29 @@ module DwcAgent
226
233
  }x
227
234
 
228
235
  FAMILY_BLACKLIST = [
236
+ "da",
237
+ "de'",
238
+ "del",
229
239
  "der",
240
+ "du",
241
+ "el",
230
242
  "van",
231
243
  "von",
232
244
  "the",
233
245
  "of",
234
- "curators",
235
- "nomenclatural",
236
246
  "adjustment",
237
247
  "available",
248
+ "arachnology",
249
+ "catalogue",
250
+ "curators",
238
251
  "data",
252
+ "determination",
253
+ "dissection",
254
+ "entered",
255
+ "nomenclatural",
239
256
  "orig",
240
- "science",
241
- "catalogue",
242
- "entered"
257
+ "registration",
258
+ "science"
243
259
  ]
244
260
 
245
261
  GIVEN_BLACKLIST = [
@@ -247,6 +263,6 @@ module DwcAgent
247
263
  "has not"
248
264
  ]
249
265
 
250
- TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
266
+ TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
251
267
 
252
268
  end
@@ -8,7 +8,7 @@ module DwcAgent
8
8
  end
9
9
 
10
10
  def initialize
11
- options = {
11
+ options = {
12
12
  prefer_comma_as_separator: true,
13
13
  separator: SPLIT_BY,
14
14
  title: TITLE
@@ -17,12 +17,12 @@ module DwcAgent
17
17
 
18
18
  @strip_out_regex = Regexp.new STRIP_OUT.to_s
19
19
  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
20
- @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
21
- @phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
20
+ @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
21
+ @phrase_subs_regex = Regexp.new((PHRASE_SUBS.keys.join('|')).to_s, Regexp::IGNORECASE)
22
22
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
23
23
  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
24
24
  end
25
-
25
+
26
26
  # Parses the passed-in string and returns a list of names.
27
27
  #
28
28
  # @param names [String] the name or names to be parsed
@@ -30,6 +30,7 @@ module DwcAgent
30
30
  def parse(name)
31
31
  return [] if name.nil? || name == ""
32
32
  name.gsub!(@strip_out_regex, ' ')
33
+ name.gsub!(/\[|\]/, '')
33
34
  name.gsub!(@char_subs_regex, CHAR_SUBS)
34
35
  name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
35
36
  name.gsub!(@add_separators_regex, '\1 \2')
@@ -41,4 +42,4 @@ module DwcAgent
41
42
  end
42
43
 
43
44
  end
44
- end
45
+ end
@@ -3,7 +3,7 @@ module DwcAgent
3
3
 
4
4
  MAJOR = 1
5
5
  MINOR = 4
6
- PATCH = 2
6
+ PATCH = 7
7
7
  BUILD = nil
8
8
 
9
9
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.2
4
+ version: 1.4.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-12-02 00:00:00.000000000 Z
11
+ date: 2020-03-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae