dwc_agent 1.4.1 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2819869504dd7262508c63978dae9b03cd6380838bf20f1e5ffecb8464b610a
4
- data.tar.gz: 3d1afff77e4bea9995cec5a3f72687646cdb6c9f7637bef8444f7a45df4ab556
3
+ metadata.gz: 1cf058ee8ca5956a4d05ca0122d5ee9a7497b304576bd4af92fd0f0597846a08
4
+ data.tar.gz: 3ef7bafb6a3bd9b9af826f161448b259739ec791b4158e5704f6d4da98af48fe
5
5
  SHA512:
6
- metadata.gz: 223cdf7a80a54e102da1f35d0b98d420d5a3b4ca1a11494ac3feca73a80d86f83a092b3c8b5dac1e5079a56cb5c69c3771876a091770cb4c9e45de0fc905d177
7
- data.tar.gz: 042d702364e285c9e629180802ecd2f1898af1c50ec6fbdcacefa8f117d734db7843fd17e0f816771fcef47c7839ccfa8014d963f6798eaec25e7a3854b88289
6
+ metadata.gz: 0be4b1c78b8fbf61c1a5363ce32efa780372b394bf88e9fd5a5fc42be77eea564d865637feaca08b63712d8780c515db2408e01967519fc014a3b6f5468793c9
7
+ data.tar.gz: 37095ef2920f06e6152aa58aab26034ce5b00d15778a8ae3270c7672c7e53ba2853223f7f67bfbfc5953d06050b8f2a8c8a2a1f0c3e41ba002aa18411a57092c
@@ -22,6 +22,10 @@ module DwcAgent
22
22
  return blank_name
23
23
  end
24
24
 
25
+ if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
26
+ return blank_name
27
+ end
28
+
25
29
  if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
26
30
  return blank_name
27
31
  end
@@ -38,9 +42,9 @@ module DwcAgent
38
42
  return blank_name
39
43
  end
40
44
 
41
- if parsed_namae.given &&
42
- parsed_namae.family &&
43
- parsed_namae.family.count(".") > 0 &&
45
+ if parsed_namae.given &&
46
+ parsed_namae.family &&
47
+ parsed_namae.family.count(".") > 0 &&
44
48
  parsed_namae.family.length - parsed_namae.family.count(".") <= 3
45
49
  given = parsed_namae.given
46
50
  family = parsed_namae.family
@@ -48,9 +52,9 @@ module DwcAgent
48
52
  parsed_namae.given = family
49
53
  end
50
54
 
51
- if parsed_namae.given &&
52
- parsed_namae.family &&
53
- parsed_namae.family.length <=3 &&
55
+ if parsed_namae.given &&
56
+ parsed_namae.family &&
57
+ parsed_namae.family.length <=3 &&
54
58
  parsed_namae.family == parsed_namae.family.upcase &&
55
59
  parsed_namae.given[-1] != "."
56
60
  given = parsed_namae.given
@@ -59,9 +63,9 @@ module DwcAgent
59
63
  parsed_namae.given = family
60
64
  end
61
65
 
62
- if parsed_namae.given &&
63
- (parsed_namae.given == parsed_namae.given.upcase ||
64
- parsed_namae.given == parsed_namae.given.downcase) &&
66
+ if parsed_namae.given &&
67
+ (parsed_namae.given == parsed_namae.given.upcase ||
68
+ parsed_namae.given == parsed_namae.given.downcase) &&
65
69
  !parsed_namae.given.include?(".") &&
66
70
  parsed_namae.given.tr(".","").length >= 4
67
71
  parsed_namae.given = NameCase(parsed_namae.given)
@@ -115,8 +119,12 @@ module DwcAgent
115
119
  return blank_name
116
120
  end
117
121
 
122
+ if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
123
+ return blank_name
124
+ end
125
+
118
126
  { given: given, family: family, particle: particle }
119
127
  end
120
128
 
121
129
  end
122
- end
130
+ end
@@ -1,7 +1,7 @@
1
1
  module DwcAgent
2
2
  STRIP_OUT = %r{
3
3
  ^[\[{(]|
4
- [\]})]$|
4
+ [\]})]\??$|
5
5
  \s*?\d+\.\d+|
6
6
  \b\d+\(?(?i:[[:alpha:]])\)?\b|
7
7
  \b[,;]?\s*(?i:et\.?\s+al)\.?|
@@ -13,9 +13,8 @@ module DwcAgent
13
13
  \b[,;]?\s*(?i:unkn?own)\b|
14
14
  \b[,;]?\s*(?i:n/a)\b|
15
15
  \b[,;]?\s*(?i:ann?onymous)\b|
16
- \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|illegible|scripsit)\)?\b|
16
+ \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
17
17
  \b[,;]?\s*(?i:importer|gift)\:?\b|
18
- \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
19
18
  \b[,;]?\s*(?i:string)\b|
20
19
  \b[,;]?\s*(?i:person\s*string)\b|
21
20
  \b[,;]?\s*(?i:colls)\.(\b|\z)|
@@ -48,10 +47,12 @@ module DwcAgent
48
47
  \d+\s+(?i:Oct|Octob(er|re))\.?\b|
49
48
  \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
50
49
  \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
50
+ \b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
51
51
  (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
52
52
  \b\s*(?i:maybe)\s*\b|
53
53
  \b\s*(?i:prob)\.\s*\b|
54
54
  \(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
55
+ \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
55
56
  (?i:fide)\:?\s*\b|
56
57
  (?i:game\s+dept)\.?\s*\b|
57
58
  (?i:see\s+notes?\s*(inside)?)|
@@ -66,22 +67,24 @@ module DwcAgent
66
67
  ORCID|
67
68
  MRI(\s|-)PAS|
68
69
  urn\:qm\.qld\.gov\.au\:collector|
69
- (?i:University\s+of\s+California)\,?\s+?(?i:Berkeley)?|
70
+ (?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
70
71
  (?i:Field\s+Museum\s+of\s+Natural\s+History)|
71
72
  (?i:American\s+Museum\s+of\s+Natural\s+History)|
72
73
  (?i:The\s+Paleontological\s+Research\s+Institution)|
73
74
  (?i:museum\s+victoria)|
75
+ \b\s*(?i:United\s+States|Russia)\s*\b|
74
76
  (?i:revised|photograph|fruits\s+only)|
75
77
  -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
76
78
  -?\s*(?i:synonym(y|ie))|
77
79
  \b\s*\(?(?i:(fe)?male)\)?\s*\b|
78
80
  \b(?i:to\s+(sub)?spp?)\.?|
79
81
  (?i:nom\.?\s+rev\.?)|
80
- FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
82
+ FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|
81
83
  AFSC\/POLISH\s+SORTING\s+CTR\.?|
82
84
  (?i:university|museum|exhibits?)|
83
85
  (?i:uqam)|
84
86
  (?i:sem\s+(colec?tor|data))|
87
+ (?i:no\s+coll\.?(ector)?)|
85
88
  \b[,;]\s+\d+\z|
86
89
  ["!@?]|
87
90
  [,]?\d+|
@@ -133,8 +136,6 @@ module DwcAgent
133
136
  '|' => ' | ',
134
137
  '(' => ' ',
135
138
  ')' => ' ',
136
- '[' => ' ',
137
- ']' => ' ',
138
139
  '?' => '',
139
140
  '!' => '',
140
141
  '=' => '',
@@ -152,10 +153,12 @@ module DwcAgent
152
153
  }
153
154
 
154
155
  PHRASE_SUBS = {
155
- 'Dr\.' => 'Dr. ',
156
- 'Mr\.' => 'Mr. ',
157
- 'Mrs\.' => 'Mrs. ',
158
- 'Prof\.' => 'Prof. '
156
+ 'dr\.' => 'Dr. ',
157
+ 'mr\.' => 'Mr. ',
158
+ 'mrs\.' => 'Mrs. ',
159
+ 'prof\.' => 'Prof. ',
160
+ '\, ph\.d\.' => ' Ph.D.',
161
+ '\, bro\.' => ' Bro.'
159
162
  }
160
163
 
161
164
  COMPLEX_SEPARATORS = %r{
@@ -193,13 +196,15 @@ module DwcAgent
193
196
  (?i:geographic)|
194
197
  (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
195
198
  (?i:univ\.)|
196
- (?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
199
+ (?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
197
200
  (?i:non\s+pr(é|e)cis(é|e))|
201
+ (?i:no\s+consta)|
198
202
  (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
199
203
  (?i:not?\s+(entered|stated))|
200
204
  (?i:nomenclatur(e|al)\s+adjustment)|
201
205
  (?i:not\s+available)|
202
206
  (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
207
+ (?i:popa\s+observers?)|
203
208
  (?i:recreation|culture)|
204
209
  (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
205
210
  (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
@@ -207,7 +212,7 @@ module DwcAgent
207
212
  (?i:sequence\s+data)|
208
213
  (?i:size|large|colou?r)\s+|
209
214
  (?i:skeleton)|
210
- (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|exchange|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
215
+ (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
211
216
  (?i:submersible)|
212
217
  (?i:synonymy?)|(topo|syn|holo)type|
213
218
  (?i:systematic|perspective)|
@@ -226,21 +231,36 @@ module DwcAgent
226
231
  }x
227
232
 
228
233
  FAMILY_BLACKLIST = [
234
+ "da",
235
+ "de'",
236
+ "del",
229
237
  "der",
238
+ "du",
239
+ "el",
230
240
  "van",
231
241
  "von",
232
242
  "the",
233
243
  "of",
234
- "curators",
235
- "nomenclatural",
236
244
  "adjustment",
237
245
  "available",
246
+ "arachnology",
247
+ "catalogue",
248
+ "curators",
238
249
  "data",
250
+ "determination",
251
+ "dissection",
252
+ "entered",
253
+ "nomenclatural",
239
254
  "orig",
240
- "science",
241
- "catalogue"
255
+ "registration",
256
+ "science"
257
+ ]
258
+
259
+ GIVEN_BLACKLIST = [
260
+ "not any",
261
+ "has not"
242
262
  ]
243
263
 
244
- TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
264
+ TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
245
265
 
246
266
  end
@@ -8,7 +8,7 @@ module DwcAgent
8
8
  end
9
9
 
10
10
  def initialize
11
- options = {
11
+ options = {
12
12
  prefer_comma_as_separator: true,
13
13
  separator: SPLIT_BY,
14
14
  title: TITLE
@@ -17,12 +17,12 @@ module DwcAgent
17
17
 
18
18
  @strip_out_regex = Regexp.new STRIP_OUT.to_s
19
19
  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
20
- @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
21
- @phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
20
+ @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
21
+ @phrase_subs_regex = Regexp.new((PHRASE_SUBS.keys.join('|')).to_s, Regexp::IGNORECASE)
22
22
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
23
23
  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
24
24
  end
25
-
25
+
26
26
  # Parses the passed-in string and returns a list of names.
27
27
  #
28
28
  # @param names [String] the name or names to be parsed
@@ -30,6 +30,7 @@ module DwcAgent
30
30
  def parse(name)
31
31
  return [] if name.nil? || name == ""
32
32
  name.gsub!(@strip_out_regex, ' ')
33
+ name.gsub!(/\[|\]/, '')
33
34
  name.gsub!(@char_subs_regex, CHAR_SUBS)
34
35
  name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
35
36
  name.gsub!(@add_separators_regex, '\1 \2')
@@ -41,4 +42,4 @@ module DwcAgent
41
42
  end
42
43
 
43
44
  end
44
- end
45
+ end
@@ -3,7 +3,7 @@ module DwcAgent
3
3
 
4
4
  MAJOR = 1
5
5
  MINOR = 4
6
- PATCH = 1
6
+ PATCH = 6
7
7
  BUILD = nil
8
8
 
9
9
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.1
4
+ version: 1.4.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-25 00:00:00.000000000 Z
11
+ date: 2020-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae
@@ -119,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
119
  - !ruby/object:Gem::Version
120
120
  version: '0'
121
121
  requirements: []
122
- rubygems_version: 3.0.3
122
+ rubygems_version: 3.0.6
123
123
  signing_key:
124
124
  specification_version: 4
125
125
  summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy