dwc_agent 0.4.0 → 1.1.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 988e79daa81edb97377cdaf65ad2e6aa79b6e524cd3c9ab940d7b4cc1c1c9885
4
- data.tar.gz: 7b6969b8da5e772858e68313e24954bd352c78f62e552d0b5b16a328b3d20e0c
3
+ metadata.gz: d56e8cb49d4857e324dbdda7d6cfa31e9a7a3079c40a12659ff6077d13308613
4
+ data.tar.gz: 7652bc849261aa4179a99e64a725a0f82d0b65b4d2e2885042ad35521887e705
5
5
  SHA512:
6
- metadata.gz: 5794e0e8b1d779a72e6d8ba599e85b196f870f16494e663830f3c265c3ee39101bf729e6670f0f8f7d5762dadd438b3843aab5d7bcd450bd1212d4c95f0c6183
7
- data.tar.gz: 91fca40528d0f20a698ea66602de134ba6c442e6ab7e51118003ba0afdf49eca1247caa33fcf025f9cced537dc6f2ecaad8a440f07d7d34232d1326bb2eaa213
6
+ metadata.gz: 99d384a56c180c8c5db64c5cc8cf072da272b427830875783a22d67279cd070af3bd0bd2b881e4a20018bdeba774d27e8adb8ae048668c28452171017b72c70b
7
+ data.tar.gz: edc59989136745e0b603039d08420b368ef92e95a0ab17b48bcabf439472c1252ec763f4f9c7a398b735995b340f897dc0f3d85a45c2ea1c9ccec643f050e904
@@ -16,9 +16,9 @@ module DwcAgent
16
16
  # @param parsed_namae [Object] the namae object
17
17
  # @return [Hash] the given, family hash
18
18
  def clean(parsed_namae)
19
- blank_name = { given: nil, family: nil }
19
+ blank_name = { given: nil, family: nil, particle: nil }
20
20
 
21
- if parsed_namae.family && FAMILY_BLACKLIST.include?(parsed_namae.family)
21
+ if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
22
22
  return blank_name
23
23
  end
24
24
 
@@ -99,6 +99,10 @@ module DwcAgent
99
99
  particle = nil
100
100
  end
101
101
 
102
+ if !particle.nil? && particle.include?(".")
103
+ particle = nil
104
+ end
105
+
102
106
  if !family.nil? && (family == family.upcase || family == family.downcase)
103
107
  family = NameCase(family)
104
108
  end
@@ -111,7 +115,11 @@ module DwcAgent
111
115
  return blank_name
112
116
  end
113
117
 
114
- { given: given, family: family }
118
+ if !family.nil? && FAMILY_BLACKLIST.any?{ |s| s.casecmp(family) == 0 }
119
+ return blank_name
120
+ end
121
+
122
+ { given: given, family: family, particle: particle }
115
123
  end
116
124
 
117
125
  end
@@ -20,7 +20,6 @@ module DwcAgent
20
20
  \b[,;]?\s*(?i:person\s*string)\b|
21
21
  \b[,;]?\s*(?i:colls)\.(\b|\z)|
22
22
  \b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
23
- (?i:no\s+(data|disponible))|
24
23
  \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
25
24
  [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
26
25
  May|Jun|Jul|Aug|Sept?|
@@ -78,10 +77,11 @@ module DwcAgent
78
77
  \b\s*\(?(?i:(fe)?male)\)?\s*\b|
79
78
  \b(?i:to\s+(sub)?spp?)\.?|
80
79
  (?i:nom\.?\s+rev\.?)|
81
- FNA|DAO|HUH|FDNMB|
80
+ FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
82
81
  AFSC\/POLISH\s+SORTING\s+CTR\.?|
83
82
  (?i:university|museum|exhibits?)|
84
83
  (?i:uqam)|
84
+ (?i:sem\s+(colec?tor|data))|
85
85
  \b[,;]\s+\d+\z|
86
86
  ["!@?]|
87
87
  [,]?\d+|
@@ -147,7 +147,8 @@ module DwcAgent
147
147
  '{' => '',
148
148
  '}' => '',
149
149
  '@' => '',
150
- '%' => ''
150
+ '%' => '',
151
+ '\\' => ''
151
152
  }
152
153
 
153
154
  PHRASE_SUBS = {
@@ -158,12 +159,17 @@ module DwcAgent
158
159
  }
159
160
 
160
161
  COMPLEX_SEPARATORS = %r{
161
- ^([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})\s+([A-Za-z]{4,},\s+(?:[A-Z]\.\s*){1,})$
162
+ ^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
163
+ }x
164
+
165
+ CONTRACTED_LIST = %r{
166
+ ^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
162
167
  }x
163
168
 
164
169
  BLACKLIST = %r{
165
170
  (?i:abundant)|
166
171
  (?i:adult|juvenile)|
172
+ (?i:administra(d|t)or)|
167
173
  (?i:anon)|
168
174
  (?i:australian?)|
169
175
  (?i:average)|
@@ -172,9 +178,10 @@ module DwcAgent
172
178
  (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
173
179
  (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
174
180
  (?i:carex|salix)|
175
- (?:catalog)|
181
+ (?i:catalog(ue)?)|
176
182
  (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
177
183
  \b\s*(?i:help)\s*\b|
184
+ (?i:data\s+not\s+captured)|
178
185
  (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
179
186
  (?i:desconocido)|
180
187
  (?i:exc?s?icc?at(a|i))|
@@ -192,6 +199,9 @@ module DwcAgent
192
199
  (?i:univ\.)|
193
200
  (?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
194
201
  (?i:non\s+pr(é|e)cis(é|e))|
202
+ (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
203
+ (?i:not?\s+(entered|stated))|
204
+ (?i:nomenclatur(e|al)\s+adjustment)|
195
205
  (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
196
206
  (?i:recreation|culture)|
197
207
  (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
@@ -219,7 +229,12 @@ module DwcAgent
219
229
  FAMILY_BLACKLIST = [
220
230
  "der",
221
231
  "van",
222
- "von"
232
+ "von",
233
+ "the",
234
+ "of",
235
+ "curators",
236
+ "nomenclatural",
237
+ "adjustment"
223
238
  ]
224
239
 
225
240
  TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
@@ -20,7 +20,8 @@ module DwcAgent
20
20
  @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
21
21
  @phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
22
22
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
23
- @add_separators_regex = Regexp.new %r{([A-Z]{1}\.)([[:alpha:]]{2,})}.to_s
23
+ @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
24
+ @contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
24
25
  end
25
26
 
26
27
  # Parses the passed-in string and returns a list of names.
@@ -34,6 +35,7 @@ module DwcAgent
34
35
  name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
35
36
  name.gsub!(@add_separators_regex, '\1 \2')
36
37
  name.gsub!(@complex_separators_regex, '\1 | \2')
38
+ name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
37
39
  name.gsub!(@residual_terminators_regex, '')
38
40
  name.squeeze!(' ')
39
41
  name.strip!
@@ -1,8 +1,8 @@
1
1
  module DwcAgent
2
2
  class Version
3
3
 
4
- MAJOR = 0
5
- MINOR = 4
4
+ MAJOR = 1
5
+ MINOR = 1
6
6
  PATCH = 0
7
7
  BUILD = nil
8
8
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-07 00:00:00.000000000 Z
11
+ date: 2019-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae