dwc_agent 1.3.0 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3bcc45d8ecae4211cd0f678727d1002ed54ca998934d0f72f458762b21152f1
4
- data.tar.gz: 6857f25e128b7a5a62e95bc36367679356ef9321b65e783b3ea04a7e0501afa5
3
+ metadata.gz: fa4fb87ea91fd1f0e67278590192a55bfc7f1e8d6f4b8dc92c1f9f5eb508e44c
4
+ data.tar.gz: a89b51ea705885713ef8615c67e1ea10798abfe593b5646b4de9fb8e1b478762
5
5
  SHA512:
6
- metadata.gz: b5489f5f4d6473de3054377c7884b37cbb1ce8982f68a4e52ea39cf68ec59945a8708a90400b0aa8c509505bd5c48cef2ab4a99e19d1661af21fd6cab349a0b2
7
- data.tar.gz: a27efb2e8aa6c9183d473217e6d50ac309163cebf12c7d191f878f28838c006cd6132c5ccd2ec71b76e108be100492332edd93917adb2ee32417d5e9f31a87ea
6
+ metadata.gz: d676b64441d0097bd6272e2cd694c5754c4bdaed8fd0f523ecbe28748c8ccedffd9dd1c0430f5ad25cf48c02705b8131a9ce1021c07965da0791ae5f62e36c8a
7
+ data.tar.gz: 00effae1b438e6d97ef8da8383aa407985876d148b5b30a51ca98d2befa0dc8ac4a8c69bb389f75dd08c147853522490a406470dd8c4aa20d976238cf2cb4d82
data/bin/dwcagent CHANGED
@@ -5,7 +5,7 @@ require 'dwc_agent'
5
5
  require 'json'
6
6
 
7
7
  names = []
8
- DwcAgent.parse(ARGV[0]).each do |r|
8
+ DwcAgent.parse(ARGV[0].dup).each do |r|
9
9
  name = DwcAgent.clean(r)
10
10
  if !name[:family].nil? && name[:family].length >= 2
11
11
  names << name
@@ -3,4 +3,4 @@
3
3
 
4
4
  require 'dwc_agent'
5
5
 
6
- puts DwcAgent.similarity_score(ARGV[0],ARGV[1])
6
+ puts DwcAgent.similarity_score(ARGV[0].dup,ARGV[1].dup)
@@ -22,6 +22,10 @@ module DwcAgent
22
22
  return blank_name
23
23
  end
24
24
 
25
+ if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
26
+ return blank_name
27
+ end
28
+
25
29
  if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
26
30
  return blank_name
27
31
  end
@@ -38,9 +42,9 @@ module DwcAgent
38
42
  return blank_name
39
43
  end
40
44
 
41
- if parsed_namae.given &&
42
- parsed_namae.family &&
43
- parsed_namae.family.count(".") > 0 &&
45
+ if parsed_namae.given &&
46
+ parsed_namae.family &&
47
+ parsed_namae.family.count(".") > 0 &&
44
48
  parsed_namae.family.length - parsed_namae.family.count(".") <= 3
45
49
  given = parsed_namae.given
46
50
  family = parsed_namae.family
@@ -48,9 +52,9 @@ module DwcAgent
48
52
  parsed_namae.given = family
49
53
  end
50
54
 
51
- if parsed_namae.given &&
52
- parsed_namae.family &&
53
- parsed_namae.family.length <=3 &&
55
+ if parsed_namae.given &&
56
+ parsed_namae.family &&
57
+ parsed_namae.family.length <=3 &&
54
58
  parsed_namae.family == parsed_namae.family.upcase &&
55
59
  parsed_namae.given[-1] != "."
56
60
  given = parsed_namae.given
@@ -59,9 +63,9 @@ module DwcAgent
59
63
  parsed_namae.given = family
60
64
  end
61
65
 
62
- if parsed_namae.given &&
63
- (parsed_namae.given == parsed_namae.given.upcase ||
64
- parsed_namae.given == parsed_namae.given.downcase) &&
66
+ if parsed_namae.given &&
67
+ (parsed_namae.given == parsed_namae.given.upcase ||
68
+ parsed_namae.given == parsed_namae.given.downcase) &&
65
69
  !parsed_namae.given.include?(".") &&
66
70
  parsed_namae.given.tr(".","").length >= 4
67
71
  parsed_namae.given = NameCase(parsed_namae.given)
@@ -115,8 +119,12 @@ module DwcAgent
115
119
  return blank_name
116
120
  end
117
121
 
122
+ if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
123
+ return blank_name
124
+ end
125
+
118
126
  { given: given, family: family, particle: particle }
119
127
  end
120
128
 
121
129
  end
122
- end
130
+ end
@@ -1,7 +1,7 @@
1
1
  module DwcAgent
2
2
  STRIP_OUT = %r{
3
3
  ^[\[{(]|
4
- [\]})]$|
4
+ [\]})]\??$|
5
5
  \s*?\d+\.\d+|
6
6
  \b\d+\(?(?i:[[:alpha:]])\)?\b|
7
7
  \b[,;]?\s*(?i:et\.?\s+al)\.?|
@@ -13,7 +13,7 @@ module DwcAgent
13
13
  \b[,;]?\s*(?i:unkn?own)\b|
14
14
  \b[,;]?\s*(?i:n/a)\b|
15
15
  \b[,;]?\s*(?i:ann?onymous)\b|
16
- \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|illegible|scripsit)\)?\b|
16
+ \b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
17
17
  \b[,;]?\s*(?i:importer|gift)\:?\b|
18
18
  \b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
19
19
  \b[,;]?\s*(?i:string)\b|
@@ -48,10 +48,12 @@ module DwcAgent
48
48
  \d+\s+(?i:Oct|Octob(er|re))\.?\b|
49
49
  \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
50
50
  \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
51
+ \b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
51
52
  (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
52
53
  \b\s*(?i:maybe)\s*\b|
53
54
  \b\s*(?i:prob)\.\s*\b|
54
55
  \(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
56
+ \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
55
57
  (?i:fide)\:?\s*\b|
56
58
  (?i:game\s+dept)\.?\s*\b|
57
59
  (?i:see\s+notes?\s*(inside)?)|
@@ -71,17 +73,19 @@ module DwcAgent
71
73
  (?i:American\s+Museum\s+of\s+Natural\s+History)|
72
74
  (?i:The\s+Paleontological\s+Research\s+Institution)|
73
75
  (?i:museum\s+victoria)|
76
+ \b\s*(?i:United\s+States|Russia)\s*\b|
74
77
  (?i:revised|photograph|fruits\s+only)|
75
78
  -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
76
79
  -?\s*(?i:synonym(y|ie))|
77
80
  \b\s*\(?(?i:(fe)?male)\)?\s*\b|
78
81
  \b(?i:to\s+(sub)?spp?)\.?|
79
82
  (?i:nom\.?\s+rev\.?)|
80
- FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
83
+ FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|
81
84
  AFSC\/POLISH\s+SORTING\s+CTR\.?|
82
85
  (?i:university|museum|exhibits?)|
83
86
  (?i:uqam)|
84
87
  (?i:sem\s+(colec?tor|data))|
88
+ (?i:no\s+coll\.?(ector)?)|
85
89
  \b[,;]\s+\d+\z|
86
90
  ["!@?]|
87
91
  [,]?\d+|
@@ -133,8 +137,6 @@ module DwcAgent
133
137
  '|' => ' | ',
134
138
  '(' => ' ',
135
139
  ')' => ' ',
136
- '[' => ' ',
137
- ']' => ' ',
138
140
  '?' => '',
139
141
  '!' => '',
140
142
  '=' => '',
@@ -193,19 +195,22 @@ module DwcAgent
193
195
  (?i:geographic)|
194
196
  (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
195
197
  (?i:univ\.)|
196
- (?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
198
+ (?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
197
199
  (?i:non\s+pr(é|e)cis(é|e))|
198
200
  (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
199
201
  (?i:not?\s+(entered|stated))|
200
202
  (?i:nomenclatur(e|al)\s+adjustment)|
203
+ (?i:not\s+available)|
201
204
  (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
205
+ (?i:popa\s+observers?)|
202
206
  (?i:recreation|culture)|
203
207
  (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
204
208
  (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
205
209
  (?i:commercial|company|control|product)|
210
+ (?i:sequence\s+data)|
206
211
  (?i:size|large|colou?r)\s+|
207
212
  (?i:skeleton)|
208
- (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|exchange|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
213
+ (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
209
214
  (?i:submersible)|
210
215
  (?i:synonymy?)|(topo|syn|holo)type|
211
216
  (?i:systematic|perspective)|
@@ -213,6 +218,7 @@ module DwcAgent
213
218
  \s*(?i:too)\s+|\s*(?i:the)\s+|
214
219
  (?i:taxiderm(ies|y))|
215
220
  (?i:though)|
221
+ (?i:texas\s+instruments?)\s*?(for)?|
216
222
  (?:tropical)|
217
223
  (?i:toward|seen at)|
218
224
  (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
@@ -223,14 +229,33 @@ module DwcAgent
223
229
  }x
224
230
 
225
231
  FAMILY_BLACKLIST = [
232
+ "da",
233
+ "de'",
234
+ "del",
226
235
  "der",
236
+ "du",
237
+ "el",
227
238
  "van",
228
239
  "von",
229
240
  "the",
230
241
  "of",
242
+ "adjustment",
243
+ "available",
244
+ "catalogue",
231
245
  "curators",
246
+ "data",
247
+ "determination",
248
+ "dissection",
249
+ "entered",
232
250
  "nomenclatural",
233
- "adjustment"
251
+ "orig",
252
+ "registration",
253
+ "science"
254
+ ]
255
+
256
+ GIVEN_BLACKLIST = [
257
+ "not any",
258
+ "has not"
234
259
  ]
235
260
 
236
261
  TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
@@ -17,7 +17,7 @@ module DwcAgent
17
17
 
18
18
  @strip_out_regex = Regexp.new STRIP_OUT.to_s
19
19
  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
20
- @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
20
+ @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
21
21
  @phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
22
22
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
23
23
  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
@@ -30,6 +30,7 @@ module DwcAgent
30
30
  def parse(name)
31
31
  return [] if name.nil? || name == ""
32
32
  name.gsub!(@strip_out_regex, ' ')
33
+ name.gsub!(/\[|\]/, '')
33
34
  name.gsub!(@char_subs_regex, CHAR_SUBS)
34
35
  name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
35
36
  name.gsub!(@add_separators_regex, '\1 \2')
@@ -2,8 +2,8 @@ module DwcAgent
2
2
  class Version
3
3
 
4
4
  MAJOR = 1
5
- MINOR = 3
6
- PATCH = 0
5
+ MINOR = 4
6
+ PATCH = 4
7
7
  BUILD = nil
8
8
 
9
9
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-27 00:00:00.000000000 Z
11
+ date: 2020-01-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae