dwc_agent 1.2.0 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dcdfb4922038d07715bd064b74a516203bcce44ef824683ac9766586764bb98f
4
- data.tar.gz: d9113f199abc420d7c608bfbead6e70f1f6192437a82e43541f77eb187752c2a
3
+ metadata.gz: ee3ede60926c849a6db7094f63a9d03d07f3e9d13fd74428841812f8970aa681
4
+ data.tar.gz: 7075c82cd35834f8dad00d57922e46f7ab0eb4f086eb4c0ad40b02c5726ca001
5
5
  SHA512:
6
- metadata.gz: 403049856dbfbc83c984b5175615718c738b46312ecced4d3b5853fb4584a080d1e868492ee476187c8cf7238481845c0f77bd181e6ceaeebc22bae8b7aef127
7
- data.tar.gz: 01e8bc4beb84140a80763c6c0dc8ff6ee6f967e8382cbacf04a230aa3405f2394ec1a90570471e113d90264c1e202c5452428858eb5aff6bc499f4e795dc3158
6
+ metadata.gz: 9e7b655e50ec2d744d74ad44a30b35d75e76b7e0160a35ee7e6b295a03dbc343a012a702e3614c10516bf1282617ec00a0f472e2cc1aefc9d3decc9b4494a946
7
+ data.tar.gz: a28bd967f7df5afaf5e71a01f3902c0bab33dc735bc0861ceb137bad2d81981ec9ae6768497afff267b986bf653f9ba1fb8091221720964866b3352f8d8e2aae
data/bin/dwcagent CHANGED
@@ -5,7 +5,7 @@ require 'dwc_agent'
5
5
  require 'json'
6
6
 
7
7
  names = []
8
- DwcAgent.parse(ARGV[0]).each do |r|
8
+ DwcAgent.parse(ARGV[0].dup).each do |r|
9
9
  name = DwcAgent.clean(r)
10
10
  if !name[:family].nil? && name[:family].length >= 2
11
11
  names << name
@@ -3,4 +3,4 @@
3
3
 
4
4
  require 'dwc_agent'
5
5
 
6
- puts DwcAgent.similarity_score(ARGV[0],ARGV[1])
6
+ puts DwcAgent.similarity_score(ARGV[0].dup,ARGV[1].dup)
@@ -22,6 +22,10 @@ module DwcAgent
22
22
  return blank_name
23
23
  end
24
24
 
25
+ if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
26
+ return blank_name
27
+ end
28
+
25
29
  if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
26
30
  return blank_name
27
31
  end
@@ -38,9 +42,9 @@ module DwcAgent
38
42
  return blank_name
39
43
  end
40
44
 
41
- if parsed_namae.given &&
42
- parsed_namae.family &&
43
- parsed_namae.family.count(".") > 0 &&
45
+ if parsed_namae.given &&
46
+ parsed_namae.family &&
47
+ parsed_namae.family.count(".") > 0 &&
44
48
  parsed_namae.family.length - parsed_namae.family.count(".") <= 3
45
49
  given = parsed_namae.given
46
50
  family = parsed_namae.family
@@ -48,9 +52,9 @@ module DwcAgent
48
52
  parsed_namae.given = family
49
53
  end
50
54
 
51
- if parsed_namae.given &&
52
- parsed_namae.family &&
53
- parsed_namae.family.length <=3 &&
55
+ if parsed_namae.given &&
56
+ parsed_namae.family &&
57
+ parsed_namae.family.length <=3 &&
54
58
  parsed_namae.family == parsed_namae.family.upcase &&
55
59
  parsed_namae.given[-1] != "."
56
60
  given = parsed_namae.given
@@ -59,9 +63,9 @@ module DwcAgent
59
63
  parsed_namae.given = family
60
64
  end
61
65
 
62
- if parsed_namae.given &&
63
- (parsed_namae.given == parsed_namae.given.upcase ||
64
- parsed_namae.given == parsed_namae.given.downcase) &&
66
+ if parsed_namae.given &&
67
+ (parsed_namae.given == parsed_namae.given.upcase ||
68
+ parsed_namae.given == parsed_namae.given.downcase) &&
65
69
  !parsed_namae.given.include?(".") &&
66
70
  parsed_namae.given.tr(".","").length >= 4
67
71
  parsed_namae.given = NameCase(parsed_namae.given)
@@ -115,8 +119,12 @@ module DwcAgent
115
119
  return blank_name
116
120
  end
117
121
 
122
+ if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
123
+ return blank_name
124
+ end
125
+
118
126
  { given: given, family: family, particle: particle }
119
127
  end
120
128
 
121
129
  end
122
- end
130
+ end
@@ -48,10 +48,12 @@ module DwcAgent
48
48
  \d+\s+(?i:Oct|Octob(er|re))\.?\b|
49
49
  \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
50
50
  \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
51
+ \b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
51
52
  (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
52
53
  \b\s*(?i:maybe)\s*\b|
53
54
  \b\s*(?i:prob)\.\s*\b|
54
55
  \(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
56
+ \b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
55
57
  (?i:fide)\:?\s*\b|
56
58
  (?i:game\s+dept)\.?\s*\b|
57
59
  (?i:see\s+notes?\s*(inside)?)|
@@ -71,6 +73,7 @@ module DwcAgent
71
73
  (?i:American\s+Museum\s+of\s+Natural\s+History)|
72
74
  (?i:The\s+Paleontological\s+Research\s+Institution)|
73
75
  (?i:museum\s+victoria)|
76
+ \b\s*(?i:United\s+States|Russia)\s*\b|
74
77
  (?i:revised|photograph|fruits\s+only)|
75
78
  -?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
76
79
  -?\s*(?i:synonym(y|ie))|
@@ -162,16 +165,6 @@ module DwcAgent
162
165
  ^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
163
166
  }x
164
167
 
165
- # Was used in 1.1.0 but it sunk performance so threw it back to a WIP
166
- #
167
- # @contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
168
- #
169
- # name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
170
- #
171
- # CONTRACTED_LIST = %r{
172
- # ^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
173
- # }x
174
-
175
168
  BLACKLIST = %r{
176
169
  (?i:abundant)|
177
170
  (?i:adult|juvenile)|
@@ -208,11 +201,14 @@ module DwcAgent
208
201
  (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
209
202
  (?i:not?\s+(entered|stated))|
210
203
  (?i:nomenclatur(e|al)\s+adjustment)|
204
+ (?i:not\s+available)|
211
205
  (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
206
+ (?i:popa\s+observers?)|
212
207
  (?i:recreation|culture)|
213
208
  (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
214
209
  (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
215
210
  (?i:commercial|company|control|product)|
211
+ (?i:sequence\s+data)|
216
212
  (?i:size|large|colou?r)\s+|
217
213
  (?i:skeleton)|
218
214
  (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|exchange|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
@@ -223,6 +219,7 @@ module DwcAgent
223
219
  \s*(?i:too)\s+|\s*(?i:the)\s+|
224
220
  (?i:taxiderm(ies|y))|
225
221
  (?i:though)|
222
+ (?i:texas\s+instruments?)\s*?(for)?|
226
223
  (?:tropical)|
227
224
  (?i:toward|seen at)|
228
225
  (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
@@ -240,7 +237,19 @@ module DwcAgent
240
237
  "of",
241
238
  "curators",
242
239
  "nomenclatural",
243
- "adjustment"
240
+ "adjustment",
241
+ "available",
242
+ "data",
243
+ "orig",
244
+ "science",
245
+ "catalogue",
246
+ "entered",
247
+ "registration"
248
+ ]
249
+
250
+ GIVEN_BLACKLIST = [
251
+ "not any",
252
+ "has not"
244
253
  ]
245
254
 
246
255
  TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
@@ -2,8 +2,8 @@ module DwcAgent
2
2
  class Version
3
3
 
4
4
  MAJOR = 1
5
- MINOR = 2
6
- PATCH = 0
5
+ MINOR = 4
6
+ PATCH = 3
7
7
  BUILD = nil
8
8
 
9
9
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-11 00:00:00.000000000 Z
11
+ date: 2020-01-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae