dwc_agent 1.1.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d56e8cb49d4857e324dbdda7d6cfa31e9a7a3079c40a12659ff6077d13308613
4
- data.tar.gz: 7652bc849261aa4179a99e64a725a0f82d0b65b4d2e2885042ad35521887e705
3
+ metadata.gz: 75ac9f77b1b4d5761881cafca45e66c2745eb47bb501ad9fab84cad6324b41ee
4
+ data.tar.gz: 16057726f330cb033fe7e3af0af58dd4fbb2f6f9817a56c3e86c88da1fbccf61
5
5
  SHA512:
6
- metadata.gz: 99d384a56c180c8c5db64c5cc8cf072da272b427830875783a22d67279cd070af3bd0bd2b881e4a20018bdeba774d27e8adb8ae048668c28452171017b72c70b
7
- data.tar.gz: edc59989136745e0b603039d08420b368ef92e95a0ab17b48bcabf439472c1252ec763f4f9c7a398b735995b340f897dc0f3d85a45c2ea1c9ccec643f050e904
6
+ metadata.gz: f628d3bfec046b54338eb08ec18cc5352dff7560e5e87e2c949378e66bcb44df84af637ea7c8b4d559504945b5682b6f019f2bde56d31552a1c4537902b2a71e
7
+ data.tar.gz: 444b15c0d391e12d8ccd6823f8ae97688d9b20ed075323795e22acd7834add4e5068fcc2d4fc3729e36aa3bdc11f29f4a6f8af8c7fc1cfc34d1f294159212328
data/bin/dwcagent CHANGED
@@ -5,7 +5,7 @@ require 'dwc_agent'
5
5
  require 'json'
6
6
 
7
7
  names = []
8
- DwcAgent.parse(ARGV[0]).each do |r|
8
+ DwcAgent.parse(ARGV[0].dup).each do |r|
9
9
  name = DwcAgent.clean(r)
10
10
  if !name[:family].nil? && name[:family].length >= 2
11
11
  names << name
@@ -3,4 +3,4 @@
3
3
 
4
4
  require 'dwc_agent'
5
5
 
6
- puts DwcAgent.similarity_score(ARGV[0],ARGV[1])
6
+ puts DwcAgent.similarity_score(ARGV[0].dup,ARGV[1].dup)
@@ -22,7 +22,7 @@ module DwcAgent
22
22
  return blank_name
23
23
  end
24
24
 
25
- if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
25
+ if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
26
26
  return blank_name
27
27
  end
28
28
 
@@ -42,9 +42,9 @@ module DwcAgent
42
42
  return blank_name
43
43
  end
44
44
 
45
- if parsed_namae.given &&
46
- parsed_namae.family &&
47
- parsed_namae.family.count(".") > 0 &&
45
+ if parsed_namae.given &&
46
+ parsed_namae.family &&
47
+ parsed_namae.family.count(".") > 0 &&
48
48
  parsed_namae.family.length - parsed_namae.family.count(".") <= 3
49
49
  given = parsed_namae.given
50
50
  family = parsed_namae.family
@@ -52,9 +52,9 @@ module DwcAgent
52
52
  parsed_namae.given = family
53
53
  end
54
54
 
55
- if parsed_namae.given &&
56
- parsed_namae.family &&
57
- parsed_namae.family.length <=3 &&
55
+ if parsed_namae.given &&
56
+ parsed_namae.family &&
57
+ parsed_namae.family.length <=3 &&
58
58
  parsed_namae.family == parsed_namae.family.upcase &&
59
59
  parsed_namae.given[-1] != "."
60
60
  given = parsed_namae.given
@@ -63,9 +63,9 @@ module DwcAgent
63
63
  parsed_namae.given = family
64
64
  end
65
65
 
66
- if parsed_namae.given &&
67
- (parsed_namae.given == parsed_namae.given.upcase ||
68
- parsed_namae.given == parsed_namae.given.downcase) &&
66
+ if parsed_namae.given &&
67
+ (parsed_namae.given == parsed_namae.given.upcase ||
68
+ parsed_namae.given == parsed_namae.given.downcase) &&
69
69
  !parsed_namae.given.include?(".") &&
70
70
  parsed_namae.given.tr(".","").length >= 4
71
71
  parsed_namae.given = NameCase(parsed_namae.given)
@@ -119,8 +119,12 @@ module DwcAgent
119
119
  return blank_name
120
120
  end
121
121
 
122
+ if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
123
+ return blank_name
124
+ end
125
+
122
126
  { given: given, family: family, particle: particle }
123
127
  end
124
128
 
125
129
  end
126
- end
130
+ end
@@ -162,10 +162,6 @@ module DwcAgent
162
162
  ^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
163
163
  }x
164
164
 
165
- CONTRACTED_LIST = %r{
166
- ^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
167
- }x
168
-
169
165
  BLACKLIST = %r{
170
166
  (?i:abundant)|
171
167
  (?i:adult|juvenile)|
@@ -202,11 +198,13 @@ module DwcAgent
202
198
  (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
203
199
  (?i:not?\s+(entered|stated))|
204
200
  (?i:nomenclatur(e|al)\s+adjustment)|
201
+ (?i:not\s+available)|
205
202
  (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
206
203
  (?i:recreation|culture)|
207
204
  (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
208
205
  (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
209
206
  (?i:commercial|company|control|product)|
207
+ (?i:sequence\s+data)|
210
208
  (?i:size|large|colou?r)\s+|
211
209
  (?i:skeleton)|
212
210
  (?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|exchange|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
@@ -217,6 +215,7 @@ module DwcAgent
217
215
  \s*(?i:too)\s+|\s*(?i:the)\s+|
218
216
  (?i:taxiderm(ies|y))|
219
217
  (?i:though)|
218
+ (?i:texas\s+instruments?)\s*?(for)?|
220
219
  (?:tropical)|
221
220
  (?i:toward|seen at)|
222
221
  (?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
@@ -234,7 +233,18 @@ module DwcAgent
234
233
  "of",
235
234
  "curators",
236
235
  "nomenclatural",
237
- "adjustment"
236
+ "adjustment",
237
+ "available",
238
+ "data",
239
+ "orig",
240
+ "science",
241
+ "catalogue",
242
+ "entered"
243
+ ]
244
+
245
+ GIVEN_BLACKLIST = [
246
+ "not any",
247
+ "has not"
238
248
  ]
239
249
 
240
250
  TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
@@ -21,7 +21,6 @@ module DwcAgent
21
21
  @phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
22
22
  @complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
23
23
  @add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
24
- @contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
25
24
  end
26
25
 
27
26
  # Parses the passed-in string and returns a list of names.
@@ -35,7 +34,6 @@ module DwcAgent
35
34
  name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
36
35
  name.gsub!(@add_separators_regex, '\1 \2')
37
36
  name.gsub!(@complex_separators_regex, '\1 | \2')
38
- name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
39
37
  name.gsub!(@residual_terminators_regex, '')
40
38
  name.squeeze!(' ')
41
39
  name.strip!
@@ -2,8 +2,8 @@ module DwcAgent
2
2
  class Version
3
3
 
4
4
  MAJOR = 1
5
- MINOR = 1
6
- PATCH = 0
5
+ MINOR = 4
6
+ PATCH = 2
7
7
  BUILD = nil
8
8
 
9
9
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-11 00:00:00.000000000 Z
11
+ date: 2019-12-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae
@@ -119,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
119
  - !ruby/object:Gem::Version
120
120
  version: '0'
121
121
  requirements: []
122
- rubygems_version: 3.0.3
122
+ rubygems_version: 3.0.6
123
123
  signing_key:
124
124
  specification_version: 4
125
125
  summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy