dwca_hunter 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 60327e105c53c226f322e3a7272bdc5747d73fac0124887b024f99e3c39c985b
4
- data.tar.gz: '09660f8b5feccfaf4caeaec277db4dc4729a973196a77bb02860947ef55bd272'
3
+ metadata.gz: 26106d03a805f473871c3092a84b8d0498f991ff809a7e3f259f1e20d29bca80
4
+ data.tar.gz: 28ac068cf264870f67ffdc6697801c683ffec0e101c582ddbd50aa77b7831b8f
5
5
  SHA512:
6
- metadata.gz: 9b0a621f85535f421eef5a8550ce653c4f3483f563c7b19934a76e8f30b0cdf17e7a8c59945ea31455c57a350a14d345993f5fe6b91d656f5eb40317da6b1af9
7
- data.tar.gz: 00a54b23a8588e6d304d35bb8756f633fb409777f07908525d257f4b1a23c5956a9683863b491f50e8d772c8b41343f909e6cb252a326d4f2e662f96f37826ed
6
+ metadata.gz: 3a1e49e6db3aa0bb79616a512feb8c05438f3534d6c7bdd43a0ae32981e32f83ccd74bdcd682130bdf044dd893e0017eeddf533b170109338bb8d95f0379e2b7
7
+ data.tar.gz: 35bb769a822639d26380ad70db0b7aeed22483ce7184e62b7fad38c8d5c3b7ab5ea3b82e3dce00bb012bcc18e719077c2c3bb4a2b53830f7babd693c26aa2caf
data/.rubocop.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  AllCops:
2
2
  NewCops: disable
3
- TargetRubyVersion: 2.6.6
3
+ TargetRubyVersion: 3.0.0
4
4
  Exclude:
5
5
  - bin/**/*
6
6
  - db/**/*
@@ -28,6 +28,14 @@ Metrics/ClassLength:
28
28
  Metrics/MethodLength:
29
29
  Enabled: false
30
30
 
31
+ Metrics/AbcSize:
32
+ Enabled: false
33
+
34
+ Metrics/PerceivedComplexity:
35
+ Enabled: false
36
+
37
+
38
+
31
39
  Naming/FileName:
32
40
  Exclude:
33
41
  - Gemfile
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.6
1
+ 3.0.0
data/Gemfile.lock CHANGED
@@ -1,23 +1,26 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- dwca_hunter (0.7.0)
5
- biodiversity (~> 4)
6
- dwc-archive (~> 1.1.1)
4
+ dwca_hunter (0.7.2)
5
+ biodiversity (~> 5.1.2)
6
+ dwc-archive (~> 1.1.3)
7
7
  gn_uuid (~> 0.5)
8
8
  htmlentities (~> 4.3)
9
9
  nokogiri (~> 1.11)
10
- rest-client (~> 2.0)
10
+ rest-client (~> 2.1)
11
11
  ruby-xz (~> 1.0)
12
- thor (~> 0.19)
12
+ rubyzip (~> 2.3)
13
+ thor (~> 1.1)
13
14
 
14
15
  GEM
15
16
  remote: http://rubygems.org/
16
17
  specs:
17
- ast (2.4.1)
18
- biodiversity (4.1.0)
19
- ffi (~> 1.11)
20
- byebug (10.0.2)
18
+ ast (2.4.2)
19
+ backport (1.1.2)
20
+ benchmark (0.1.1)
21
+ biodiversity (5.1.2)
22
+ ffi (~> 1.14)
23
+ byebug (11.1.3)
21
24
  coveralls (0.8.23)
22
25
  json (>= 1.8, < 3)
23
26
  simplecov (~> 0.16.1)
@@ -25,19 +28,25 @@ GEM
25
28
  thor (>= 0.19.4, < 2.0)
26
29
  tins (~> 1.6)
27
30
  diff-lcs (1.4.4)
28
- docile (1.3.4)
31
+ docile (1.3.5)
29
32
  domain_name (0.5.20190701)
30
33
  unf (>= 0.0.5, < 1.0.0)
31
- dwc-archive (1.1.1)
32
- biodiversity (~> 4)
33
- nokogiri (~> 1.10)
34
+ dwc-archive (1.1.3)
35
+ biodiversity (~> 5.1.2)
36
+ nokogiri (~> 1.11)
37
+ e2mmap (0.1.0)
34
38
  ffi (1.14.2)
35
39
  gn_uuid (0.5.1)
36
40
  htmlentities (4.3.4)
37
41
  http-accept (1.7.0)
38
42
  http-cookie (1.0.3)
39
43
  domain_name (~> 0.5)
44
+ jaro_winkler (1.5.4)
40
45
  json (2.5.1)
46
+ kramdown (2.3.0)
47
+ rexml
48
+ kramdown-parser-gfm (1.1.0)
49
+ kramdown (~> 2.0)
41
50
  mime-types (3.3.1)
42
51
  mime-types-data (~> 3.2015)
43
52
  mime-types-data (3.2020.1104)
@@ -58,6 +67,8 @@ GEM
58
67
  http-cookie (>= 1.0.2, < 2.0)
59
68
  mime-types (>= 1.16, < 4.0)
60
69
  netrc (~> 0.8)
70
+ reverse_markdown (2.0.0)
71
+ nokogiri
61
72
  rexml (3.2.4)
62
73
  rspec (3.10.0)
63
74
  rspec-core (~> 3.10.0)
@@ -72,46 +83,64 @@ GEM
72
83
  diff-lcs (>= 1.2.0, < 2.0)
73
84
  rspec-support (~> 3.10.0)
74
85
  rspec-support (3.10.1)
75
- rubocop (0.93.1)
86
+ rubocop (1.9.0)
76
87
  parallel (~> 1.10)
77
- parser (>= 2.7.1.5)
88
+ parser (>= 3.0.0.0)
78
89
  rainbow (>= 2.2.2, < 4.0)
79
- regexp_parser (>= 1.8)
90
+ regexp_parser (>= 1.8, < 3.0)
80
91
  rexml
81
- rubocop-ast (>= 0.6.0)
92
+ rubocop-ast (>= 1.2.0, < 2.0)
82
93
  ruby-progressbar (~> 1.7)
83
- unicode-display_width (>= 1.4.0, < 2.0)
84
- rubocop-ast (1.4.0)
94
+ unicode-display_width (>= 1.4.0, < 3.0)
95
+ rubocop-ast (1.4.1)
85
96
  parser (>= 2.7.1.5)
86
97
  ruby-progressbar (1.11.0)
87
98
  ruby-xz (1.0.0)
99
+ rubyzip (2.3.0)
88
100
  simplecov (0.16.1)
89
101
  docile (~> 1.1)
90
102
  json (>= 1.8, < 3)
91
103
  simplecov-html (~> 0.10.0)
92
104
  simplecov-html (0.10.2)
105
+ solargraph (0.40.2)
106
+ backport (~> 1.1)
107
+ benchmark
108
+ bundler (>= 1.17.2)
109
+ e2mmap
110
+ jaro_winkler (~> 1.5)
111
+ kramdown (~> 2.3)
112
+ kramdown-parser-gfm (~> 1.1)
113
+ parser (~> 3.0)
114
+ reverse_markdown (>= 1.0.5, < 3)
115
+ rubocop (>= 0.52)
116
+ thor (~> 1.0)
117
+ tilt (~> 2.0)
118
+ yard (~> 0.9, >= 0.9.24)
93
119
  sync (0.5.0)
94
120
  term-ansicolor (1.7.1)
95
121
  tins (~> 1.0)
96
- thor (0.20.3)
97
- tins (1.26.0)
122
+ thor (1.1.0)
123
+ tilt (2.0.10)
124
+ tins (1.28.0)
98
125
  sync
99
126
  unf (0.1.4)
100
127
  unf_ext
101
128
  unf_ext (0.0.7.7)
102
- unicode-display_width (1.7.0)
129
+ unicode-display_width (2.0.0)
130
+ yard (0.9.26)
103
131
 
104
132
  PLATFORMS
105
133
  ruby
106
134
 
107
135
  DEPENDENCIES
108
- bundler (~> 2.0)
109
- byebug (~> 10.0)
136
+ bundler (~> 2.2)
137
+ byebug (~> 11.1)
110
138
  coveralls (~> 0.8)
111
139
  dwca_hunter!
112
140
  rake (~> 13.0)
113
- rspec (~> 3.9)
114
- rubocop (~> 0.84)
141
+ rspec (~> 3.10)
142
+ rubocop (~> 1.9)
143
+ solargraph (~> 0.40)
115
144
 
116
145
  BUNDLED WITH
117
- 2.1.4
146
+ 2.2.7
data/dwca_hunter.gemspec CHANGED
@@ -6,7 +6,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
6
6
  require "dwca_hunter/version"
7
7
 
8
8
  Gem::Specification.new do |gem|
9
- gem.required_ruby_version = ">= 2.6.6"
9
+ gem.required_ruby_version = ">= 3.0.0"
10
10
  gem.name = "dwca_hunter"
11
11
  gem.version = DwcaHunter.version
12
12
  gem.license = "MIT"
@@ -26,19 +26,21 @@ Gem::Specification.new do |gem|
26
26
  gem.executables = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
27
27
  gem.require_paths = ["lib"]
28
28
 
29
- gem.add_dependency "biodiversity", "~> 4"
30
- gem.add_dependency "dwc-archive", "~> 1.1.1"
29
+ gem.add_dependency "biodiversity", "~> 5.1.2"
30
+ gem.add_dependency "dwc-archive", "~> 1.1.3"
31
31
  gem.add_dependency "gn_uuid", "~> 0.5"
32
32
  gem.add_dependency "htmlentities", "~> 4.3"
33
33
  gem.add_dependency "nokogiri", "~> 1.11"
34
- gem.add_dependency "rest-client", "~> 2.0"
34
+ gem.add_dependency "rest-client", "~> 2.1"
35
35
  gem.add_dependency "ruby-xz", "~> 1.0"
36
- gem.add_dependency "thor", "~> 0.19"
36
+ gem.add_dependency "rubyzip", "~> 2.3"
37
+ gem.add_dependency "thor", "~> 1.1"
37
38
 
38
- gem.add_development_dependency "bundler", "~> 2.0"
39
- gem.add_development_dependency "byebug", "~> 10.0"
39
+ gem.add_development_dependency "bundler", "~> 2.2"
40
+ gem.add_development_dependency "byebug", "~> 11.1"
40
41
  gem.add_development_dependency "coveralls", "~> 0.8"
41
42
  gem.add_development_dependency "rake", "~> 13.0"
42
- gem.add_development_dependency "rspec", "~> 3.9"
43
- gem.add_development_dependency "rubocop", "~> 0.84"
43
+ gem.add_development_dependency "rspec", "~> 3.10"
44
+ gem.add_development_dependency "rubocop", "~> 1.9"
45
+ gem.add_development_dependency "solargraph", "~> 0.40"
44
46
  end
data/exe/dwcahunter CHANGED
@@ -51,7 +51,6 @@ class DwcaHunterCLI < Thor
51
51
  resource.abbr =~ /#{search}/i)
52
52
  end
53
53
 
54
- # rubocop:disable Metrics/AbcSize
55
54
  def print_table(data)
56
55
  table = [data.first.keys.map { |k| k.to_s.capitalize }] + data.map(&:values)
57
56
  widths = table_widths(table)
@@ -59,7 +58,6 @@ class DwcaHunterCLI < Thor
59
58
  format = widths.collect { |n| "%-#{n}s" }.join(" ")
60
59
  table.each { |line| printf " #{format} \n", *line }
61
60
  end
62
- # rubocop:enable Metrics/AbcSize
63
61
 
64
62
  def table_widths(table)
65
63
  table.each_with_object([]) do |line, widths|
data/lib/dwca_hunter.rb CHANGED
@@ -7,11 +7,13 @@ require "dwca_hunter/resource"
7
7
  require "fileutils"
8
8
  require "htmlentities"
9
9
  require "json"
10
+ require "zip"
10
11
  require "logger"
11
12
  require "net/http"
12
13
  require "rest_client"
13
14
  require "tmpdir"
14
15
  require "uri"
16
+ require "cgi"
15
17
 
16
18
  Dir[File.join(__dir__, "dwca_hunter", "*.rb")].
17
19
  each { |f| require f }
@@ -53,13 +55,14 @@ module DwcaHunter
53
55
  end
54
56
 
55
57
  def normalize_authors(auth)
56
- reg = Regexp.new(/^([\(]?)(.*?)(([\s,\)][^[:upper:]]*)?$)/)
57
- auth = auth.gsub(/duPont/, 'du Pont')
58
+ reg = Regexp.new(/^(\(?)(.*?)(([\s,)][^[:upper:]]*)?$)/)
59
+ auth = auth.gsub(/duPont/, "du Pont")
58
60
  match = reg.match(auth)
59
61
  return auth if match.nil?
62
+
60
63
  a1, a2, a3 = match[1..3]
61
- a2mod = a2.gsub('&', ',')
62
- ary2 = a2mod.split(',').map(&:strip)
64
+ a2mod = a2.gsub("&", ",")
65
+ ary2 = a2mod.split(",").map(&:strip)
63
66
  a2 = move_initials(ary2) if ary2.size > 1
64
67
  "#{a1}#{a2}#{a3}"
65
68
  end
@@ -73,8 +76,8 @@ module DwcaHunter
73
76
  end
74
77
  match = /^([[:upper:]]{1,4})(\sJr)?$/.match(a)
75
78
  if !match.nil?
76
- initialls = match[1].split('').join('. ')
77
- res[-1] = "#{initialls}. #{res[-1]}#{match[2].to_s}"
79
+ initialls = match[1].split("").join(". ")
80
+ res[-1] = "#{initialls}. #{res[-1]}#{match[2]}"
78
81
  else
79
82
  res << a
80
83
  end
@@ -83,4 +86,3 @@ module DwcaHunter
83
86
  end
84
87
  end
85
88
  end
86
-
@@ -4,7 +4,12 @@ module DwcaHunter
4
4
 
5
5
  def self.unzip(file, dir = nil)
6
6
  Dir.chdir(dir) if dir
7
- `unzip -qq -u #{file} > /dev/null 2>&1`
7
+ Zip::File.open(file) do |zip_file|
8
+ zip_file.each do |entry|
9
+ puts "Extracting #{entry.name}"
10
+ entry.extract
11
+ end
12
+ end
8
13
  end
9
14
 
10
15
  def self.gunzip(file, dir = nil)
@@ -13,8 +18,8 @@ module DwcaHunter
13
18
  end
14
19
 
15
20
  def initialize(opts)
16
- @needs_download = !(opts[:download] == false)
17
- @needs_unpack = !(opts[:unpack] == false)
21
+ @needs_download = (opts[:download] != false)
22
+ @needs_unpack = (opts[:unpack] != false)
18
23
  @download_dir, @download_file = File.split(@download_path)
19
24
  prepare_path if needs_download?
20
25
  end
@@ -5,12 +5,12 @@ module DwcaHunter
5
5
  def initialize(opts = {})
6
6
  @command = "arctos"
7
7
  @title = "Arctos"
8
- @url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
8
+ @url = "http://arctos.database.museum/cache/gn_merge.tgz"
9
9
  @UUID = "eea8315d-a244-4625-859a-226675622312"
10
10
  @download_path = File.join(Dir.tmpdir,
11
11
  "dwca_hunter",
12
12
  "arctos",
13
- "data.zip")
13
+ "data.tar.gz")
14
14
  @synonyms = []
15
15
  @names = []
16
16
  @vernaculars = []
@@ -21,8 +21,8 @@ module DwcaHunter
21
21
  end
22
22
 
23
23
  def download
24
- puts "Downloading cached verion of the file. Ask Arctos to generate new."
25
- `curl -s -L #{@url} -o #{@download_path}`
24
+ puts "Downloading Arctos file."
25
+ `curl -s #{@url} -o #{@download_path}`
26
26
  end
27
27
 
28
28
  def unpack
@@ -45,11 +45,11 @@ module DwcaHunter
45
45
  end
46
46
 
47
47
  def collect_vernaculars
48
- file = CSV.open(File.join(@download_dir, "common_name.csv"),
48
+ file = CSV.open(File.join(@download_dir, "globalnames_commonname.csv"),
49
49
  headers: true)
50
50
  file.each_with_index do |row, i|
51
- canonical = row["SCIENTIFIC_NAME"]
52
- vernacular_name_string = row["COMMON_NAME"]
51
+ canonical = row["scientific_name"]
52
+ vernacular_name_string = row["common_name"]
53
53
 
54
54
  if @vernaculars_hash.key?(canonical)
55
55
  @vernaculars_hash[canonical] << vernacular_name_string
@@ -57,66 +57,63 @@ module DwcaHunter
57
57
  @vernaculars_hash[canonical] = [vernacular_name_string]
58
58
  end
59
59
 
60
- puts "Processed %s vernaculars" % i if i % 10_000 == 0
60
+ puts "Processed #{i} vernaculars"if (i % 100_000).zero?
61
61
  end
62
62
  end
63
63
 
64
64
  def collect_synonyms
65
- file = CSV.open(File.join(@download_dir, "relationships.csv"),
65
+ file = CSV.open(File.join(@download_dir, "globalnames_relationships.csv"),
66
66
  headers: true)
67
67
  file.each_with_index do |row, i|
68
68
  canonical = row["scientific_name"]
69
69
  if @synonyms_hash.key?(canonical)
70
70
  @synonyms_hash[canonical] <<
71
- { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
71
+ { name_string: row["related_name"], status: row["taxon_relationship"] }
72
72
  else
73
73
  @synonyms_hash[canonical] = [
74
- { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
74
+ { name_string: row["related_name"], status: row["taxon_relationship"] }
75
75
  ]
76
76
  end
77
- puts "Processed %s synonyms" % i if i % 10_000 == 0
77
+ puts "Processed #{i} synonyms" if (i % 100_000).zero?
78
78
  end
79
79
  end
80
80
 
81
81
  def collect_names
82
82
  @names_index = {}
83
- file = CSV.open(File.join(@download_dir, "classification.csv"),
83
+ file = CSV.open(File.join(@download_dir, "globalnames_classification.csv"),
84
84
  headers: true)
85
- file.each_with_index do |row, i|
86
- next unless row["display_name"]
87
-
88
- name_string = row["display_name"].gsub(%r{</?i>}, "")
89
- canonical = row["scientific_name"]
90
- kingdom = row["kingdom"]
91
- phylum = row["phylum"]
92
- klass = row["phylclass"]
93
- subclass = row["subclass"]
94
- order = row["phylorder"]
95
- suborder = row["suborder"]
96
- superfamily = row["superfamily"]
97
- family = row["family"]
98
- subfamily = row["subfamily"]
99
- tribe = row["tribe"]
100
- genus = row["genus"]
101
- subgenus = row["subgenus"]
102
- species = row["species"]
103
- subspecies = row["subspecies"]
104
- code = row["nomenclatural_code"]
105
-
106
- taxon_id = "ARCT_#{i + 1}"
107
- @names << { taxon_id: taxon_id,
108
- name_string: name_string,
109
- kingdom: kingdom,
110
- phylum: phylum,
111
- klass: klass,
112
- order: order,
113
- family: family,
114
- genus: genus,
115
- code: code }
116
85
 
86
+ names = {}
87
+ file.each_with_index do |row, i|
88
+ next if row["term_type"].nil?
89
+ name = row["scientific_name"]
90
+ if names.key?(name)
91
+ names[name] = names[name].
92
+ merge({row["term_type"].to_sym => row["term"]})
93
+ else
94
+ names[name] = {row["term_type"].to_sym => row["term"]}
95
+ end
96
+ puts "Preprocessed #{i} rows" if (i % 100_000).zero?
97
+ end
98
+ names.each_with_index do |m, i|
99
+ canonical = m[0]
100
+ v = m[1]
101
+ taxon_id = "gn_#{i + 1}"
102
+ res ={ taxon_id: taxon_id,
103
+ name_string: canonical,
104
+ kingdom: v[:kingdom],
105
+ phylum: v[:phylum],
106
+ klass: v[:class],
107
+ order: v[:order],
108
+ family: v[:family],
109
+ genus: v[:genus],
110
+ species: v[:species],
111
+ authors: v[:author_text],
112
+ code: v[:nomenclatural_code] }
113
+ @names << res
117
114
  update_vernacular(taxon_id, canonical)
118
115
  update_synonym(taxon_id, canonical)
119
- puts "Processed %s names" % i if i % 10_000 == 0
116
+ puts "Processed #{i} names" if (i % 100_000).zero?
120
117
  end
121
118
  end
122
119