dwca_hunter 0.7.1 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 60327e105c53c226f322e3a7272bdc5747d73fac0124887b024f99e3c39c985b
4
- data.tar.gz: '09660f8b5feccfaf4caeaec277db4dc4729a973196a77bb02860947ef55bd272'
3
+ metadata.gz: 26106d03a805f473871c3092a84b8d0498f991ff809a7e3f259f1e20d29bca80
4
+ data.tar.gz: 28ac068cf264870f67ffdc6697801c683ffec0e101c582ddbd50aa77b7831b8f
5
5
  SHA512:
6
- metadata.gz: 9b0a621f85535f421eef5a8550ce653c4f3483f563c7b19934a76e8f30b0cdf17e7a8c59945ea31455c57a350a14d345993f5fe6b91d656f5eb40317da6b1af9
7
- data.tar.gz: 00a54b23a8588e6d304d35bb8756f633fb409777f07908525d257f4b1a23c5956a9683863b491f50e8d772c8b41343f909e6cb252a326d4f2e662f96f37826ed
6
+ metadata.gz: 3a1e49e6db3aa0bb79616a512feb8c05438f3534d6c7bdd43a0ae32981e32f83ccd74bdcd682130bdf044dd893e0017eeddf533b170109338bb8d95f0379e2b7
7
+ data.tar.gz: 35bb769a822639d26380ad70db0b7aeed22483ce7184e62b7fad38c8d5c3b7ab5ea3b82e3dce00bb012bcc18e719077c2c3bb4a2b53830f7babd693c26aa2caf
data/.rubocop.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  AllCops:
2
2
  NewCops: disable
3
- TargetRubyVersion: 2.6.6
3
+ TargetRubyVersion: 3.0.0
4
4
  Exclude:
5
5
  - bin/**/*
6
6
  - db/**/*
@@ -28,6 +28,14 @@ Metrics/ClassLength:
28
28
  Metrics/MethodLength:
29
29
  Enabled: false
30
30
 
31
+ Metrics/AbcSize:
32
+ Enabled: false
33
+
34
+ Metrics/PerceivedComplexity:
35
+ Enabled: false
36
+
37
+
38
+
31
39
  Naming/FileName:
32
40
  Exclude:
33
41
  - Gemfile
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.6
1
+ 3.0.0
data/Gemfile.lock CHANGED
@@ -1,23 +1,26 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- dwca_hunter (0.7.0)
5
- biodiversity (~> 4)
6
- dwc-archive (~> 1.1.1)
4
+ dwca_hunter (0.7.2)
5
+ biodiversity (~> 5.1.2)
6
+ dwc-archive (~> 1.1.3)
7
7
  gn_uuid (~> 0.5)
8
8
  htmlentities (~> 4.3)
9
9
  nokogiri (~> 1.11)
10
- rest-client (~> 2.0)
10
+ rest-client (~> 2.1)
11
11
  ruby-xz (~> 1.0)
12
- thor (~> 0.19)
12
+ rubyzip (~> 2.3)
13
+ thor (~> 1.1)
13
14
 
14
15
  GEM
15
16
  remote: http://rubygems.org/
16
17
  specs:
17
- ast (2.4.1)
18
- biodiversity (4.1.0)
19
- ffi (~> 1.11)
20
- byebug (10.0.2)
18
+ ast (2.4.2)
19
+ backport (1.1.2)
20
+ benchmark (0.1.1)
21
+ biodiversity (5.1.2)
22
+ ffi (~> 1.14)
23
+ byebug (11.1.3)
21
24
  coveralls (0.8.23)
22
25
  json (>= 1.8, < 3)
23
26
  simplecov (~> 0.16.1)
@@ -25,19 +28,25 @@ GEM
25
28
  thor (>= 0.19.4, < 2.0)
26
29
  tins (~> 1.6)
27
30
  diff-lcs (1.4.4)
28
- docile (1.3.4)
31
+ docile (1.3.5)
29
32
  domain_name (0.5.20190701)
30
33
  unf (>= 0.0.5, < 1.0.0)
31
- dwc-archive (1.1.1)
32
- biodiversity (~> 4)
33
- nokogiri (~> 1.10)
34
+ dwc-archive (1.1.3)
35
+ biodiversity (~> 5.1.2)
36
+ nokogiri (~> 1.11)
37
+ e2mmap (0.1.0)
34
38
  ffi (1.14.2)
35
39
  gn_uuid (0.5.1)
36
40
  htmlentities (4.3.4)
37
41
  http-accept (1.7.0)
38
42
  http-cookie (1.0.3)
39
43
  domain_name (~> 0.5)
44
+ jaro_winkler (1.5.4)
40
45
  json (2.5.1)
46
+ kramdown (2.3.0)
47
+ rexml
48
+ kramdown-parser-gfm (1.1.0)
49
+ kramdown (~> 2.0)
41
50
  mime-types (3.3.1)
42
51
  mime-types-data (~> 3.2015)
43
52
  mime-types-data (3.2020.1104)
@@ -58,6 +67,8 @@ GEM
58
67
  http-cookie (>= 1.0.2, < 2.0)
59
68
  mime-types (>= 1.16, < 4.0)
60
69
  netrc (~> 0.8)
70
+ reverse_markdown (2.0.0)
71
+ nokogiri
61
72
  rexml (3.2.4)
62
73
  rspec (3.10.0)
63
74
  rspec-core (~> 3.10.0)
@@ -72,46 +83,64 @@ GEM
72
83
  diff-lcs (>= 1.2.0, < 2.0)
73
84
  rspec-support (~> 3.10.0)
74
85
  rspec-support (3.10.1)
75
- rubocop (0.93.1)
86
+ rubocop (1.9.0)
76
87
  parallel (~> 1.10)
77
- parser (>= 2.7.1.5)
88
+ parser (>= 3.0.0.0)
78
89
  rainbow (>= 2.2.2, < 4.0)
79
- regexp_parser (>= 1.8)
90
+ regexp_parser (>= 1.8, < 3.0)
80
91
  rexml
81
- rubocop-ast (>= 0.6.0)
92
+ rubocop-ast (>= 1.2.0, < 2.0)
82
93
  ruby-progressbar (~> 1.7)
83
- unicode-display_width (>= 1.4.0, < 2.0)
84
- rubocop-ast (1.4.0)
94
+ unicode-display_width (>= 1.4.0, < 3.0)
95
+ rubocop-ast (1.4.1)
85
96
  parser (>= 2.7.1.5)
86
97
  ruby-progressbar (1.11.0)
87
98
  ruby-xz (1.0.0)
99
+ rubyzip (2.3.0)
88
100
  simplecov (0.16.1)
89
101
  docile (~> 1.1)
90
102
  json (>= 1.8, < 3)
91
103
  simplecov-html (~> 0.10.0)
92
104
  simplecov-html (0.10.2)
105
+ solargraph (0.40.2)
106
+ backport (~> 1.1)
107
+ benchmark
108
+ bundler (>= 1.17.2)
109
+ e2mmap
110
+ jaro_winkler (~> 1.5)
111
+ kramdown (~> 2.3)
112
+ kramdown-parser-gfm (~> 1.1)
113
+ parser (~> 3.0)
114
+ reverse_markdown (>= 1.0.5, < 3)
115
+ rubocop (>= 0.52)
116
+ thor (~> 1.0)
117
+ tilt (~> 2.0)
118
+ yard (~> 0.9, >= 0.9.24)
93
119
  sync (0.5.0)
94
120
  term-ansicolor (1.7.1)
95
121
  tins (~> 1.0)
96
- thor (0.20.3)
97
- tins (1.26.0)
122
+ thor (1.1.0)
123
+ tilt (2.0.10)
124
+ tins (1.28.0)
98
125
  sync
99
126
  unf (0.1.4)
100
127
  unf_ext
101
128
  unf_ext (0.0.7.7)
102
- unicode-display_width (1.7.0)
129
+ unicode-display_width (2.0.0)
130
+ yard (0.9.26)
103
131
 
104
132
  PLATFORMS
105
133
  ruby
106
134
 
107
135
  DEPENDENCIES
108
- bundler (~> 2.0)
109
- byebug (~> 10.0)
136
+ bundler (~> 2.2)
137
+ byebug (~> 11.1)
110
138
  coveralls (~> 0.8)
111
139
  dwca_hunter!
112
140
  rake (~> 13.0)
113
- rspec (~> 3.9)
114
- rubocop (~> 0.84)
141
+ rspec (~> 3.10)
142
+ rubocop (~> 1.9)
143
+ solargraph (~> 0.40)
115
144
 
116
145
  BUNDLED WITH
117
- 2.1.4
146
+ 2.2.7
data/dwca_hunter.gemspec CHANGED
@@ -6,7 +6,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
6
6
  require "dwca_hunter/version"
7
7
 
8
8
  Gem::Specification.new do |gem|
9
- gem.required_ruby_version = ">= 2.6.6"
9
+ gem.required_ruby_version = ">= 3.0.0"
10
10
  gem.name = "dwca_hunter"
11
11
  gem.version = DwcaHunter.version
12
12
  gem.license = "MIT"
@@ -26,19 +26,21 @@ Gem::Specification.new do |gem|
26
26
  gem.executables = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
27
27
  gem.require_paths = ["lib"]
28
28
 
29
- gem.add_dependency "biodiversity", "~> 4"
30
- gem.add_dependency "dwc-archive", "~> 1.1.1"
29
+ gem.add_dependency "biodiversity", "~> 5.1.2"
30
+ gem.add_dependency "dwc-archive", "~> 1.1.3"
31
31
  gem.add_dependency "gn_uuid", "~> 0.5"
32
32
  gem.add_dependency "htmlentities", "~> 4.3"
33
33
  gem.add_dependency "nokogiri", "~> 1.11"
34
- gem.add_dependency "rest-client", "~> 2.0"
34
+ gem.add_dependency "rest-client", "~> 2.1"
35
35
  gem.add_dependency "ruby-xz", "~> 1.0"
36
- gem.add_dependency "thor", "~> 0.19"
36
+ gem.add_dependency "rubyzip", "~> 2.3"
37
+ gem.add_dependency "thor", "~> 1.1"
37
38
 
38
- gem.add_development_dependency "bundler", "~> 2.0"
39
- gem.add_development_dependency "byebug", "~> 10.0"
39
+ gem.add_development_dependency "bundler", "~> 2.2"
40
+ gem.add_development_dependency "byebug", "~> 11.1"
40
41
  gem.add_development_dependency "coveralls", "~> 0.8"
41
42
  gem.add_development_dependency "rake", "~> 13.0"
42
- gem.add_development_dependency "rspec", "~> 3.9"
43
- gem.add_development_dependency "rubocop", "~> 0.84"
43
+ gem.add_development_dependency "rspec", "~> 3.10"
44
+ gem.add_development_dependency "rubocop", "~> 1.9"
45
+ gem.add_development_dependency "solargraph", "~> 0.40"
44
46
  end
data/exe/dwcahunter CHANGED
@@ -51,7 +51,6 @@ class DwcaHunterCLI < Thor
51
51
  resource.abbr =~ /#{search}/i)
52
52
  end
53
53
 
54
- # rubocop:disable Metrics/AbcSize
55
54
  def print_table(data)
56
55
  table = [data.first.keys.map { |k| k.to_s.capitalize }] + data.map(&:values)
57
56
  widths = table_widths(table)
@@ -59,7 +58,6 @@ class DwcaHunterCLI < Thor
59
58
  format = widths.collect { |n| "%-#{n}s" }.join(" ")
60
59
  table.each { |line| printf " #{format} \n", *line }
61
60
  end
62
- # rubocop:enable Metrics/AbcSize
63
61
 
64
62
  def table_widths(table)
65
63
  table.each_with_object([]) do |line, widths|
data/lib/dwca_hunter.rb CHANGED
@@ -7,11 +7,13 @@ require "dwca_hunter/resource"
7
7
  require "fileutils"
8
8
  require "htmlentities"
9
9
  require "json"
10
+ require "zip"
10
11
  require "logger"
11
12
  require "net/http"
12
13
  require "rest_client"
13
14
  require "tmpdir"
14
15
  require "uri"
16
+ require "cgi"
15
17
 
16
18
  Dir[File.join(__dir__, "dwca_hunter", "*.rb")].
17
19
  each { |f| require f }
@@ -53,13 +55,14 @@ module DwcaHunter
53
55
  end
54
56
 
55
57
  def normalize_authors(auth)
56
- reg = Regexp.new(/^([\(]?)(.*?)(([\s,\)][^[:upper:]]*)?$)/)
57
- auth = auth.gsub(/duPont/, 'du Pont')
58
+ reg = Regexp.new(/^(\(?)(.*?)(([\s,)][^[:upper:]]*)?$)/)
59
+ auth = auth.gsub(/duPont/, "du Pont")
58
60
  match = reg.match(auth)
59
61
  return auth if match.nil?
62
+
60
63
  a1, a2, a3 = match[1..3]
61
- a2mod = a2.gsub('&', ',')
62
- ary2 = a2mod.split(',').map(&:strip)
64
+ a2mod = a2.gsub("&", ",")
65
+ ary2 = a2mod.split(",").map(&:strip)
63
66
  a2 = move_initials(ary2) if ary2.size > 1
64
67
  "#{a1}#{a2}#{a3}"
65
68
  end
@@ -73,8 +76,8 @@ module DwcaHunter
73
76
  end
74
77
  match = /^([[:upper:]]{1,4})(\sJr)?$/.match(a)
75
78
  if !match.nil?
76
- initialls = match[1].split('').join('. ')
77
- res[-1] = "#{initialls}. #{res[-1]}#{match[2].to_s}"
79
+ initialls = match[1].split("").join(". ")
80
+ res[-1] = "#{initialls}. #{res[-1]}#{match[2]}"
78
81
  else
79
82
  res << a
80
83
  end
@@ -83,4 +86,3 @@ module DwcaHunter
83
86
  end
84
87
  end
85
88
  end
86
-
@@ -4,7 +4,12 @@ module DwcaHunter
4
4
 
5
5
  def self.unzip(file, dir = nil)
6
6
  Dir.chdir(dir) if dir
7
- `unzip -qq -u #{file} > /dev/null 2>&1`
7
+ Zip::File.open(file) do |zip_file|
8
+ zip_file.each do |entry|
9
+ puts "Extracting #{entry.name}"
10
+ entry.extract
11
+ end
12
+ end
8
13
  end
9
14
 
10
15
  def self.gunzip(file, dir = nil)
@@ -13,8 +18,8 @@ module DwcaHunter
13
18
  end
14
19
 
15
20
  def initialize(opts)
16
- @needs_download = !(opts[:download] == false)
17
- @needs_unpack = !(opts[:unpack] == false)
21
+ @needs_download = (opts[:download] != false)
22
+ @needs_unpack = (opts[:unpack] != false)
18
23
  @download_dir, @download_file = File.split(@download_path)
19
24
  prepare_path if needs_download?
20
25
  end
@@ -5,12 +5,12 @@ module DwcaHunter
5
5
  def initialize(opts = {})
6
6
  @command = "arctos"
7
7
  @title = "Arctos"
8
- @url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
8
+ @url = "http://arctos.database.museum/cache/gn_merge.tgz"
9
9
  @UUID = "eea8315d-a244-4625-859a-226675622312"
10
10
  @download_path = File.join(Dir.tmpdir,
11
11
  "dwca_hunter",
12
12
  "arctos",
13
- "data.zip")
13
+ "data.tar.gz")
14
14
  @synonyms = []
15
15
  @names = []
16
16
  @vernaculars = []
@@ -21,8 +21,8 @@ module DwcaHunter
21
21
  end
22
22
 
23
23
  def download
24
- puts "Downloading cached verion of the file. Ask Arctos to generate new."
25
- `curl -s -L #{@url} -o #{@download_path}`
24
+ puts "Downloading Arctos file."
25
+ `curl -s #{@url} -o #{@download_path}`
26
26
  end
27
27
 
28
28
  def unpack
@@ -45,11 +45,11 @@ module DwcaHunter
45
45
  end
46
46
 
47
47
  def collect_vernaculars
48
- file = CSV.open(File.join(@download_dir, "common_name.csv"),
48
+ file = CSV.open(File.join(@download_dir, "globalnames_commonname.csv"),
49
49
  headers: true)
50
50
  file.each_with_index do |row, i|
51
- canonical = row["SCIENTIFIC_NAME"]
52
- vernacular_name_string = row["COMMON_NAME"]
51
+ canonical = row["scientific_name"]
52
+ vernacular_name_string = row["common_name"]
53
53
 
54
54
  if @vernaculars_hash.key?(canonical)
55
55
  @vernaculars_hash[canonical] << vernacular_name_string
@@ -57,66 +57,63 @@ module DwcaHunter
57
57
  @vernaculars_hash[canonical] = [vernacular_name_string]
58
58
  end
59
59
 
60
- puts "Processed %s vernaculars" % i if i % 10_000 == 0
60
+ puts "Processed #{i} vernaculars"if (i % 100_000).zero?
61
61
  end
62
62
  end
63
63
 
64
64
  def collect_synonyms
65
- file = CSV.open(File.join(@download_dir, "relationships.csv"),
65
+ file = CSV.open(File.join(@download_dir, "globalnames_relationships.csv"),
66
66
  headers: true)
67
67
  file.each_with_index do |row, i|
68
68
  canonical = row["scientific_name"]
69
69
  if @synonyms_hash.key?(canonical)
70
70
  @synonyms_hash[canonical] <<
71
- { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
71
+ { name_string: row["related_name"], status: row["taxon_relationship"] }
72
72
  else
73
73
  @synonyms_hash[canonical] = [
74
- { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
74
+ { name_string: row["related_name"], status: row["taxon_relationship"] }
75
75
  ]
76
76
  end
77
- puts "Processed %s synonyms" % i if i % 10_000 == 0
77
+ puts "Processed #{i} synonyms" if (i % 100_000).zero?
78
78
  end
79
79
  end
80
80
 
81
81
  def collect_names
82
82
  @names_index = {}
83
- file = CSV.open(File.join(@download_dir, "classification.csv"),
83
+ file = CSV.open(File.join(@download_dir, "globalnames_classification.csv"),
84
84
  headers: true)
85
- file.each_with_index do |row, i|
86
- next unless row["display_name"]
87
-
88
- name_string = row["display_name"].gsub(%r{</?i>}, "")
89
- canonical = row["scientific_name"]
90
- kingdom = row["kingdom"]
91
- phylum = row["phylum"]
92
- klass = row["phylclass"]
93
- subclass = row["subclass"]
94
- order = row["phylorder"]
95
- suborder = row["suborder"]
96
- superfamily = row["superfamily"]
97
- family = row["family"]
98
- subfamily = row["subfamily"]
99
- tribe = row["tribe"]
100
- genus = row["genus"]
101
- subgenus = row["subgenus"]
102
- species = row["species"]
103
- subspecies = row["subspecies"]
104
- code = row["nomenclatural_code"]
105
-
106
- taxon_id = "ARCT_#{i + 1}"
107
- @names << { taxon_id: taxon_id,
108
- name_string: name_string,
109
- kingdom: kingdom,
110
- phylum: phylum,
111
- klass: klass,
112
- order: order,
113
- family: family,
114
- genus: genus,
115
- code: code }
116
85
 
86
+ names = {}
87
+ file.each_with_index do |row, i|
88
+ next if row["term_type"].nil?
89
+ name = row["scientific_name"]
90
+ if names.key?(name)
91
+ names[name] = names[name].
92
+ merge({row["term_type"].to_sym => row["term"]})
93
+ else
94
+ names[name] = {row["term_type"].to_sym => row["term"]}
95
+ end
96
+ puts "Preprocessed #{i} rows" if (i % 100_000).zero?
97
+ end
98
+ names.each_with_index do |m, i|
99
+ canonical = m[0]
100
+ v = m[1]
101
+ taxon_id = "gn_#{i + 1}"
102
+ res ={ taxon_id: taxon_id,
103
+ name_string: canonical,
104
+ kingdom: v[:kingdom],
105
+ phylum: v[:phylum],
106
+ klass: v[:class],
107
+ order: v[:order],
108
+ family: v[:family],
109
+ genus: v[:genus],
110
+ species: v[:species],
111
+ authors: v[:author_text],
112
+ code: v[:nomenclatural_code] }
113
+ @names << res
117
114
  update_vernacular(taxon_id, canonical)
118
115
  update_synonym(taxon_id, canonical)
119
- puts "Processed %s names" % i if i % 10_000 == 0
116
+ puts "Processed #{i} names" if (i % 100_000).zero?
120
117
  end
121
118
  end
122
119