gn_crossmap 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +13 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +137 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/exe/crossmap +23 -0
- data/gn_crossmap.gemspec +39 -0
- data/lib/gn_crossmap.rb +34 -0
- data/lib/gn_crossmap/collector.rb +44 -0
- data/lib/gn_crossmap/column_collector.rb +74 -0
- data/lib/gn_crossmap/reader.rb +30 -0
- data/lib/gn_crossmap/resolver.rb +60 -0
- data/lib/gn_crossmap/result_processor.rb +58 -0
- data/lib/gn_crossmap/sci_name_collector.rb +61 -0
- data/lib/gn_crossmap/version.rb +8 -0
- data/lib/gn_crossmap/writer.rb +22 -0
- metadata +182 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ac2bef76184049dac604ad4be78998d50fc8cc54
|
4
|
+
data.tar.gz: 4ebf72afdfe65e37148d19dd2234d771cb3db951
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 06b2f2a33ad5d73344e73afa6b76790ab195b5529b98984949a40a3d1e64eb79f5449d1045a0a0c1d60af6b65a5076f63deb3339d1e1b6edbf734790fc5b4cff
|
7
|
+
data.tar.gz: f26df79b31645a2228be814aa47e9efd4f4976d9753df3ca4a0ebeafc1b5b7f1d91cd55d8e61555a655f1c39d0051d36e34554a983f07fec507aa4e4aaf5ed75
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
gn_crossmap CHANGELOG
|
2
|
+
=====================
|
3
|
+
|
4
|
+
0.1.1
|
5
|
+
-----
|
6
|
+
- [Dmitry Mozzherin][dimus] - first official release -- works for full names
|
7
|
+
and names entered in rank fields
|
8
|
+
|
9
|
+
0.1.0
|
10
|
+
-----
|
11
|
+
- [Dmitry Mozzherin][dimus] - initial version
|
12
|
+
|
13
|
+
[dimus]: https://github.com/dimus
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2015 Marine Biological Laboratory
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
# GnCrossmap
|
2
|
+
[![Gem Version][gem_badge]][gem_link]
|
3
|
+
[![Continuous Integration Status][ci_badge]][ci_link]
|
4
|
+
[![Coverage Status][cov_badge]][cov_link]
|
5
|
+
[![CodeClimate][code_badge]][code_link]
|
6
|
+
[![Dependency Status][dep_badge]][dep_link]
|
7
|
+
|
8
|
+
This gem crossmaps a checklist of scientific names to names from a data source
|
9
|
+
in [GN Resolver][resolver].
|
10
|
+
|
11
|
+
Checklist has to be in a CSV format.
|
12
|
+
|
13
|
+
Compatibility
|
14
|
+
-------------
|
15
|
+
|
16
|
+
This gem is compatible with Ruby versions 2.1.0 and higher.
|
17
|
+
|
18
|
+
Installation
|
19
|
+
------------
|
20
|
+
|
21
|
+
Add this line to your application's Gemfile:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
gem 'gn_crossmap'
|
25
|
+
```
|
26
|
+
|
27
|
+
And then execute:
|
28
|
+
|
29
|
+
$ bundle
|
30
|
+
|
31
|
+
Or install it yourself as:
|
32
|
+
|
33
|
+
$ gem install gn_crossmap
|
34
|
+
|
35
|
+
Usage
|
36
|
+
-----
|
37
|
+
|
38
|
+
### Input file format
|
39
|
+
|
40
|
+
- Comma Separated File with names of fields in first row.
|
41
|
+
- Columns can be separated by tab, comma or semicolon
|
42
|
+
- At least some columns should have recognizable fields
|
43
|
+
|
44
|
+
taxonID kingdom phylum class order family genus species
|
45
|
+
subspecies variety form scientificNameAuthorship scientificName
|
46
|
+
taxonRank
|
47
|
+
|
48
|
+
#### Simple Example
|
49
|
+
|
50
|
+
taxonID;scientificName
|
51
|
+
1;Macrobiotus echinogenitus subsp. areolatus Murray, 1907
|
52
|
+
...
|
53
|
+
|
54
|
+
#### Rank Example
|
55
|
+
|
56
|
+
taxonID;scientificName;taxonRank
|
57
|
+
1;Macrobiotus echinogenitus f. areolatus Murray, 1907;form
|
58
|
+
...
|
59
|
+
|
60
|
+
#### Family and Authorship Example
|
61
|
+
|
62
|
+
taxonID;family;scientificName;scientificNameAuthorship
|
63
|
+
1;Macrobiotidae;Macrobiotus echinogenitus subsp. areolatus;Murray, 1907
|
64
|
+
...
|
65
|
+
|
66
|
+
#### Fine-grained Example
|
67
|
+
|
68
|
+
TaxonId;kingdom;subkingdom;phylum;subphylum;superclass;class;subclass;cohort;superorder;order;suborder;infraorder;superfamily;family;subfamily;tribe;subtribe;genus;subgenus;section;species;subspecies;variety;form;ScientificNameAuthorship
|
69
|
+
1;Animalia;;Tardigrada;;;Eutardigrada;;;;Parachela;;;Macrobiotoidea;Macrobiotidae;;;;Macrobiotus;;;harmsworthi;obscurus;;;Dastych, 1985
|
70
|
+
|
71
|
+
### Usage from command line
|
72
|
+
|
73
|
+
# to see help
|
74
|
+
$ crossmap --help
|
75
|
+
|
76
|
+
# to compare with default source (Catalogue of Life)
|
77
|
+
$ crossmap -i my_list.csv -o my_list_col.csv
|
78
|
+
|
79
|
+
# to compare with other source (Index Fungorum in this example)
|
80
|
+
$ crossmap -i my_list.csv -o my_list_if.csv -d 5
|
81
|
+
|
82
|
+
### Usage as Ruby Library
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
require "gn_crossmap"
|
86
|
+
|
87
|
+
# To change the logger (by default messages are logged to standard output)
|
88
|
+
GnCrossmap.logger = MyCustomLogger.new
|
89
|
+
|
90
|
+
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5)
|
91
|
+
```
|
92
|
+
|
93
|
+
Development
|
94
|
+
-----------
|
95
|
+
|
96
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
97
|
+
`bin/console` for an interactive prompt that will allow you to experiment.
|
98
|
+
|
99
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To
|
100
|
+
release a new version, update the version number in `version.rb`, and then run
|
101
|
+
`bundle exec rake release` to create a git tag for the version, push git
|
102
|
+
commits and tags, and push the `.gem` file to
|
103
|
+
[rubygems.org][rubygems]
|
104
|
+
|
105
|
+
Contributing
|
106
|
+
------------
|
107
|
+
|
108
|
+
1. Fork it ( https://github.com/[my-github-username]/gn_crossmap/fork )
|
109
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
110
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
111
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
112
|
+
5. Create a new Pull Request
|
113
|
+
|
114
|
+
Copyright
|
115
|
+
---------
|
116
|
+
|
117
|
+
Author -- [Dmitry Mozzherin][dimus]
|
118
|
+
|
119
|
+
Copyright (c) 2015 [Marine Biological Laboratory][mbl].
|
120
|
+
See [LICENSE][license] for details.
|
121
|
+
|
122
|
+
[gem_badge]: https://badge.fury.io/rb/gn_crossmap.png
|
123
|
+
[gem_link]: http://badge.fury.io/rb/gn_crossmap
|
124
|
+
[ci_badge]: https://secure.travis-ci.org/GlobalNamesArchitecture/gn_crossmap.png
|
125
|
+
[ci_link]: http://travis-ci.org/GlobalNamesArchitecture/gn_crossmap
|
126
|
+
[cov_badge]: https://coveralls.io/repos/GlobalNamesArchitecture/gn_crossmap/badge.png?branch=master
|
127
|
+
[cov_link]: https://coveralls.io/r/GlobalNamesArchitecture/gn_crossmap?branch=master
|
128
|
+
[code_badge]: https://codeclimate.com/github/GlobalNamesArchitecture/gn_crossmap.png
|
129
|
+
[code_link]: https://codeclimate.com/github/GlobalNamesArchitecture/gn_crossmap
|
130
|
+
[dep_badge]: https://gemnasium.com/GlobalNamesArchitecture/gn_crossmap.png
|
131
|
+
[dep_link]: https://gemnasium.com/GlobalNamesArchitecture/gn_crossmap
|
132
|
+
[resolver]: http://resolver.globalnames.org
|
133
|
+
[rubygems]: https://rubygems.org
|
134
|
+
[dimus]: https://github.com/dimus
|
135
|
+
[mbl]: http://mbl.edu
|
136
|
+
[license]: https://github.com/GlobalNamesArchitecture/gn_crossmap/blob/master/LICENSE
|
137
|
+
[terms]: http://rs.tdwg.org/dwc/terms
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "gn_crossmap"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/exe/crossmap
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command line entry point: parses options with Trollop, validates the input
# path and hands everything over to GnCrossmap.run.
require "trollop"
require "gn_crossmap"

# Versions are compared with Gem::Version because comparing the raw strings
# orders them lexicographically (for example "10.0.0" < "2.1.0" is true).
if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.1.0")
  puts "This program requires Ruby >= v. 2.1.0"
end

# Data source id used when --data-source-id is not given
CATALOGUE_OF_LIFE = 1
# Output path used when --output is not given
OUTPUT = "output.csv"
opts = Trollop.options do
  banner "Compares a list of scientific names to scientific names from a " \
         "data source from Global Names Resolver\n\n " \
         "Usage:\n crossmap [options]\n\noptions:"

  opt(:input, "Path to input file", type: :string)
  opt(:output, "Path to output file", default: OUTPUT)
  opt(:data_source_id, "Data source id from GN Resolver",
      default: CATALOGUE_OF_LIFE)
end

# Fail early, with the usage message, when the input is missing or unreadable
Trollop.die :input, "must be set" if opts[:input].nil?
Trollop.die :input, "file must exist" unless File.exist?(opts[:input])

GnCrossmap.run(opts[:input], opts[:output], opts[:data_source_id])
|
data/gn_crossmap.gemspec
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for gn_crossmap.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "gn_crossmap/version"

Gem::Specification.new do |gem|
  gem.name          = "gn_crossmap"
  gem.version       = GnCrossmap::VERSION
  gem.authors       = ["Dmitry Mozzherin"]
  gem.email         = ["dmozzherin@gmail.com"]
  # The gem ships an MIT license text in LICENSE; declare it in the metadata
  gem.license       = "MIT"

  gem.summary       = "Crossmaps a list of scientific names to names from " \
                      "a data source in GN Index"
  gem.description   = "User supplies a comma-separated file which " \
                      "contains in one row a hierarchy path of known ranks, " \
                      "scientific name which can be split into its semantic " \
                      "elements and include authorship and taxon concept " \
                      "reference. User also supplies an id of a data source " \
                      "from global names resolver/index. User gets back a " \
                      "new comma-separated file where scientific names from " \
                      "her list match data from the given data source."
  gem.homepage      = "https://github.com/GlobalNamesArchitecture/gn_crossmap"

  # Package every file tracked by git except tests
  gem.files         = `git ls-files -z`.split("\x0").
                      reject { |f| f.match(%r{^(test|spec|features)/}) }
  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
  gem.require_paths = ["lib"]

  gem.add_dependency "trollop", "~> 2.1"
  gem.add_dependency "biodiversity", "~> 3.1"
  # lib/gn_crossmap.rb requires "rest_client", so it has to be declared as a
  # runtime dependency for a clean install to work.
  # NOTE(review): confirm the version constraint against the tested version.
  gem.add_dependency "rest-client", "~> 1.8"

  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rspec", "~> 3.2"
  gem.add_development_dependency "rubocop", "~> 0.31"
  gem.add_development_dependency "coveralls", "~> 0.8"
  gem.add_development_dependency "byebug"
end
|
data/lib/gn_crossmap.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "rest_client"
|
3
|
+
require "logger"
|
4
|
+
require "biodiversity"
|
5
|
+
require "gn_crossmap/version"
|
6
|
+
require "gn_crossmap/reader"
|
7
|
+
require "gn_crossmap/writer"
|
8
|
+
require "gn_crossmap/collector"
|
9
|
+
require "gn_crossmap/column_collector"
|
10
|
+
require "gn_crossmap/sci_name_collector"
|
11
|
+
require "gn_crossmap/resolver"
|
12
|
+
require "gn_crossmap/result_processor"
|
13
|
+
|
14
|
+
# Namespace module for crossmapping checklists with GN sources
|
15
|
+
module GnCrossmap
|
16
|
+
class << self
|
17
|
+
attr_writer :logger
|
18
|
+
|
19
|
+
def run(input, output, data_source_id)
|
20
|
+
data = Reader.new(input).read
|
21
|
+
writer = Writer.new(output)
|
22
|
+
Resolver.new(writer, data_source_id).resolve(data)
|
23
|
+
output
|
24
|
+
end
|
25
|
+
|
26
|
+
def logger
|
27
|
+
@logger ||= Logger.new($stdout)
|
28
|
+
end
|
29
|
+
|
30
|
+
def log(message)
|
31
|
+
logger.info(message)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module GnCrossmap
  # Assembles data from CSV rows. The first row is treated as the header and
  # decides which concrete collector handles every following row.
  class Collector
    RANKS = %i(kingdom subkingdom phylum subphylum superclass class
               subclass cohort superorder order suborder infraorder superfamily
               family subfamily tribe subtribe genus subgenus section species
               subspecies variety form)
    SPECIES_RANKS = %i(genus species subspecies variety form)

    # Array of { id:, name:, rank: } hashes collected so far
    attr_reader :data

    def initialize
      @data = []
      @fields = nil
      @collector = nil
    end

    # Consumes one parsed CSV row (an array of cells). The first call
    # registers the header, later calls collect data.
    def process_row(row)
      @row = row
      @fields ? collect_data : init_fields_collector
    end

    private

    # Normalizes header names to lowercase symbols. `to_s` keeps an empty
    # header cell (nil) from raising, `strip` lets padded headers match.
    def init_fields_collector
      @fields = @row.map { |f| f.to_s.strip.downcase.to_sym }
      @collector = collector_factory
    end

    # Converts the row to a { field => value } hash and stores whatever the
    # collector returns, unless it returns nil.
    def collect_data
      @row = @fields.zip(@row).to_h
      data = @collector.id_name_rank(@row)
      @data << data if data
    end

    # A scientificName column takes precedence over separate rank columns
    def collector_factory
      if @fields.include?(:scientificname)
        SciNameCollector.new(@fields)
      else
        ColumnCollector.new(@fields)
      end
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module GnCrossmap
  # Assembles id, name and rank from rows where the name is spread over
  # separate rank columns (genus, species, subspecies, ...)
  class ColumnCollector
    RANKS = %i(kingdom subkingdom phylum subphylum superclass class
               subclass cohort superorder order suborder infraorder superfamily
               family subfamily tribe subtribe genus subgenus section species
               subspecies variety form)
    SPECIES_RANKS = %i(genus species subspecies variety form)

    attr_reader :data

    def initialize(fields)
      @fields = fields
    end

    # Returns { id:, name:, rank: } for a row given as a { field => value }
    # hash, or nil when the row has no taxonid or no filled rank column.
    def id_name_rank(row)
      @row = row
      id = @row[:taxonid]
      return nil if blank?(id)
      rank = find_rank
      return nil unless rank
      name = assemble_name(rank)
      return nil unless name
      { id: id, name: name, rank: rank.to_s }
    end

    private

    # True for nil, empty and whitespace-only values
    def blank?(value)
      value.to_s.strip == ""
    end

    # The last rank in RANKS that has a non-blank value in the row
    def find_rank
      RANKS.reverse_each.find { |rank| !blank?(@row[rank]) }
    end

    # Ranks above species use the column value as is; species and
    # infraspecies are assembled from several columns.
    def assemble_name(name_rank)
      name = @row[name_rank]
      if SPECIES_RANKS[1..-1].include?(name_rank)
        name = assemble_species_name(name, name_rank)
      end
      name
    end

    # Joins genus/species/... columns, the name itself and the authorship
    # into one string with single spaces between the parts.
    def assemble_species_name(name, name_rank)
      ending = [add_infrarank(name, name_rank), @row[:scientificnameauthorship]]
      ranks = SPECIES_RANKS[0...SPECIES_RANKS.index(name_rank)]
      starting = name_start(ranks)
      (starting + ending).flatten.join(" ").strip.gsub(/\s+/, " ")
    end

    # Collects the name parts that precede the final rank. Blank columns
    # are skipped with the same rule as find_rank, otherwise a blank
    # subspecies would leave a dangling "subsp." marker in the name.
    def name_start(ranks)
      ranks.each_with_object([]) do |rank, ary|
        next if blank?(@row[rank])
        ary << add_infrarank(@row[rank], rank)
      end
    end

    # Prefixes infraspecific epithets with their rank marker
    def add_infrarank(name, rank)
      case rank
      when :subspecies
        "subsp. #{name}"
      when :variety
        "var. #{name}"
      when :form
        "f. #{name}"
      else
        name
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module GnCrossmap
  # Reads the supplied csv file and creates a ruby structure to compare
  # with a Global Names Resolver source
  class Reader
    # csv_path is the path to the input file. The column separator is
    # detected right away, so the file has to be readable at this point.
    def initialize(csv_path)
      @csv_file = csv_path
      @col_sep = col_sep
    end

    # Logs the file name and returns the data gathered by Collector
    def read
      GnCrossmap.log("Read input file '#{File.basename(@csv_file)}'")
      parse_input
    end

    private

    # Picks the candidate separator that occurs most often in the first
    # line; ties go to the separator that sorts last (";").
    # File.open is used instead of Kernel#open so that a path starting with
    # "|" is never run as a command, and `gets` returns nil for an empty
    # file instead of raising EOFError as `readline` does.
    def col_sep
      line = File.open(@csv_file, &:gets).to_s
      [";", ",", "\t"].map { |s| [line.count(s), s] }.max.last
    end

    # CSV.foreach closes the file when iteration ends; CSV.open without a
    # block would leave the handle open.
    def parse_input
      dc = Collector.new
      CSV.foreach(@csv_file, col_sep: @col_sep) { |row| dc.process_row(row) }
      dc.data
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module GnCrossmap
  # Sends data to GN Resolver and collects results
  class Resolver
    URL = "http://resolver.globalnames.org/name_resolvers.json"

    # writer receives the output rows, data_source_id is passed to the
    # resolver as data_source_ids.
    def initialize(writer, data_source_id)
      @processor = GnCrossmap::ResultProcessor.new(writer)
      @ds_id = data_source_id
      @count = 0
      @batch = 200
    end

    # Resolves data (an array of { id:, name:, rank: } hashes) in batches.
    # The writer is closed in `ensure` so that it is closed even when a
    # batch raises an unexpected error.
    def resolve(data)
      data_size = data.size
      data.each_slice(@batch) do |slice|
        with_log(data_size) do
          names = collect_names(slice)
          remote_resolve(names)
        end
      end
      nil
    ensure
      @processor.writer.close
    end

    private

    # Logs the range of records handled by the current batch, then yields
    def with_log(size)
      s = @count + 1
      @count += @batch
      e = [@count, size].min
      GnCrossmap.log("Resolve #{s}-#{e} out of #{size} records")
      yield
    end

    # Remembers the rank of every row for the processor and builds the
    # "id|name" lines sent to the resolver.
    def collect_names(slice)
      slice.each_with_object("") do |row, str|
        @processor.input[row[:id]] = { rank: row[:rank] }
        str << "#{row[:id]}|#{row[:name]}\n"
      end
    end

    # Resolves a whole batch; when the request fails the names are retried
    # one by one so that a single bad name does not lose the batch.
    def remote_resolve(names)
      res = RestClient.post(URL, data: names, data_source_ids: @ds_id)
      @processor.process(res)
    rescue RestClient::Exception
      single_remote_resolve(names)
    end

    # Fallback: one request per name, failures are logged and skipped
    def single_remote_resolve(names)
      names.split("\n").each do |name|
        begin
          res = RestClient.post(URL, data: name, data_source_ids: @ds_id)
          @processor.process(res)
        rescue RestClient::Exception => e
          GnCrossmap.log("Resolver broke on '#{name}': #{e}")
          next
        end
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module GnCrossmap
  # Processes data received from the GN Resolver
  class ResultProcessor
    # Labels written to the matchType column, keyed by the resolver's
    # match_type value
    MATCH_TYPES = {
      1 => "Exact match",
      2 => "Canonical form exact match",
      3 => "Canonical form fuzzy match",
      4 => "Partial canonical form match",
      5 => "Partial canonical form fuzzy match",
      6 => "Genus part match"
    }

    # input maps a supplied id to { rank: ... }; it is filled by the caller
    attr_reader :input, :writer

    def initialize(writer)
      @writer = writer
      @input = {}
    end

    # Parses the resolver's JSON response and writes one row per match, or
    # one mostly empty row for a name without results.
    def process(result)
      res = rubyfy(result)
      res[:data].each do |d|
        d[:results].nil? ? write_empty_result(d) : write_result(d)
      end
    end

    private

    def rubyfy(result)
      JSON.parse(result, symbolize_names: true)
    end

    def write_empty_result(datum)
      res = [datum[:supplied_id], datum[:supplied_name_string], nil, nil,
             input_rank(datum), nil, nil, nil, nil]
      @writer.write(res)
    end

    def write_result(datum)
      datum[:results].each do |r|
        res = [datum[:supplied_id], datum[:supplied_name_string],
               r[:name_string], r[:canonical_form],
               input_rank(datum),
               matched_rank(r), matched_type(r),
               r[:edit_distance], r[:score]]
        @writer.write(res)
      end
    end

    # Rank registered for the supplied id, or nil when the id is unknown
    # (the row is still written instead of raising on nil).
    # NOTE(review): the string fallback assumes the resolver might return a
    # numeric id while the keys come from CSV as strings -- confirm.
    def input_rank(datum)
      id = datum[:supplied_id]
      entry = @input[id] || @input[id.to_s] || {}
      entry[:rank]
    end

    # Last element of the "|"-separated rank path; nil when the record has
    # no classification_path_ranks at all.
    def matched_rank(record)
      record[:classification_path_ranks].to_s.split("|").last
    end

    def matched_type(record)
      MATCH_TYPES[record[:match_type]]
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module GnCrossmap
  # Assembles id, name and rank from rows that carry the whole name in the
  # scientificName field
  class SciNameCollector
    def initialize(fields)
      @fields = fields
      @parser = ScientificNameParser.new
    end

    # Returns { id:, name:, rank: } for a row given as a { field => value }
    # hash, or nil when the row has no id or no name.
    def id_name_rank(row)
      @row = row
      id = @row[:taxonid]
      name = find_name
      return nil unless id && name
      { id: id, name: name, rank: find_rank }
    end

    private

    # Name with the authorship appended when a separate, non-empty
    # authorship field exists. Returns nil for a missing or blank name so
    # the row is skipped instead of raising on nil.
    def find_name
      name = @row[:scientificname].to_s.strip
      return nil if name == ""
      authorship = @row[:scientificnameauthorship].to_s.strip
      name = "#{name} #{authorship}" if authorship != ""
      name
    end

    # Rank supplied in the taxonRank column, otherwise a rank inferred from
    # the parsed name. The key is :taxonrank because Collector downcases
    # every header before building the row hash.
    def find_rank
      rank = @row[:taxonrank].to_s.strip
      rank == "" ? parse_rank : rank
    end

    # Infers the rank from the canonical form; hybrids and unparsed names
    # give nil. The parser is replaced after a RuntimeError.
    def parse_rank
      @parsed_name = @parser.parse(@row[:scientificname])[:scientificName]
      return nil if !@parsed_name[:canonical] || @parsed_name[:hybrid]
      words_num = @parsed_name[:canonical].split(" ").size
      infer_rank(words_num)
    rescue RuntimeError
      @parser = ScientificNameParser.new
      nil
    end

    # One word: rank unknown; two words: species; more: the rank of the
    # last infraspecific epithet, or nil when the parser reports none.
    def infer_rank(words_in_canonical_form)
      case words_in_canonical_form
      when 1
        nil
      when 2
        "species"
      else
        details = @parsed_name[:details].to_a.first || {}
        infraspecies = details[:infraspecies].to_a.last
        infraspecies ? normalize_rank(infraspecies[:rank]) : nil
      end
    end

    # Maps rank markers ("f.", "var.", "subsp.", "ssp.") to full rank names;
    # anything else is returned unchanged.
    def normalize_rank(rank)
      case rank
      when /^f/
        "form"
      when /^var/
        "variety"
      when /^(sub|ssp)/
        "subspecies"
      else
        rank
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module GnCrossmap
|
2
|
+
# Saves output from GN Resolver to disk
|
3
|
+
class Writer
|
4
|
+
def initialize(output_path)
|
5
|
+
@path = output_path
|
6
|
+
@output = CSV.open(@path, "w:utf-8")
|
7
|
+
@output << [:taxonID, :scientificName, :matchedScientificName,
|
8
|
+
:matchedCanonicalForm, :rank, :matchedRank, :matchType,
|
9
|
+
:editDistance, :score]
|
10
|
+
GnCrossmap.log("Open output file '#{@path}'")
|
11
|
+
end
|
12
|
+
|
13
|
+
def write(record)
|
14
|
+
@output << record
|
15
|
+
end
|
16
|
+
|
17
|
+
def close
|
18
|
+
GnCrossmap.log("Close output file '#{@path}'")
|
19
|
+
@output.close
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
metadata
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gn_crossmap
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dmitry Mozzherin
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-05-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: trollop
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: biodiversity
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.7'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.7'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.2'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.2'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.31'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.31'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: coveralls
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.8'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.8'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: byebug
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description: User supplies a comma-separated file which breaks contains in one row
|
126
|
+
a hierarchy path of known ranks, scientific name which can be split into its semantic
|
127
|
+
elements and include authorship and taxon concept reference. User also supplies
|
128
|
+
an id of a data source from global names resolver/index. User gets back a new comma-separated
|
129
|
+
file where scientific names from her list match data from the given data source.
|
130
|
+
email:
|
131
|
+
- dmozzherin@gmail.com
|
132
|
+
executables:
|
133
|
+
- crossmap
|
134
|
+
extensions: []
|
135
|
+
extra_rdoc_files: []
|
136
|
+
files:
|
137
|
+
- ".gitignore"
|
138
|
+
- ".rspec"
|
139
|
+
- ".rubocop.yml"
|
140
|
+
- ".travis.yml"
|
141
|
+
- CHANGELOG.md
|
142
|
+
- Gemfile
|
143
|
+
- LICENSE
|
144
|
+
- README.md
|
145
|
+
- Rakefile
|
146
|
+
- bin/console
|
147
|
+
- bin/setup
|
148
|
+
- exe/crossmap
|
149
|
+
- gn_crossmap.gemspec
|
150
|
+
- lib/gn_crossmap.rb
|
151
|
+
- lib/gn_crossmap/collector.rb
|
152
|
+
- lib/gn_crossmap/column_collector.rb
|
153
|
+
- lib/gn_crossmap/reader.rb
|
154
|
+
- lib/gn_crossmap/resolver.rb
|
155
|
+
- lib/gn_crossmap/result_processor.rb
|
156
|
+
- lib/gn_crossmap/sci_name_collector.rb
|
157
|
+
- lib/gn_crossmap/version.rb
|
158
|
+
- lib/gn_crossmap/writer.rb
|
159
|
+
homepage: https://github.com/GlobalNamesArchitecture/gn_crossmap
|
160
|
+
licenses: []
|
161
|
+
metadata: {}
|
162
|
+
post_install_message:
|
163
|
+
rdoc_options: []
|
164
|
+
require_paths:
|
165
|
+
- lib
|
166
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - ">="
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: '0'
|
171
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
172
|
+
requirements:
|
173
|
+
- - ">="
|
174
|
+
- !ruby/object:Gem::Version
|
175
|
+
version: '0'
|
176
|
+
requirements: []
|
177
|
+
rubyforge_project:
|
178
|
+
rubygems_version: 2.2.3
|
179
|
+
signing_key:
|
180
|
+
specification_version: 4
|
181
|
+
summary: Crossmaps a list of scientific names to names from a data source in GN Index
|
182
|
+
test_files: []
|