gn_crossmap 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +13 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +137 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/exe/crossmap +23 -0
- data/gn_crossmap.gemspec +39 -0
- data/lib/gn_crossmap.rb +34 -0
- data/lib/gn_crossmap/collector.rb +44 -0
- data/lib/gn_crossmap/column_collector.rb +74 -0
- data/lib/gn_crossmap/reader.rb +30 -0
- data/lib/gn_crossmap/resolver.rb +60 -0
- data/lib/gn_crossmap/result_processor.rb +58 -0
- data/lib/gn_crossmap/sci_name_collector.rb +61 -0
- data/lib/gn_crossmap/version.rb +8 -0
- data/lib/gn_crossmap/writer.rb +22 -0
- metadata +182 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ac2bef76184049dac604ad4be78998d50fc8cc54
|
4
|
+
data.tar.gz: 4ebf72afdfe65e37148d19dd2234d771cb3db951
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 06b2f2a33ad5d73344e73afa6b76790ab195b5529b98984949a40a3d1e64eb79f5449d1045a0a0c1d60af6b65a5076f63deb3339d1e1b6edbf734790fc5b4cff
|
7
|
+
data.tar.gz: f26df79b31645a2228be814aa47e9efd4f4976d9753df3ca4a0ebeafc1b5b7f1d91cd55d8e61555a655f1c39d0051d36e34554a983f07fec507aa4e4aaf5ed75
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
gn_crossmap CHANGELOG
|
2
|
+
=====================
|
3
|
+
|
4
|
+
0.1.1
|
5
|
+
-----
|
6
|
+
- [Dmitry Mozzherin][dimus] - first official release -- works for full names
|
7
|
+
and names entered in rank fields
|
8
|
+
|
9
|
+
0.1.0
|
10
|
+
-----
|
11
|
+
- [Dmitry Mozzherin][dimus] - initial version
|
12
|
+
|
13
|
+
[dimus]: https://github.com/dimus
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2015 Marine Biological Laboratory
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
# GnCrossmap
|
2
|
+
[![Gem Version][gem_badge]][gem_link]
|
3
|
+
[![Continuous Integration Status][ci_badge]][ci_link]
|
4
|
+
[![Coverage Status][cov_badge]][cov_link]
|
5
|
+
[![CodeClimate][code_badge]][code_link]
|
6
|
+
[![Dependency Status][dep_badge]][dep_link]
|
7
|
+
|
8
|
+
This gem crossmaps a checklist of scientific names to names from a data source
|
9
|
+
in [GN Resolver][resolver].
|
10
|
+
|
11
|
+
Checklist has to be in a CSV format.
|
12
|
+
|
13
|
+
Compatibility
|
14
|
+
-------------
|
15
|
+
|
16
|
+
This gem is compatible with Ruby version 2.1.0 and higher.
|
17
|
+
|
18
|
+
Installation
|
19
|
+
------------
|
20
|
+
|
21
|
+
Add this line to your application's Gemfile:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
gem 'gn_crossmap'
|
25
|
+
```
|
26
|
+
|
27
|
+
And then execute:
|
28
|
+
|
29
|
+
$ bundle
|
30
|
+
|
31
|
+
Or install it yourself as:
|
32
|
+
|
33
|
+
$ gem install gn_crossmap
|
34
|
+
|
35
|
+
Usage
|
36
|
+
-----
|
37
|
+
|
38
|
+
### Input file format
|
39
|
+
|
40
|
+
- Comma Separated File with names of fields in first row.
|
41
|
+
- Columns can be separated by tab, comma or semicolon
|
42
|
+
- At least some columns should have recognizable fields
|
43
|
+
|
44
|
+
taxonID kingdom phylum class order family genus species
|
45
|
+
subspecies variety form scientificNameAuthorship scientificName
|
46
|
+
taxonRank
|
47
|
+
|
48
|
+
#### Simple Example
|
49
|
+
|
50
|
+
taxonID;scientificName
|
51
|
+
1;Macrobiotus echinogenitus subsp. areolatus Murray, 1907
|
52
|
+
...
|
53
|
+
|
54
|
+
#### Rank Example
|
55
|
+
|
56
|
+
taxonID;scientificName;taxonRank
|
57
|
+
1;Macrobiotus echinogenitus f. areolatus Murray, 1907;form
|
58
|
+
...
|
59
|
+
|
60
|
+
#### Family and Authorship Example
|
61
|
+
|
62
|
+
taxonID;family;scientificName;scientificNameAuthorship
|
63
|
+
1;Macrobiotidae;Macrobiotus echinogenitus subsp. areolatus;Murray, 1907
|
64
|
+
...
|
65
|
+
|
66
|
+
#### Fine-grained Example
|
67
|
+
|
68
|
+
TaxonId;kingdom;subkingdom;phylum;subphylum;superclass;class;subclass;cohort;superorder;order;suborder;infraorder;superfamily;family;subfamily;tribe;subtribe;genus;subgenus;section;species;subspecies;variety;form;ScientificNameAuthorship
|
69
|
+
1;Animalia;;Tardigrada;;;Eutardigrada;;;;Parachela;;;Macrobiotoidea;Macrobiotidae;;;;Macrobiotus;;;harmsworthi;obscurus;;;Dastych, 1985
|
70
|
+
|
71
|
+
### Usage from command line
|
72
|
+
|
73
|
+
# to see help
|
74
|
+
$ crossmap --help
|
75
|
+
|
76
|
+
# to compare with default source (Catalogue of Life)
|
77
|
+
$ crossmap -i my_list.csv -o my_list_col.csv
|
78
|
+
|
79
|
+
# to compare with other source (Index Fungorum in this example)
|
80
|
+
$ crossmap -i my_list.csv -o my_list_if.csv -d 5
|
81
|
+
|
82
|
+
### Usage as Ruby Library
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
require "gn_crossmap"
|
86
|
+
|
87
|
+
# If you want to change logger -- default Logging is to standard output
|
88
|
+
GnCrossmap.logger = MyCustomLogger.new
|
89
|
+
|
90
|
+
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5)
|
91
|
+
```
|
92
|
+
|
93
|
+
Development
|
94
|
+
-----------
|
95
|
+
|
96
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
97
|
+
`bin/console` for an interactive prompt that will allow you to experiment.
|
98
|
+
|
99
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To
|
100
|
+
release a new version, update the version number in `version.rb`, and then run
|
101
|
+
`bundle exec rake release` to create a git tag for the version, push git
|
102
|
+
commits and tags, and push the `.gem` file to
|
103
|
+
[rubygems.org][rubygems]
|
104
|
+
|
105
|
+
Contributing
|
106
|
+
------------
|
107
|
+
|
108
|
+
1. Fork it ( https://github.com/[my-github-username]/gn_crossmap/fork )
|
109
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
110
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
111
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
112
|
+
5. Create a new Pull Request
|
113
|
+
|
114
|
+
Copyright
|
115
|
+
---------
|
116
|
+
|
117
|
+
Author -- [Dmitry Mozzherin][dimus]
|
118
|
+
|
119
|
+
Copyright (c) 2015 [Marine Biological Laboratory][mbl].
|
120
|
+
See [LICENSE][license] for details.
|
121
|
+
|
122
|
+
[gem_badge]: https://badge.fury.io/rb/gn_crossmap.png
|
123
|
+
[gem_link]: http://badge.fury.io/rb/gn_crossmap
|
124
|
+
[ci_badge]: https://secure.travis-ci.org/GlobalNamesArchitecture/gn_crossmap.png
|
125
|
+
[ci_link]: http://travis-ci.org/GlobalNamesArchitecture/gn_crossmap
|
126
|
+
[cov_badge]: https://coveralls.io/repos/GlobalNamesArchitecture/gn_crossmap/badge.png?branch=master
|
127
|
+
[cov_link]: https://coveralls.io/r/GlobalNamesArchitecture/gn_crossmap?branch=master
|
128
|
+
[code_badge]: https://codeclimate.com/github/GlobalNamesArchitecture/gn_crossmap.png
|
129
|
+
[code_link]: https://codeclimate.com/github/GlobalNamesArchitecture/gn_crossmap
|
130
|
+
[dep_badge]: https://gemnasium.com/GlobalNamesArchitecture/gn_crossmap.png
|
131
|
+
[dep_link]: https://gemnasium.com/GlobalNamesArchitecture/gn_crossmap
|
132
|
+
[resolver]: http://resolver.globalnames.org
|
133
|
+
[rubygems]: https://rubygems.org
|
134
|
+
[dimus]: https://github.com/dimus
|
135
|
+
[mbl]: http://mbl.edu
|
136
|
+
[license]: https://github.com/GlobalNamesArchitecture/gn_crossmap/blob/master/LICENSE
|
137
|
+
[terms]: http://rs.tdwg.org/dwc/terms
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "gn_crossmap"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/exe/crossmap
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: crossmaps a CSV checklist of scientific names
# against a GN Resolver data source and writes the matches to a CSV file.

# Check the interpreter before loading the library. Versions are compared as
# Gem::Version objects because plain string comparison is lexicographic
# (as strings "10.0.0" sorts before "2.1.0"). The program stops on an
# unsupported Ruby instead of printing a notice and carrying on.
require "rubygems" unless defined?(Gem)
if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.1.0")
  abort "This program requires Ruby >= v. 2.1.0"
end

require "trollop"
require "gn_crossmap"

# Data source id used when -d is not given
CATALOGUE_OF_LIFE = 1
# Output path used when -o is not given
OUTPUT = "output.csv"

opts = Trollop.options do
  banner "Compares a list of scientific names to scientific names from a " \
         "data source from Global Names Resolver\n\n " \
         "Usage:\n crossmap [options]\n\noptions:"

  opt(:input, "Path to input file", type: :string)
  opt(:output, "Path to output file", default: OUTPUT)
  opt(:data_source_id, "Data source id from GN Resolver",
      default: CATALOGUE_OF_LIFE)
end

# The input option has no default, so it must be supplied and must exist
Trollop.die :input, "must be set" if opts[:input].nil?
Trollop.die :input, "file must exist" unless File.exist?(opts[:input])

GnCrossmap.run(opts[:input], opts[:output], opts[:data_source_id])
|
data/gn_crossmap.gemspec
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for gn_crossmap.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "gn_crossmap/version"

Gem::Specification.new do |gem|
  gem.name = "gn_crossmap"
  gem.version = GnCrossmap::VERSION
  gem.authors = ["Dmitry Mozzherin"]
  gem.email = ["dmozzherin@gmail.com"]

  gem.summary = "Crossmaps a list of scientific names to names from " \
                "a data source in GN Index"
  gem.description = "User supplies a comma-separated file which " \
                    "contains in one row a hierarchy path of known ranks, " \
                    "scientific name which can be split into its semantic " \
                    "elements and include authorship and taxon concept " \
                    "reference. User also supplies an id of a data source " \
                    "from global names resolver/index. User gets back a " \
                    "new comma-separated file where scientific names from " \
                    "her list match data from the given data source."
  gem.homepage = "https://github.com/GlobalNamesArchitecture/gn_crossmap"
  # The bundled LICENSE file carries the MIT license text
  gem.license = "MIT"
  # README states compatibility with Ruby >= 2.1.0
  gem.required_ruby_version = ">= 2.1.0"

  # Package everything tracked by git except test-only directories
  gem.files = `git ls-files -z`.split("\x0").
              reject { |f| f.match(%r{^(test|spec|features)/}) }
  gem.bindir = "exe"
  gem.executables = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
  gem.require_paths = ["lib"]

  gem.add_dependency "trollop", "~> 2.1"
  gem.add_dependency "biodiversity", "~> 3.1"
  # lib/gn_crossmap.rb requires "rest_client", so the gem has to be declared.
  # NOTE(review): version constraint chosen for the 2015 release line --
  # confirm against the version the library is actually tested with.
  gem.add_dependency "rest-client", "~> 1.8"

  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rspec", "~> 3.2"
  gem.add_development_dependency "rubocop", "~> 0.31"
  gem.add_development_dependency "coveralls", "~> 0.8"
  gem.add_development_dependency "byebug"
end
|
data/lib/gn_crossmap.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "rest_client"
|
3
|
+
require "logger"
|
4
|
+
require "biodiversity"
|
5
|
+
require "gn_crossmap/version"
|
6
|
+
require "gn_crossmap/reader"
|
7
|
+
require "gn_crossmap/writer"
|
8
|
+
require "gn_crossmap/collector"
|
9
|
+
require "gn_crossmap/column_collector"
|
10
|
+
require "gn_crossmap/sci_name_collector"
|
11
|
+
require "gn_crossmap/resolver"
|
12
|
+
require "gn_crossmap/result_processor"
|
13
|
+
|
14
|
+
# Namespace module for crossmapping checklists with GN sources
|
15
|
+
module GnCrossmap
  # Replaces the logger that .log writes to.
  def self.logger=(custom_logger)
    @logger = custom_logger
  end

  # Returns the current logger; one writing to standard output is created
  # the first time it is needed.
  def self.logger
    @logger = Logger.new($stdout) unless @logger
    @logger
  end

  # Sends a message to the current logger at info level.
  def self.log(message)
    logger.info(message)
  end

  # Reads the input checklist, resolves it against the given data source
  # and writes the results. Returns the output path it was given.
  def self.run(input, output, data_source_id)
    records = Reader.new(input).read
    resolver = Resolver.new(Writer.new(output), data_source_id)
    resolver.resolve(records)
    output
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module GnCrossmap
  # Assembles data from CSV rows. The first row it receives is treated as
  # the header and decides which collector handles the remaining rows.
  class Collector
    RANKS = %i(kingdom subkingdom phylum subphylum superclass class
               subclass cohort superorder order suborder infraorder superfamily
               family subfamily tribe subtribe genus subgenus section species
               subspecies variety form)
    SPECIES_RANKS = %i(genus species subspecies variety form)

    # Records accepted so far, in input order
    attr_reader :data

    def initialize
      @data = []
      @fields = nil
      @collector = nil
    end

    # Consumes one CSV row: the header row configures the collector, every
    # later row is converted to a record and stored when it is usable.
    def process_row(row)
      @row = row
      @fields ? collect_data : init_fields_collector
    end

    private

    # Normalizes header cells to lowercase symbols. Cells are trimmed and
    # nil cells (empty header columns) are tolerated instead of raising
    # NoMethodError on nil.downcase.
    def init_fields_collector
      @fields = @row.map { |f| f.to_s.strip.downcase.to_sym }
      @collector = collector_factory
    end

    # Pairs the header fields with the row values and keeps the record only
    # when the collector returns one.
    def collect_data
      @row = @fields.zip(@row).to_h
      data = @collector.id_name_rank(@row)
      @data << data if data
    end

    # A scientificName column selects name parsing; otherwise the name is
    # assembled from the rank columns.
    def collector_factory
      if @fields.include?(:scientificname)
        SciNameCollector.new(@fields)
      else
        ColumnCollector.new(@fields)
      end
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module GnCrossmap
  # Builds an id/name/rank record from a row that stores each rank in its
  # own column (no scientificName column).
  class ColumnCollector
    RANKS = %i(kingdom subkingdom phylum subphylum superclass class
               subclass cohort superorder order suborder infraorder superfamily
               family subfamily tribe subtribe genus subgenus section species
               subspecies variety form)
    SPECIES_RANKS = %i(genus species subspecies variety form)

    attr_reader :data

    def initialize(fields)
      @fields = fields
    end

    # row is a hash keyed by lowercase field symbols. Returns
    # { id:, name:, rank: } or nil when the row has no id or no filled
    # rank column.
    def id_name_rank(row)
      @row = row
      id = @row[:taxonid]
      return nil if blank?(id)
      rank = find_rank
      return nil unless rank
      name = assemble_name(rank)
      return nil unless name
      { id: id, name: name, rank: rank.to_s }
    end

    private

    def blank?(value)
      value.to_s.strip.empty?
    end

    # The last entry of RANKS that has a value is the rank of the row.
    def find_rank
      RANKS.reverse_each.find { |rank| !blank?(@row[rank]) }
    end

    # Ranks after genus in SPECIES_RANKS need a multi-part name; any other
    # rank uses the column value as is.
    def assemble_name(name_rank)
      name = @row[name_rank]
      return name unless SPECIES_RANKS[1..-1].include?(name_rank)
      assemble_species_name(name, name_rank)
    end

    # Joins the preceding SPECIES_RANKS columns, the (possibly prefixed)
    # epithet and the authorship, collapsing repeated whitespace.
    def assemble_species_name(name, name_rank)
      ending = [add_infrarank(name, name_rank), @row[:scientificnameauthorship]]
      ranks = SPECIES_RANKS[0...SPECIES_RANKS.index(name_rank)]
      (name_start(ranks) + ending).join(" ").strip.gsub(/\s+/, " ")
    end

    # Collects the name parts for the given ranks. Blank values are skipped
    # as well as nil ones, so an empty subspecies cell does not leave a
    # dangling "subsp." marker in the assembled name.
    def name_start(ranks)
      ranks.each_with_object([]) do |rank, ary|
        next if blank?(@row[rank])
        ary << add_infrarank(@row[rank], rank)
      end
    end

    # Prefixes infraspecific epithets with their rank marker.
    def add_infrarank(name, rank)
      case rank
      when :subspecies
        "subsp. #{name}"
      when :variety
        "var. #{name}"
      when :form
        "f. #{name}"
      else
        name
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module GnCrossmap
  # Reads supplied csv file and creates ruby structure to compare
  # with a Global Names Resolver source
  class Reader
    # Detects the column separator from the first line of the file, so the
    # file must be readable when the reader is created.
    def initialize(csv_path)
      @csv_file = csv_path
      @col_sep = col_sep
    end

    # Logs the file name and returns the records collected from the file.
    def read
      GnCrossmap.log("Read input file '#{File.basename(@csv_file)}'")
      parse_input
    end

    private

    # Picks whichever of ";", "," and tab occurs most often in the first
    # line. File.open is used rather than Kernel#open because the path comes
    # from the user and Kernel#open would run a path starting with "|" as a
    # command. An empty file yields an empty line instead of EOFError.
    def col_sep
      line = File.open(@csv_file) { |f| f.gets }.to_s
      # Array comparison: highest count wins, ties go to the greater string
      [";", ",", "\t"].map { |s| [line.count(s), s] }.max.last
    end

    # Feeds every row to a Collector. CSV.foreach closes the file when the
    # iteration ends.
    def parse_input
      dc = Collector.new
      CSV.foreach(@csv_file, col_sep: @col_sep) { |row| dc.process_row(row) }
      dc.data
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module GnCrossmap
  # Sends data to GN Resolver and collects results
  class Resolver
    URL = "http://resolver.globalnames.org/name_resolvers.json"

    # writer receives the result rows; data_source_id selects the GN
    # Resolver data source to match against.
    def initialize(writer, data_source_id)
      @processor = GnCrossmap::ResultProcessor.new(writer)
      @ds_id = data_source_id
      @count = 0
      @batch = 200
    end

    # Resolves the records in batches of @batch. The writer is closed in an
    # ensure clause so the output file is flushed and released even when a
    # batch raises.
    def resolve(data)
      data_size = data.size
      data.each_slice(@batch) do |slice|
        with_log(data_size) do
          names = collect_names(slice)
          remote_resolve(names)
        end
      end
      nil
    ensure
      @processor.writer.close
    end

    private

    # Logs the range of records covered by the current batch, then runs it.
    def with_log(size)
      s = @count + 1
      @count += @batch
      e = [@count, size].min
      GnCrossmap.log("Resolve #{s}-#{e} out of #{size} records")
      yield
    end

    # Registers each record's rank with the processor and builds the
    # newline-separated "id|name" payload for the request.
    def collect_names(slice)
      slice.each_with_object("") do |row, str|
        @processor.input[row[:id]] = { rank: row[:rank] }
        str << "#{row[:id]}|#{row[:name]}\n"
      end
    end

    # Posts a whole batch; when the request fails the names are retried one
    # at a time so a single failing name does not lose the batch.
    def remote_resolve(names)
      res = RestClient.post(URL, data: names, data_source_ids: @ds_id)
      @processor.process(res)
    rescue RestClient::Exception
      single_remote_resolve(names)
    end

    # Posts each name separately, logging and skipping the ones that fail.
    def single_remote_resolve(names)
      names.split("\n").each do |name|
        begin
          res = RestClient.post(URL, data: name, data_source_ids: @ds_id)
          @processor.process(res)
        rescue RestClient::Exception => e
          GnCrossmap.log("Resolver broke on '#{name}': #{e}")
          next
        end
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module GnCrossmap
  # Processes data received from the GN Resolver
  class ResultProcessor
    # Labels for the numeric match_type codes found in resolver results
    MATCH_TYPES = {
      1 => "Exact match",
      2 => "Canonical form exact match",
      3 => "Canonical form fuzzy match",
      4 => "Partial canonical form match",
      5 => "Partial canonical form fuzzy match",
      6 => "Genus part match"
    }.freeze

    # input maps a supplied id to { rank: ... } and is filled by the caller
    # before the matching response is processed.
    attr_reader :input, :writer

    def initialize(writer)
      @writer = writer
      @input = {}
    end

    # Parses a JSON response and writes one row per match, or one mostly
    # empty row for a supplied name that has no results.
    def process(result)
      res = rubyfy(result)
      res[:data].each do |d|
        d[:results].nil? ? write_empty_result(d) : write_result(d)
      end
    end

    private

    def rubyfy(result)
      JSON.parse(result, symbolize_names: true)
    end

    # Row for a name without matches: id, supplied name and supplied rank
    # only.
    def write_empty_result(datum)
      res = [datum[:supplied_id], datum[:supplied_name_string], nil, nil,
             @input[datum[:supplied_id]][:rank], nil, nil, nil, nil]
      @writer.write(res)
    end

    # One row per match, in the same column order as the empty row.
    def write_result(datum)
      datum[:results].each do |r|
        res = [datum[:supplied_id], datum[:supplied_name_string],
               r[:name_string], r[:canonical_form],
               @input[datum[:supplied_id]][:rank],
               matched_rank(r), matched_type(r),
               r[:edit_distance], r[:score]]
        @writer.write(res)
      end
    end

    # Last element of the "|"-separated rank path. to_s keeps a record
    # without a classification path from raising; it yields nil instead.
    def matched_rank(record)
      record[:classification_path_ranks].to_s.split("|").last
    end

    def matched_type(record)
      MATCH_TYPES[record[:match_type]]
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module GnCrossmap
  # Builds an id/name/rank record from a row that has a scientificName
  # column, inferring the rank from the parsed name when none is supplied.
  class SciNameCollector
    def initialize(fields)
      @fields = fields
      @parser = ScientificNameParser.new
    end

    # row is a hash keyed by field symbols. Returns { id:, name:, rank: },
    # or nil when the id or the scientific name is missing or blank.
    def id_name_rank(row)
      @row = row
      id = @row[:taxonid]
      name = find_name
      return nil if id.to_s.strip.empty? || name.empty?
      rank = supplied_rank
      rank = parse_rank if rank.empty?
      { id: id, name: name, rank: rank }
    end

    private

    # Collector downcases header names before symbolizing them, so the
    # taxonRank column arrives as :taxonrank. The camel-case key is still
    # accepted for callers that build the row hash themselves.
    def supplied_rank
      (@row[:taxonrank] || @row[:taxonRank]).to_s.strip
    end

    # Scientific name with the authorship appended when one is given.
    # Returns an empty string when the name itself is missing.
    def find_name
      name = @row[:scientificname].to_s.strip
      authorship = @row[:scientificnameauthorship].to_s.strip
      return name if name.empty? || authorship.empty?
      "#{name} #{authorship}"
    end

    # Parses the scientificName value and infers a rank from its canonical
    # form. Returns nil for unparsed names and hybrids. The parser is
    # replaced after a RuntimeError.
    # NOTE(review): presumably the parser can be left unusable after a
    # failure -- confirm against the biodiversity gem.
    def parse_rank
      @parsed_name = @parser.parse(@row[:scientificname])[:scientificName]
      return nil if !@parsed_name[:canonical] || @parsed_name[:hybrid]
      words_num = @parsed_name[:canonical].split(" ").size
      infer_rank(words_num)
    rescue RuntimeError
      @parser = ScientificNameParser.new
      nil
    end

    # One word: no rank; two words: species; more: the rank marker of the
    # last infraspecific epithet, or nil when the parse has none.
    def infer_rank(words_in_canonical_form)
      case words_in_canonical_form
      when 1
        nil
      when 2
        "species"
      else
        details = @parsed_name[:details]
        infraspecies = details && details[0] && details[0][:infraspecies]
        last_epithet = infraspecies && infraspecies[-1]
        last_epithet ? normalize_rank(last_epithet[:rank]) : nil
      end
    end

    # Maps rank markers to the rank names used elsewhere in the gem.
    def normalize_rank(rank)
      case rank
      when /^f/
        "form"
      when /^var/
        "variety"
      when /^sub/
        "subspecies"
      else
        rank
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module GnCrossmap
  # CSV sink for crossmap results: writes the header row on creation and
  # one row per call to #write.
  class Writer
    # Opens output_path for writing as UTF-8, emits the header row and logs
    # that the file was opened.
    def initialize(output_path)
      @path = output_path
      @output = CSV.open(@path, "w:utf-8")
      header = %i(taxonID scientificName matchedScientificName
                  matchedCanonicalForm rank matchedRank matchType
                  editDistance score)
      @output.add_row(header)
      GnCrossmap.log("Open output file '#{@path}'")
    end

    # Appends one record (an array of column values) to the file.
    def write(record)
      @output.add_row(record)
    end

    # Logs the closing of the file, then closes it.
    def close
      GnCrossmap.log("Close output file '#{@path}'")
      @output.close
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gn_crossmap
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dmitry Mozzherin
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-05-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: trollop
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: biodiversity
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.7'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.7'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.2'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.2'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.31'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.31'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: coveralls
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.8'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.8'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: byebug
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description: User supplies a comma-separated file which contains in one row
|
126
|
+
a hierarchy path of known ranks, scientific name which can be split into its semantic
|
127
|
+
elements and include authorship and taxon concept reference. User also supplies
|
128
|
+
an id of a data source from global names resolver/index. User gets back a new comma-separated
|
129
|
+
file where scientific names from her list match data from the given data source.
|
130
|
+
email:
|
131
|
+
- dmozzherin@gmail.com
|
132
|
+
executables:
|
133
|
+
- crossmap
|
134
|
+
extensions: []
|
135
|
+
extra_rdoc_files: []
|
136
|
+
files:
|
137
|
+
- ".gitignore"
|
138
|
+
- ".rspec"
|
139
|
+
- ".rubocop.yml"
|
140
|
+
- ".travis.yml"
|
141
|
+
- CHANGELOG.md
|
142
|
+
- Gemfile
|
143
|
+
- LICENSE
|
144
|
+
- README.md
|
145
|
+
- Rakefile
|
146
|
+
- bin/console
|
147
|
+
- bin/setup
|
148
|
+
- exe/crossmap
|
149
|
+
- gn_crossmap.gemspec
|
150
|
+
- lib/gn_crossmap.rb
|
151
|
+
- lib/gn_crossmap/collector.rb
|
152
|
+
- lib/gn_crossmap/column_collector.rb
|
153
|
+
- lib/gn_crossmap/reader.rb
|
154
|
+
- lib/gn_crossmap/resolver.rb
|
155
|
+
- lib/gn_crossmap/result_processor.rb
|
156
|
+
- lib/gn_crossmap/sci_name_collector.rb
|
157
|
+
- lib/gn_crossmap/version.rb
|
158
|
+
- lib/gn_crossmap/writer.rb
|
159
|
+
homepage: https://github.com/GlobalNamesArchitecture/gn_crossmap
|
160
|
+
licenses: []
|
161
|
+
metadata: {}
|
162
|
+
post_install_message:
|
163
|
+
rdoc_options: []
|
164
|
+
require_paths:
|
165
|
+
- lib
|
166
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - ">="
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: '0'
|
171
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
172
|
+
requirements:
|
173
|
+
- - ">="
|
174
|
+
- !ruby/object:Gem::Version
|
175
|
+
version: '0'
|
176
|
+
requirements: []
|
177
|
+
rubyforge_project:
|
178
|
+
rubygems_version: 2.2.3
|
179
|
+
signing_key:
|
180
|
+
specification_version: 4
|
181
|
+
summary: Crossmaps a list of scientific names to names from a data source in GN Index
|
182
|
+
test_files: []
|