gn_crossmap 0.2.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +8 -0
- data/README.md +53 -15
- data/exe/crossmap +4 -1
- data/lib/gn_crossmap.rb +2 -2
- data/lib/gn_crossmap/collector.rb +13 -3
- data/lib/gn_crossmap/reader.rb +16 -3
- data/lib/gn_crossmap/result_processor.rb +2 -2
- data/lib/gn_crossmap/version.rb +1 -1
- data/lib/gn_crossmap/writer.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 779a3a9193e896242a7e8717fd98bc779b417937
|
4
|
+
data.tar.gz: d8e9c4c7d72447a62d35f57fe26e33e9b5a69a29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 83f6f2f6be28c5891d93e5f6e5e09ac54481f4334640ef3b500822301bb5d1ea2cf0d5ca94f4ba0a2eb7b7e7afcd67fed8b37ad76c61ac184c2d9932d080ccf3
|
7
|
+
data.tar.gz: ae609d842d36b96de15ffe3013fe49e762da963191609486757d7ba9bbc5e85cdf674a19e24916fdcac7214d37adb1a9c089c95cd89ba9c4c7360256858cca11
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
# ``gn_crossmap`` CHANGELOG
|
2
2
|
|
3
|
+
## 1.0.0
|
4
|
+
|
5
|
+
* @dimus - #18 output file optionally removes original fields except `taxonID`
|
6
|
+
|
7
|
+
* @dimus - #19 `acceptedName` field is filled for all matched names
|
8
|
+
|
9
|
+
* @dimus - #22 output is now tab-separated instead of comma-separated
|
10
|
+
|
3
11
|
## 0.2.2
|
4
12
|
|
5
13
|
* @dimus - gem update
|
data/README.md
CHANGED
@@ -27,37 +27,75 @@ gem 'gn_crossmap'
|
|
27
27
|
|
28
28
|
And then execute:
|
29
29
|
|
30
|
-
|
30
|
+
```bash
|
31
|
+
bundle
|
32
|
+
```
|
31
33
|
|
32
34
|
Or install it yourself as:
|
33
35
|
|
34
|
-
|
36
|
+
```bash
|
37
|
+
gem install gn_crossmap
|
38
|
+
```
|
35
39
|
|
36
40
|
## Usage
|
37
41
|
|
38
42
|
### Usage from command line
|
39
43
|
|
40
|
-
|
41
|
-
|
44
|
+
```bash
|
45
|
+
# to see help
|
46
|
+
crossmap --help
|
47
|
+
|
48
|
+
# to compare with default source (Catalogue of Life)
|
49
|
+
crossmap -i my_list.csv -o my_list_col.csv
|
50
|
+
|
51
|
+
# to compare with other source (Index Fungorum in this example)
|
52
|
+
crossmap -i my_list.csv -o my_list_if.csv -d 5
|
53
|
+
|
54
|
+
# to use standard input and/or output
|
55
|
+
cat my_list.csv | crossmap -i - -o - > output
|
56
|
+
|
57
|
+
# to keep only taxonID from original input
|
58
|
+
crossmap -i my_list.csv -s
|
59
|
+
```
|
60
|
+
|
61
|
+
### Usage as Ruby Library (API description)
|
62
|
+
|
63
|
+
#### `GnCrossmap.run`
|
64
|
+
|
65
|
+
Compares an input list to a data source from [GN Resolver][resolver] and
|
66
|
+
writes result into an output file.
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
GnCrossmap.run(input, output, data_source_id, skip_original)
|
70
|
+
```
|
71
|
+
|
72
|
+
``input``
|
73
|
+
: (string) Either a path to a csv file with list of names, or "-" which
|
74
|
+
designates `STDIN`
|
75
|
+
|
76
|
+
``output``
|
77
|
+
: (string) Either a path to the output file, or "-" which designates `STDOUT`
|
78
|
+
|
79
|
+
``data_source_id``
|
80
|
+
: (integer) id of a data source from [GN resolver][resolver]
|
42
81
|
|
43
|
-
|
44
|
-
|
82
|
+
``skip_original``
|
83
|
+
: (boolean) if true only `taxonID` is preserved from original data. Otherwise
|
84
|
+
all original data is preserved
|
45
85
|
|
46
|
-
|
47
|
-
crossmap -i my_list.csv -o my_list_if.csv -d 5
|
86
|
+
#### `GnCrossmap.logger=`
|
48
87
|
|
49
|
-
|
50
|
-
cat my_list.csv | crossmap -i - -o - > output
|
88
|
+
Sets a custom logger (by default, log messages go to `STDERR`)
|
51
89
|
|
52
|
-
|
90
|
+
#### Usage Example
|
53
91
|
|
54
92
|
```ruby
|
55
93
|
require "gn_crossmap"
|
56
94
|
|
57
|
-
# If you want to change logger -- default Logging is to standard
|
95
|
+
# If you want to change logger -- default Logging is to standard error
|
58
96
|
GnCrossmap.logger = MyCustomLogger.new
|
59
97
|
|
60
|
-
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5)
|
98
|
+
GnCrossmap.run("path/to/input.csv", "path/to/output.csv", 5, true)
|
61
99
|
```
|
62
100
|
|
63
101
|
### Input file format
|
@@ -136,7 +174,7 @@ score | heuristic score from 0 to 1 where 1 is a good match, 0.5
|
|
136
174
|
|
137
175
|
The output format returns 7 possible types of matches:
|
138
176
|
|
139
|
-
1. **Exact match** - The exact name was matched (but ignoring non-ascii characters)
|
177
|
+
1. **Exact string match** - The exact name was matched (but ignoring non-ascii characters)
|
140
178
|
2. **Exact match by canonical form of a name** - The canonical form of the name (a version of a scientific name that contains complete versions of the latin words, but lacks insertions of subtaxa, annotations, or authority information) was matched
|
141
179
|
3. **Fuzzy match by canonical form** - The canonical form gave a fuzzy (detecting lexical or spelling variations of a name using Tony Rees' algorithm TAXAMATCH) match
|
142
180
|
4. **Partial exact match by species part of canonical form** - The canonical form returned a partial but exact match
|
@@ -178,7 +216,7 @@ See [LICENSE][license] for details.
|
|
178
216
|
[cov-link]: https://coveralls.io/r/GlobalNamesArchitecture/gn_crossmap?branch=master
|
179
217
|
[code-badge]: https://codeclimate.com/github/GlobalNamesArchitecture/gn_crossmap/badges/gpa.svg
|
180
218
|
[code-link]: https://codeclimate.com/github/GlobalNamesArchitecture/gn_crossmap
|
181
|
-
[dep-badge]: https://gemnasium.com/GlobalNamesArchitecture/gn_crossmap.
|
219
|
+
[dep-badge]: https://gemnasium.com/GlobalNamesArchitecture/gn_crossmap.svg
|
182
220
|
[dep-link]: https://gemnasium.com/GlobalNamesArchitecture/gn_crossmap
|
183
221
|
[resolver]: http://resolver.globalnames.org/data_sources
|
184
222
|
[rubygems]: https://rubygems.org
|
data/exe/crossmap
CHANGED
@@ -16,6 +16,8 @@ opts = Trollop.options do
|
|
16
16
|
opt(:output, "Path to output file", default: OUTPUT)
|
17
17
|
opt(:data_source_id, "Data source id from GN Resolver",
|
18
18
|
default: CATALOGUE_OF_LIFE)
|
19
|
+
opt(:skip_original, "If given, only 'taxonID' is shown " \
|
20
|
+
"from the original input", type: :boolean)
|
19
21
|
end
|
20
22
|
|
21
23
|
Trollop.die :input, "must be set" if opts[:input].nil?
|
@@ -24,7 +26,8 @@ unless File.exist?(opts[:input]) || opts[:input] == "-"
|
|
24
26
|
end
|
25
27
|
|
26
28
|
begin
|
27
|
-
GnCrossmap.run(opts[:input], opts[:output], opts[:data_source_id]
|
29
|
+
GnCrossmap.run(opts[:input], opts[:output], opts[:data_source_id],
|
30
|
+
opts[:skip_original])
|
28
31
|
rescue GnCrossmapError => e
|
29
32
|
GnCrossmap.logger.error(e.message)
|
30
33
|
end
|
data/lib/gn_crossmap.rb
CHANGED
@@ -22,9 +22,9 @@ module GnCrossmap
|
|
22
22
|
class << self
|
23
23
|
attr_writer :logger
|
24
24
|
|
25
|
-
def run(input, output, data_source_id)
|
25
|
+
def run(input, output, data_source_id, skip_original)
|
26
26
|
input_io, output_io = io(input, output)
|
27
|
-
reader = Reader.new(input_io, input_name(input))
|
27
|
+
reader = Reader.new(input_io, input_name(input), skip_original)
|
28
28
|
data = reader.read
|
29
29
|
writer = Writer.new(output_io, reader.original_fields,
|
30
30
|
output_name(output))
|
@@ -3,10 +3,11 @@ module GnCrossmap
|
|
3
3
|
class Collector
|
4
4
|
attr_reader :data
|
5
5
|
|
6
|
-
def initialize
|
6
|
+
def initialize(skip_original)
|
7
7
|
@data = []
|
8
8
|
@fields = nil
|
9
9
|
@collector = nil
|
10
|
+
@skip_original = skip_original
|
10
11
|
end
|
11
12
|
|
12
13
|
def process_row(row)
|
@@ -20,7 +21,12 @@ module GnCrossmap
|
|
20
21
|
@fields = @row.map { |f| prepare_field(f) }
|
21
22
|
@collector = collector_factory
|
22
23
|
err = "taxonID must be present in the csv header"
|
23
|
-
raise GnCrossmapError, err unless
|
24
|
+
raise GnCrossmapError, err unless taxon_id?
|
25
|
+
end
|
26
|
+
|
27
|
+
def taxon_id?
|
28
|
+
@taxon_id_index = @fields.index(:taxonid)
|
29
|
+
!@taxon_id_index.nil?
|
24
30
|
end
|
25
31
|
|
26
32
|
def prepare_field(field)
|
@@ -32,10 +38,14 @@ module GnCrossmap
|
|
32
38
|
@row = @fields.zip(@row).to_h
|
33
39
|
data = @collector.id_name_rank(@row)
|
34
40
|
return unless data
|
35
|
-
data[:original] =
|
41
|
+
data[:original] = prepare_original
|
36
42
|
@data << data
|
37
43
|
end
|
38
44
|
|
45
|
+
def prepare_original
|
46
|
+
@skip_original ? [@row[:taxonid]] : @row.values
|
47
|
+
end
|
48
|
+
|
39
49
|
def collector_factory
|
40
50
|
if @fields.include?(:scientificname)
|
41
51
|
SciNameCollector.new(@fields)
|
data/lib/gn_crossmap/reader.rb
CHANGED
@@ -4,11 +4,12 @@ module GnCrossmap
|
|
4
4
|
class Reader
|
5
5
|
attr_reader :original_fields
|
6
6
|
|
7
|
-
def initialize(csv_io, input_name)
|
7
|
+
def initialize(csv_io, input_name, skip_original)
|
8
8
|
@csv_io = csv_io
|
9
9
|
@col_sep = col_sep
|
10
10
|
@original_fields = nil
|
11
11
|
@input_name = input_name
|
12
|
+
@skip_original = skip_original
|
12
13
|
end
|
13
14
|
|
14
15
|
def read
|
@@ -25,15 +26,27 @@ module GnCrossmap
|
|
25
26
|
end
|
26
27
|
|
27
28
|
def parse_input
|
28
|
-
dc = Collector.new
|
29
|
+
dc = Collector.new(@skip_original)
|
29
30
|
csv = CSV.new(@csv_io, col_sep: col_sep)
|
30
31
|
csv.each_with_index do |row, i|
|
31
|
-
@original_fields = row
|
32
|
+
@original_fields = headers(row) if @original_fields.nil?
|
32
33
|
i += 1
|
33
34
|
GnCrossmap.log("Ingesting #{i}th csv row") if (i % 10_000).zero?
|
34
35
|
dc.process_row(row)
|
35
36
|
end && @csv_io.close
|
36
37
|
dc.data
|
37
38
|
end
|
39
|
+
|
40
|
+
def headers(row)
|
41
|
+
hdrs = row.dup
|
42
|
+
@skip_original ? taxon_id_header(hdrs) : hdrs
|
43
|
+
end
|
44
|
+
|
45
|
+
def taxon_id_header(hdrs)
|
46
|
+
hdrs.each do |h|
|
47
|
+
return [h] if h =~ /taxonid\s*$/i
|
48
|
+
end
|
49
|
+
[]
|
50
|
+
end
|
38
51
|
end
|
39
52
|
end
|
@@ -3,7 +3,7 @@ module GnCrossmap
|
|
3
3
|
class ResultProcessor
|
4
4
|
MATCH_TYPES = {
|
5
5
|
0 => "No match",
|
6
|
-
1 => "Exact match",
|
6
|
+
1 => "Exact string match",
|
7
7
|
2 => "Canonical form exact match",
|
8
8
|
3 => "Canonical form fuzzy match",
|
9
9
|
4 => "Partial canonical form match",
|
@@ -55,7 +55,7 @@ module GnCrossmap
|
|
55
55
|
[matched_type(result), datum[:supplied_name_string],
|
56
56
|
result[:name_string], result[:canonical_form],
|
57
57
|
@input[datum[:supplied_id]][:rank], matched_rank(result),
|
58
|
-
synonym, result[:current_name_string],
|
58
|
+
synonym, result[:current_name_string] || result[:name_string],
|
59
59
|
result[:edit_distance], result[:score], result[:taxon_id]]
|
60
60
|
end
|
61
61
|
|
data/lib/gn_crossmap/version.rb
CHANGED
data/lib/gn_crossmap/writer.rb
CHANGED
@@ -4,7 +4,7 @@ module GnCrossmap
|
|
4
4
|
def initialize(output_io, original_fields, output_name)
|
5
5
|
@output_io = output_io
|
6
6
|
@output_fields = output_fields(original_fields)
|
7
|
-
@output = CSV.new(@output_io)
|
7
|
+
@output = CSV.new(@output_io, col_sep: "\t")
|
8
8
|
@output << @output_fields
|
9
9
|
@output_name = output_name
|
10
10
|
GnCrossmap.log("Open output to #{@output_name}")
|
@@ -25,7 +25,7 @@ module GnCrossmap
|
|
25
25
|
original_fields + [:matchedType, :inputName, :matchedName,
|
26
26
|
:matchedCanonicalForm, :inputRank, :matchedRank,
|
27
27
|
:synonymStatus, :acceptedName, :matchedEditDistance,
|
28
|
-
:
|
28
|
+
:matchedScore, :matchTaxonID]
|
29
29
|
end
|
30
30
|
end
|
31
31
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gn_crossmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: trollop
|