dwc-archive 0.9.10 → 1.1.2
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +2 -1
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -1
- data/.travis.yml +4 -7
- data/CHANGELOG +14 -8
- data/Gemfile +3 -1
- data/LICENSE +1 -1
- data/README.md +119 -107
- data/Rakefile +13 -36
- data/dwc-archive.gemspec +23 -19
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +124 -0
- data/lib/dwc_archive/archive.rb +60 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
- data/lib/dwc_archive/expander.rb +88 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +91 -0
- data/lib/dwc_archive/generator_eml_xml.rb +116 -0
- data/lib/dwc_archive/generator_meta_xml.rb +72 -0
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +57 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +90 -0
- data/spec/files/file with characters(3).gz → data/spec/files/file with characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +47 -0
- data/spec/files/generator_meta.xml +19 -0
- data/spec/lib/classification_normalizer_spec.rb +96 -105
- data/spec/lib/core_spec.rb +43 -41
- data/spec/lib/darwin_core_spec.rb +108 -138
- data/spec/lib/generator_eml_xml_spec.rb +12 -11
- data/spec/lib/generator_meta_xml_spec.rb +12 -11
- data/spec/lib/generator_spec.rb +77 -69
- data/spec/lib/gnub_taxon_spec.rb +15 -17
- data/spec/lib/metadata_spec.rb +50 -41
- data/spec/lib/taxon_normalized_spec.rb +62 -65
- data/spec/lib/xml_reader_spec.rb +9 -12
- data/spec/spec_helper.rb +54 -51
- metadata +105 -88
- data/.rvmrc +0 -1
- data/] +0 -40
- data/lib/dwc-archive.rb +0 -107
- data/lib/dwc-archive/archive.rb +0 -40
- data/lib/dwc-archive/classification_normalizer.rb +0 -428
- data/lib/dwc-archive/core.rb +0 -17
- data/lib/dwc-archive/expander.rb +0 -84
- data/lib/dwc-archive/generator.rb +0 -85
- data/lib/dwc-archive/generator_eml_xml.rb +0 -86
- data/lib/dwc-archive/generator_meta_xml.rb +0 -58
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -48
- data/lib/dwc-archive/version.rb +0 -3
- data/lib/dwc-archive/xml_reader.rb +0 -64

data/lib/dwc_archive/ingester.rb
ADDED
@@ -0,0 +1,106 @@
+# encoding: utf-8
+class DarwinCore
+  # This module abstracts information for reading csv file to be used
+  # in several classes which need such functionality
+  module Ingester
+    attr_reader :data, :properties, :encoding, :fields_separator, :size
+    attr_reader :file_path, :fields, :line_separator, :quote_character,
+                :ignore_headers
+
+    def size
+      @size ||= init_size
+    end
+
+    def read(batch_size = 10_000)
+      DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
+      res = []
+      errors = []
+      args = define_csv_args
+      min_size = @fields.map { |f| f[:index].to_i || 0 }.sort[-1] + 1
+      csv = CSV.new(open(@file_path), args)
+      csv.each_with_index do |r, i|
+        next if @ignore_headers && i == 0
+        min_size > r.size ? errors << r : process_csv_row(res, errors, r)
+        next if i == 0 || i % batch_size != 0
+        DarwinCore.logger_write(@dwc.object_id,
+                                format("Ingested %s records from %s",
+                                       i, name))
+        next unless block_given?
+        yield [res, errors]
+        res = []
+        errors = []
+      end
+      yield [res, errors] if block_given?
+      [res, errors]
+    end
+
+    private
+
+    def define_csv_args
+      args = { col_sep: @field_separator }
+      @quote_character = "\b" if @quote_character.empty?
+      args.merge(quote_char: @quote_character)
+    end
+
+    def name
+      self.class.to_s.split("::")[-1].downcase
+    end
+
+    def process_csv_row(result, errors, row)
+      str = row.join("")
+      str = str.force_encoding("utf-8")
+      if str.encoding.name == "UTF-8" && str.valid_encoding?
+        result << row.map { |f| f.nil? ? nil : f.force_encoding("utf-8") }
+      else
+        errors << row
+      end
+    end
+
+    def init_attributes
+      @properties = @data[:attributes]
+      init_encoding
+      @field_separator = init_field_separator
+      @quote_character = @properties[:fieldsEnclosedBy] || ""
+      @line_separator = @properties[:linesTerminatedBy] || "\n"
+      @ignore_headers = @properties[:ignoreHeaderLines] &&
+                        [1, true].include?(@properties[:ignoreHeaderLines])
+      init_file_path
+      init_fields
+    end
+
+    def init_encoding
+      @encoding = @properties[:encoding] || "UTF-8"
+      accepted_encoding = ["utf-8", "utf8", "utf-16", "utf16"].
+                          include?(@encoding.downcase)
+      fail(
+        DarwinCore::EncodingError,
+        "No support for encodings other than utf-8 or utf-16 at the moment"
+      ) unless accepted_encoding
+    end
+
+    def init_file_path
+      file = @data[:location] ||
+             @data[:attributes][:location] ||
+             @data[:files][:location]
+      @file_path = File.join(@path, file)
+      fail DarwinCore::FileNotFoundError, "No file data" unless @file_path
+    end
+
+    def init_fields
+      @data[:field] = [data[:field]] if data[:field].class != Array
+      @fields = @data[:field].map { |f| f[:attributes] }
+      fail DarwinCore::InvalidArchiveError,
+           "No data fields are found" if @fields.empty?
+    end
+
+    def init_field_separator
+      res = @properties[:fieldsTerminatedBy] || ","
+      res = "\t" if res == "\\t"
+      res
+    end
+
+    def init_size
+      `wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i
+    end
+  end
+end
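
The Ingester module above is what the core and extension readers mix in, so its read method is the entry point for streaming records in batches. A minimal usage sketch follows; it is not part of the diff, the archive path is a placeholder, and it assumes DarwinCore.new plus a core reader that includes this module.

  require "dwc_archive"

  # Assumption: "data.tar.gz" is a valid Darwin Core Archive on disk.
  dwc = DarwinCore.new("data.tar.gz")

  # read(batch_size) yields [rows, errors] per batch; rows that fail UTF-8
  # validation are collected into errors rather than rows.
  dwc.core.read(1_000) do |rows, errors|
    puts "ingested #{rows.size} rows, rejected #{errors.size}"
  end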

data/lib/dwc_archive/metadata.rb
ADDED
@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+
+class DarwinCore
+  # Represents data from EML (Ecological Metadata Language) file
+  class Metadata
+    attr_reader :metadata
+    alias data metadata
+
+    def initialize(archive = nil)
+      @archive = archive
+      @metadata = @archive.eml
+    end
+
+    def id
+      fix_nil { @metadata[:eml][:dataset][:attributes][:id] }
+    end
+
+    def package_id
+      fix_nil { @metadata.data[:eml][:attributes][:packageId] }
+    end
+
+    def title
+      fix_nil { @metadata[:eml][:dataset][:title] }
+    end
+
+    def authors
+      return nil unless defined?(@metadata[:eml][:dataset][:creator])
+
+      authors = [@metadata[:eml][:dataset][:creator]].flatten
+      authors.map do |au|
+        { first_name: au[:individualName][:givenName],
+          last_name: au[:individualName][:surName],
+          email: au[:electronicMailAddress] }
+      end
+    end
+
+    def abstract
+      fix_nil { @metadata[:eml][:dataset][:abstract] }
+    end
+
+    def citation
+      fix_nil { @metadata[:eml][:additionalMetadata][:metadata][:citation] }
+    end
+
+    def url
+      fix_nil { @metadata[:eml][:dataset][:distribution][:online][:url] }
+    end
+
+    private
+
+    def fix_nil
+      yield
+    rescue NoMethodError
+      nil
+    end
+  end
+end
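
Metadata above is a thin wrapper around the hash parsed from the archive's eml.xml; fix_nil turns the NoMethodError raised by a missing nested node into nil. A usage sketch, not part of the diff, assuming a placeholder archive path and that DarwinCore exposes this object via a metadata reader:

  require "dwc_archive"

  dwc = DarwinCore.new("data.tar.gz")   # hypothetical archive with eml.xml
  meta = dwc.metadata

  puts meta.title
  puts meta.authors.inspect    # [{ first_name: ..., last_name: ..., email: ... }]
  puts meta.citation.inspect   # nil when the EML carries no citation node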

data/lib/dwc_archive/taxon_normalized.rb
ADDED
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+class DarwinCore
+  # Describes normalized taxon
+  class TaxonNormalized
+    attr_accessor :id, :local_id, :global_id, :source, :parent_id,
+                  :classification_path_id, :classification_path,
+                  :linnean_classification_path, :current_name,
+                  :current_name_canonical, :synonyms, :vernacular_names,
+                  :rank, :status
+
+    def initialize
+      @id = @parent_id = @rank = @status = nil
+      @current_name = @current_name_canonical = @source = @local_id = ""
+      @global_id = ""
+      @classification_path = []
+      @classification_path_id = []
+      @synonyms = []
+      @vernacular_names = []
+      @linnean_classification_path = []
+    end
+  end
+end
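
TaxonNormalized objects are the values DarwinCore::ClassificationNormalizer#normalize returns, keyed by taxon id (see the spec changes further down). A short sketch of walking that result, not part of the diff; the archive path is a placeholder:

  require "dwc_archive"

  dwc = DarwinCore.new("data.tar.gz")   # hypothetical archive
  normalizer = DarwinCore::ClassificationNormalizer.new(dwc)
  taxa = normalizer.normalize           # { "taxon_id" => TaxonNormalized, ... }

  taxa.each_value do |taxon|
    # classification_path holds the ancestor names assembled by the normalizer
    puts [taxon.current_name, taxon.rank,
          taxon.classification_path.join(" > ")].join(" | ")
  end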

data/lib/dwc_archive/xml_reader.rb
ADDED
@@ -0,0 +1,90 @@
+# frozen_string_literal: true
+
+class DarwinCore
+  # USAGE: Hash.from_xml:(YOUR_XML_STRING)
+  # modified from
+  # http://stackoverflow.com/questions/1230741/
+  # convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
+  module XmlReader
+    def self.from_xml(xml_io)
+      result = Nokogiri::XML(xml_io)
+      { result.root.name.to_sym => self::Node.new(result.root).value }
+    end
+
+    # Node is a helper class to parse xml into hash
+    class Node
+      def initialize(node)
+        @node = node
+        @val = {}
+      end
+
+      def value
+        if @node.element?
+          prepare_node_element
+        else
+          prepare(@node.content.to_s)
+        end
+      end
+
+      private
+
+      def prepare_node_element
+        add_attributes
+        add_children if @node.children.size.positive?
+        @val
+      end
+
+      def prepare(data)
+        data.instance_of?(String) && data.to_i.to_s == data ? data.to_i : data
+      end
+
+      def add_attributes
+        return if @node.attributes.empty?
+
+        @val[:attributes] = {}
+        @node.attributes.each_key do |key|
+          add_attribute(@val[:attributes], @node.attributes[key])
+        end
+      end
+
+      def add_attribute(attributes, attribute)
+        attributes[attribute.name.to_sym] = prepare(attribute.value)
+      end
+
+      def add_children
+        @node.children.each do |child|
+          process_child(child)
+        end
+      end
+
+      def process_child(child)
+        value = DarwinCore::XmlReader::Node.new(child).value
+        if child.name == "text"
+          handle_text(child, value)
+        else
+          add_child_to_value(child, value)
+        end
+      end
+
+      def add_child_to_value(child, value)
+        if @val[child.name.to_sym]
+          handle_child_node(child.name.to_sym, value)
+        else
+          @val[child.name.to_sym] = prepare(value)
+        end
+      end
+
+      def handle_child_node(child, val)
+        if @val[child].is_a?(Object::Array)
+          @val[child] << prepare(val)
+        else
+          @val[child] = [@val[child], prepare(val)]
+        end
+      end
+
+      def handle_text(child, val)
+        @val = prepare(val) unless child.next_sibling || child.previous_sibling
+      end
+    end
+  end
+end
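
XmlReader.from_xml converts a Nokogiri-parsed document into nested hashes: tag attributes land under :attributes, repeated sibling tags are collected into an array, and text-only nodes collapse to their value (cast to Integer when the string is numeric). A small illustration, not part of the diff, on an invented snippet:

  require "dwc_archive"

  xml = <<~XML
    <archive>
      <core encoding="UTF-8">
        <files><location>core.csv</location></files>
      </core>
    </archive>
  XML

  hash = DarwinCore::XmlReader.from_xml(xml)
  # => { archive: { core: { attributes: { encoding: "UTF-8" },
  #                         files: { location: "core.csv" } } } }
  puts hash[:archive][:core][:attributes][:encoding]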

data/spec/files/file with characters(3).gz → data/spec/files/file with characters(3).tar.gz
RENAMED
File without changes

data/spec/files/generator_eml.xml
ADDED
@@ -0,0 +1,47 @@
+<?xml version="1.0"?>
+<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:md="eml://ecoinformatics.org/methods-2.1.1" xmlns:proj="eml://ecoinformatics.org/project-2.1.1" xmlns:d="eml://ecoinformatics.org/dataset-2.1.1" xmlns:res="eml://ecoinformatics.org/resource-2.1.1" xmlns:dc="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" packageId="1234/2013-12-30::19:45:33" system="http://globalnames.org" xml:lang="en" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd">
+  <dataset id="1234">
+    <title>Test Classification</title>
+    <license>http://creativecommons.org/licenses/by-sa/3.0/</license>
+    <creator id="1" scope="document">
+      <individualName>
+        <givenName>John</givenName>
+        <surName>Doe</surName>
+      </individualName>
+      <organizationName>Example</organizationName>
+      <positionName>Assistant Professor</positionName>
+      <onlineUrl>http://example.org</onlineUrl>
+      <electronicMailAddress>jdoe@example.com</electronicMailAddress>
+    </creator>
+    <creator id="2" scope="document">
+      <individualName>
+        <givenName>Jane</givenName>
+        <surName>Doe</surName>
+      </individualName>
+      <electronicMailAddress>jane@example.com</electronicMailAddress>
+    </creator>
+    <metadataProvider>
+      <individualName>
+        <givenName>Jim</givenName>
+        <surName>Doe</surName>
+      </individualName>
+      <onlineUrl>http://aggregator.example.org</onlineUrl>
+      <electronicMailAddress>jimdoe@example.com</electronicMailAddress>
+    </metadataProvider>
+    <pubDate>2013-12-30 14:45:33 -0500</pubDate>
+    <abstract>
+      <para>test classification</para>
+    </abstract>
+    <contact>
+      <references>1</references>
+    </contact>
+    <contact>
+      <references>2</references>
+    </contact>
+  </dataset>
+  <additionalMetadata>
+    <metadata>
+      <citation>Test classification: Doe John, Doe Jane, Taxnonmy, 10, 1, 2010</citation>
+    </metadata>
+  </additionalMetadata>
+</eml:eml>
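
This fixture is the kind of document Metadata reads. Parsing it with the XmlReader from this diff (a sketch, not part of the diff; the fixture path is a placeholder assuming the gem's spec directory as working context) shows where accessors such as title get their values:

  require "dwc_archive"

  eml = DarwinCore::XmlReader.from_xml(File.read("spec/files/generator_eml.xml"))

  puts eml[:eml][:dataset][:title]             # => Test Classification
  puts eml[:eml][:dataset][:attributes][:id]   # => 1234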

data/spec/files/generator_meta.xml
ADDED
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<archive xmlns="http://rs.tdwg.org/dwc/text/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd">
+  <core encoding="UTF-8" fieldsTerminatedBy="," fieldsEnclosedBy="&quot;" linesTerminatedBy="&#10;" rowType="http://rs.tdwg.org/dwc/terms/Taxon" ignoreHeaderLines="1">
+    <files>
+      <location>core.csv</location>
+    </files>
+    <id index="0"/>
+    <field term="http://rs.tdwg.org/dwc/terms/parentNameUsageID" index="1"/>
+    <field term="http://rs.tdwg.org/dwc/terms/scientificName" index="2"/>
+    <field term="http://rs.tdwg.org/dwc/terms/taxonRank" index="3"/>
+  </core>
+  <extension encoding="UTF-8" fieldsTerminatedBy="," fieldsEnclosedBy="&quot;" linesTerminatedBy="&#10;" rowType="http://rs.gbif.org/terms/1.0/VernacularName" ignoreHeaderLines="1">
+    <files>
+      <location>vern.csv</location>
+    </files>
+    <coreid index="0"/>
+    <field term="http://rs.tdwg.org/dwc/terms/vernacularName" index="1"/>
+  </extension>
+</archive>
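
The attributes on the core and extension elements above are exactly what Ingester#init_attributes consumes (fieldsTerminatedBy, fieldsEnclosedBy, linesTerminatedBy, ignoreHeaderLines, encoding). A sketch, not part of the diff, with the fixture path again a placeholder:

  require "dwc_archive"

  meta = DarwinCore::XmlReader.from_xml(File.read("spec/files/generator_meta.xml"))
  core_attrs = meta[:archive][:core][:attributes]

  puts core_attrs[:fieldsTerminatedBy]   # => ","
  puts core_attrs[:ignoreHeaderLines]    # => 1 (XmlReader casts numeric strings)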

data/spec/lib/classification_normalizer_spec.rb
CHANGED
@@ -1,223 +1,214 @@
-
-# encoding: utf-8
+# frozen_string_literal: true

 describe DarwinCore::ClassificationNormalizer do
-
   subject(:dwca) { DarwinCore.new(file_path) }
   subject(:normalizer) { DarwinCore::ClassificationNormalizer.new(dwca) }
-
-  let(:file_dir) { File.expand_path(
+
+  let(:file_dir) { File.expand_path("../files", __dir__) }
   let(:file_path) { File.join(file_dir, file_name) }

-  describe
-  let(:file_path) { File.join(file_dir,
-  it
-  to
-
+  describe ".new" do
+    let(:file_path) { File.join(file_dir, "data.tar.gz") }
+    it do
+      expect(normalizer.is_a?(DarwinCore::ClassificationNormalizer)).to be true
+    end
+  end

-  describe
-  let(:file_name) {
+  describe "#normalize" do
+    let(:file_name) { "data.tar.gz" }

-    it
+    it "returns normalized data" do
      res = normalizer.normalize
      expect(res).to be normalizer.normalized_data
    end

+    context "flat list" do
+      let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

-
-    let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
-
-    it 'returns flat list' do
+      it "returns flat list" do
        normalizer.normalize
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
      end
    end

-    context
-    let(:file_name) {
+    context "synonyms from core" do
+      let(:file_name) { "synonyms_in_core_accepted_name_field.tar.gz" }

-      it
+      it "ingests synonyms using accepted_name field" do
        res = normalizer.normalize
-        syn = res.
-          map { |k,v| v }
+        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn.size).to be > 0
        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
      end
    end

-    context
-    let(:file_name) {
-    it
+    context "synonyms from extension" do
+      let(:file_name) { "synonyms_in_extension.tar.gz" }
+      it "ingests synonyms from extension" do
        res = normalizer.normalize
-        syn = res.
-          map { |k,v| v }
+        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn.size).to be > 0
        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
      end
    end

-    context
-    let(:file_name) {
+    context "synonyms are not extensions" do
+      let(:file_name) { "not_synonym_in_extension.tar.gz" }

-    it
+      it "does not ingest synonyms" do
        res = normalizer.normalize
-        syn = res.
-          map { |k,v| v }
+        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn).to be_empty
      end
    end

-    context
-    let(:file_name) {
-    it
+    context "with_extensions flag set on false" do
+      let(:file_name) { "synonyms_in_extension.tar.gz" }
+      it "should not harvest extensions" do
        res = normalizer.normalize(with_extensions: false)
-        syn = res.
-          map { |k,v| v }
+        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn).to be_empty
      end
    end

-    context
-    let(:file_name) {
+    context "linnean classification in file (class, order etc fields)" do
+      let(:file_name) { "linnean.tar.gz" }

-    it
+      it "assembles classification" do
        res = normalizer.normalize
        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
        expect(res.first[1].linnean_classification_path).
-          to eq [["Animalia", :kingdom],
-            ["Arthropoda", :phylum],
-            ["Insecta", :class],
-            ["Diptera", :order],
-            ["Cecidomyiidae", :family],
+          to eq [["Animalia", :kingdom],
+                 ["Arthropoda", :phylum],
+                 ["Insecta", :class],
+                 ["Diptera", :order],
+                 ["Cecidomyiidae", :family],
                 ["Resseliella", :genus]]
-
      end
    end

-    context
-    it
+    context "no linnean fields are given" do
+      it "returns empty linnean classification" do
        res = normalizer.normalize
        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
        expect(res.first[1].linnean_classification_path).to be_empty
      end
    end

-    context
-    let(:file_name) {
-    it
+    context "in the presence of scientificNameAuthorship field" do
+      let(:file_name) { "sci_name_authorship.tar.gz" }
+      it "returns normalized data" do
        normalizer.normalize
        expect(normalizer.darwin_core.file_name).
-          to eq
+          to eq "sci_name_authorship.tar.gz"
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
-        tn = normalizer.normalized_data[
-        expect(tn.current_name).to eq
-        expect(tn.current_name_canonical).to eq
+        tn = normalizer.normalized_data["leptogastrinae:tid:2688"]
+        expect(tn.current_name).to eq "Leptogaster fornicata Martin, 1957"
+        expect(tn.current_name_canonical).to eq "Leptogaster fornicata"
      end
    end

-    context
-    let(:file_name) {
-    it
+    context "when scientificNameAuthorship duplicates author info" do
+      let(:file_name) { "sci_name_authorship_dup.tar.gz" }
+      it "returns normalized data" do
        normalizer.normalize
        expect(normalizer.darwin_core.file_name).
-          to eq
+          to eq "sci_name_authorship_dup.tar.gz"
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
-        tn = normalizer.normalized_data[
-        expect(tn.current_name).to eq
-        expect(tn.current_name_canonical).to eq
+        tn = normalizer.normalized_data["leptogastrinae:tid:2688"]
+        expect(tn.current_name).to eq "Leptogaster fornicata Martin, 1957"
+        expect(tn.current_name_canonical).to eq "Leptogaster fornicata"
      end
    end

-    context
-    let(:file_name) {
-    it
+    context "coreid is empty" do
+      let(:file_name) { "empty_coreid.tar.gz" }
+      it "should ingest information" do
        res = normalizer.normalize
        expect(normalizer.darwin_core.file_name).
-          to eq
-        tn = res[
-        expect(tn.current_name).to eq
+          to eq "empty_coreid.tar.gz"
+        tn = res["Taxon9"]
+        expect(tn.current_name).to eq "Amanita phalloides"
      end
    end

-    context
-    let(:file_name) {
-    it
+    context "vernacular locality info" do
+      let(:file_name) { "language_locality.tar.gz" }
+      it "should ingest locality and language" do
        res = normalizer.normalize
-        tn = res[
+        tn = res["leptogastrinae:tid:42"]
        vn = tn.vernacular_names[0]
-        expect(vn.language).to eq
-        expect(vn.locality).to eq
+        expect(vn.language).to eq "en"
+        expect(vn.locality).to eq "New England"
      end
    end
  end

-  describe
-  let(:file_path) { File.join(file_dir,
+  describe "#name_strings" do
+    let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

-    context
-    it
+    context "before running #normalize" do
+      it "is empty" do
        expect(normalizer.name_strings).to be_empty
      end
    end
-
-    context 'after running #normalize' do
-    let(:normalized) { normalizer.tap { |n| n.normalize } }

-
-
+    context "after running #normalize" do
+      let(:normalized) { normalizer.tap(&:normalize) }
+
+      context "default attibutes" do
+        it "returns array" do
          expect(normalized.name_strings).to be_kind_of Array
          expect(normalized.name_strings.size).to be > 1
        end
      end

-      context
-      it
-        strings = normalized.name_strings(with_hash:true)
+      context "with_hash attribute" do
+        it "returns hash" do
+          strings = normalized.name_strings(with_hash: true)
          expect(strings).to be_kind_of Hash
          expect(strings.size).to be > 1
          expect(strings.values.uniq).to eq [1]
        end
      end
    end
-
  end

-  describe
-  let(:file_path) { File.join(file_dir,
+  describe "#vernacular_name_strings" do
+    let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

-    context
+    context "before running #normalize" do
      subject(:vern) { normalizer.vernacular_name_strings }
-
+
+      it "is empty" do
        expect(vern).to be_empty
      end
    end
-
-    context
-    let(:normalized) { normalizer.tap
+
+    context "after running #normalize" do
+      let(:normalized) { normalizer.tap(&:normalize) }
      subject(:vern) { normalized.vernacular_name_strings }
-      subject(:vern_w_hash)
-        vernacular_name_strings(with_hash: true)
+      subject(:vern_w_hash) do
+        normalized.vernacular_name_strings(with_hash: true)
+      end

-      context
-      it
+      context "default attibutes" do
+        it "returns array" do
          expect(vern).to be_kind_of Array
-          expect(vern.size).to be > 0
+          expect(vern.size).to be > 0
        end
      end

-      context
-      it
+      context "with_hash attribute" do
+        it "returns hash" do
          expect(vern_w_hash).to be_kind_of Hash
-          expect(vern_w_hash.size).to be > 0
+          expect(vern_w_hash.size).to be > 0
          expect(vern_w_hash.values.uniq).to eq [1]
        end
      end
-
    end
-
  end
-
end