dwc-archive 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/VERSION +1 -1
- data/features/dwca-creator.feature +40 -0
- data/features/{dwc-archive.feature → dwca-reader.feature} +1 -1
- data/features/step_definitions/dwc-creator_steps.rb +105 -0
- data/features/step_definitions/{dwc-archive_steps.rb → dwc-reader_steps.rb} +0 -0
- data/lib/dwc-archive.rb +9 -3
- data/lib/dwc-archive/errors.rb +1 -0
- data/lib/dwc-archive/generator.rb +73 -0
- data/lib/dwc-archive/generator_eml_xml.rb +50 -0
- data/lib/dwc-archive/generator_meta_xml.rb +51 -0
- data/lib/dwc-archive/ingester.rb +4 -4
- data/lib/dwc-archive/metadata.rb +1 -1
- data/lib/ruby_extensions.rb +3 -3
- metadata +11 -6
data/.gitignore
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: Creating and writing a Darwin Core Archive
|
2
|
+
In order to communicate with DwCA compatible programs
|
3
|
+
A User should be able to
|
4
|
+
Save data from ruby objects into Darwin Core Archive file
|
5
|
+
|
6
|
+
Scenario: Creating Core File
|
7
|
+
Given an array of urls for Darwin Core or other terms
|
8
|
+
And arrays of data in the order correpsonding to order of terms
|
9
|
+
When User creates generator
|
10
|
+
And User sends this data to core generator
|
11
|
+
Then these data should be saved as "darwin_core.txt" file
|
12
|
+
|
13
|
+
Scenario: Creating Extensions
|
14
|
+
Given 2 sets of data with terms as urls in the header
|
15
|
+
When User creates generator
|
16
|
+
And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
|
17
|
+
Then data are saved as "vernacular.txt" and "synonyms.txt"
|
18
|
+
|
19
|
+
Scenario: Creating metadata.xml and eml.xml
|
20
|
+
Given an array of urls for Darwin Core or other terms
|
21
|
+
And arrays of data in the order correpsonding to order of terms
|
22
|
+
And 2 sets of data with terms as urls in the header
|
23
|
+
When User creates generator
|
24
|
+
And User sends this data to core generator
|
25
|
+
And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
|
26
|
+
And User generates meta.xml and eml.xml
|
27
|
+
Then there should be "meta.xml" file with core and extensions informations
|
28
|
+
And there should be "eml.xml" file with authoriship information
|
29
|
+
|
30
|
+
Scenario: Making DarwinCore Archive file
|
31
|
+
Given an array of urls for Darwin Core or other terms
|
32
|
+
And arrays of data in the order correpsonding to order of terms
|
33
|
+
And 2 sets of data with terms as urls in the header
|
34
|
+
When User creates generator
|
35
|
+
And User sends this data to core generator
|
36
|
+
And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
|
37
|
+
And User generates meta.xml and eml.xml
|
38
|
+
And generates archive
|
39
|
+
Then there should be a valid new archive file
|
40
|
+
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require 'ruby-debug'
|
3
|
+
|
4
|
+
|
5
|
+
# Step definitions for the "Creating and writing a Darwin Core Archive"
# feature. State is shared between steps through instance variables:
#   @rows        - core header (term URLs)
#   @data        - core rows (header is prepended before sending to the generator)
#   @vernaculars - extension table, header row first
#   @synonyms    - extension table, header row first
#   @gen         - DarwinCore::Generator under test

Given /^an array of urls for Darwin Core or other terms$/ do
  @rows = [
    "http://rs.tdwg.org/dwc/terms/taxonID",
    "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
    "http://rs.tdwg.org/dwc/terms/scientificName",
    "http://rs.tdwg.org/dwc/terms/taxonRank"
  ]
end

Given /^arrays of data in the order correpsonding to order of terms$/ do
  @data = [
    [1, 0, "Plantae", "kingdom"],
    [2, 1, "Betula", "genus"],
    [3, 2, "Betula verucosa", "species"]
  ]
end

When /^User sends this data to core generator$/ do
  # The generator expects the header (term URLs) as the first row.
  @data = @data.unshift(@rows)
  @gen.add_core(@data, 'darwin_core.txt')
end

Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
  @gen.files.include?(file_name).should be_true
  # CSV.read closes the file itself; header row + 3 data rows are expected.
  CSV.read(File.join(@gen.path, file_name)).size.should == 4
end

Given /^2 sets of data with terms as urls in the header$/ do
  @vernaculars = [
    ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/vernacularName"],
    [1, "Plants"],
    [1, "Растения"],
    [2, "Birch"],
    [2, "Береза"],
    [3, "Wheeping Birch"],
    [3, "Береза плакучая"]
  ]
  @synonyms = [
    ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/scientificName", "http://rs.tdwg.org/dwc/terms/taxonomicStatus"],
    [1, "Betila Linnaeus, 1753", 'misspelling']
  ]
end

When /^User creates generator$/ do
  @gen = DarwinCore::Generator.new('/tmp/dwc.tar.gz')
end

When /^User adds extensions with file names "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
  @gen.add_extension(@vernaculars, file_name_1)
  @gen.add_extension(@synonyms, file_name_2)
end

Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
  [file_name_1, file_name_2].each do |file_name|
    @gen.files.include?(file_name).should be_true
    # More than one row means at least the header plus one data row.
    CSV.read(File.join(@gen.path, file_name)).size.should > 1
  end
end

# Both dots are escaped so the pattern matches the literal file names only.
When /^User generates meta\.xml and eml\.xml$/ do
  @gen.add_meta_xml
  @gen.add_eml_xml({
    :id => '1234',
    :title => 'Test Classification',
    :authors => [
      { :first_name => 'John',
        :last_name => 'Doe',
        :email => 'jdoe@example.com' },
      { :first_name => 'Jane',
        :last_name => 'Doe',
        :email => 'jane@example.com' }
    ],
    :abstract => 'test classification',
    :citation => 'Test classification: Doe John, Doe Jane, Taxnonmy, 10, 1, 2010',
    :url => 'http://example.com'
  })
end

Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
  @gen.files.include?(file_name).should be_true
  # Block form guarantees the handle is closed once the document is parsed.
  dom = File.open(File.join(@gen.path, file_name)) { |io| Nokogiri::XML(io) }
  dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
  dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
end

Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
  @gen.files.include?(file_name).should be_true
end

Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
  @dwca_file = file_name
end

When /^generates archive$/ do
  @gen.pack
end

Then /^there should be a valid new archive file$/ do
  dwc = DarwinCore.new('/tmp/dwc.tar.gz')
  dwc.archive.valid?.should be_true
end
|
105
|
+
|
File without changes
|
data/lib/dwc-archive.rb
CHANGED
@@ -1,13 +1,16 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
$:.unshift(File.dirname(__FILE__)) unless
|
3
3
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
4
|
+
R19 = RUBY_VERSION.split('.')[0..1].join('').to_i > 18
|
4
5
|
require 'ruby_extensions'
|
5
6
|
require 'fileutils'
|
6
|
-
|
7
|
+
require 'ostruct'
|
8
|
+
|
9
|
+
if R19
|
10
|
+
require 'csv'
|
11
|
+
else
|
7
12
|
require 'fastercsv'
|
8
13
|
CSV = FasterCSV
|
9
|
-
rescue LoadError
|
10
|
-
require 'csv'
|
11
14
|
end
|
12
15
|
require 'dwc-archive/ingester'
|
13
16
|
require 'dwc-archive/errors'
|
@@ -16,6 +19,9 @@ require 'dwc-archive/archive'
|
|
16
19
|
require 'dwc-archive/core'
|
17
20
|
require 'dwc-archive/extension'
|
18
21
|
require 'dwc-archive/metadata'
|
22
|
+
require 'dwc-archive/generator'
|
23
|
+
require 'dwc-archive/generator_meta_xml'
|
24
|
+
require 'dwc-archive/generator_eml_xml'
|
19
25
|
|
20
26
|
class DarwinCore
|
21
27
|
attr_reader :archive, :core, :metadata, :extensions
|
data/lib/dwc-archive/errors.rb
CHANGED
@@ -0,0 +1,73 @@
|
|
1
|
+
class DarwinCore
  # Assembles a Darwin Core Archive: writes the core file, extension files,
  # meta.xml and eml.xml into a scratch directory and packs that directory
  # into a gzipped tarball.
  class Generator
    attr_reader :eml_xml_data, :path

    #TODO refactor -- for now copying expander methods
    #
    # dwc_path - where #pack writes the resulting .tar.gz archive.
    # tmp_dir  - existing directory in which a randomly named scratch
    #            directory ("dwc_<number>") is created.
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
      FileUtils.mkdir(@path)
      @meta_xml_data = {:extensions => []}
      @eml_xml_data = {:id => nil, :title => nil, :authors => [], :abstract => nil, :citation => nil, :url => nil}
      # An explicit external encoding is only requested when R19 is true.
      @write = R19 ? 'w:utf-8' : 'w'
    end

    #TODO refactor!
    # Removes the scratch directory and everything in it.
    def clean
      # File.exist? instead of FileTest.exists?, which newer rubies removed.
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Writes the core data file and records it for meta.xml.
    #
    # data         - array of rows; the first row is the header and must
    #                consist of term URLs. The array is not modified.
    # file_name    - name of the CSV file created inside #path.
    # keep_headers - when true the short term names (last URL segment) are
    #                written as the first line of the file.
    #
    # Returns nil. Raises GeneratorError when the header is missing or any
    # header field is not an http:// URL.
    def add_core(data, file_name, keep_headers = true)
      @meta_xml_data[:core] = write_table(data, file_name, keep_headers)
      nil
    end

    # Writes an extension data file and records it for meta.xml.
    # Parameters, return value and errors are the same as for #add_core.
    def add_extension(data, file_name, keep_headers = true)
      @meta_xml_data[:extensions] << write_table(data, file_name, keep_headers)
      nil
    end

    # Creates meta.xml in #path from the core/extensions added so far.
    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    # Stores +data+ as #eml_xml_data and creates eml.xml in #path from it.
    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    # Returns the sorted entry names of the scratch directory, or nil when
    # the directory does not exist.
    def files
      return nil unless @path && File.exist?(@path)
      # Only the "." and ".." pseudo-entries are dropped; a regex on the
      # trailing dots would also hide real files whose names end with ".".
      Dir.entries(@path).reject { |entry| entry == '.' || entry == '..' }.sort
    end

    # Packs every non-hidden entry of the scratch directory into a gzipped
    # tarball at the dwc_path given to the constructor.
    #
    # Returns what Kernel#system returns (true on success).
    def pack
      # Argument-list form of system bypasses the shell, so paths with
      # spaces or shell metacharacters are passed to tar untouched.
      # Dir.chdir raises if the scratch directory is gone instead of
      # archiving whatever the current directory happens to be.
      Dir.chdir(@path) do
        system('tar', '-zcf', @dwc_path, '--', *Dir.glob('*').sort)
      end
    end

    private

    # Validates the header row of +data+, writes the rows as CSV into
    # +file_name+ and returns the description hash used for meta.xml
    # (:fields holds the stripped term URLs).
    def write_table(data, file_name, keep_headers)
      header, *rows = data
      terms = header.is_a?(Array) ? header.map { |term| term.to_s.strip } : []
      # Validate before the file is opened so a bad header leaves no
      # empty file and no dangling handle behind.
      if terms.empty? || terms.any? { |term| term !~ /\Ahttp:\/\// }
        raise GeneratorError, "No header in core data, or header fields are not urls"
      end
      rows.unshift(terms.map { |term| term.split("/")[-1] }) if keep_headers
      CSV.open(File.join(@path, file_name), @write) do |csv|
        rows.each { |row| csv << row }
      end
      {:fields => terms, :ignoreHeaderLines => keep_headers, :location => file_name}
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Writes eml.xml (title, authors, abstract and citation of the dataset)
    # into the generator's scratch directory.
    class EmlXml
      # data - hash read with the keys :id, :title, :authors, :abstract and
      #        :citation; :authors is an array of hashes with :first_name,
      #        :last_name and :email (nil is treated as no authors).
      # path - directory in which eml.xml is created.
      def initialize(data, path)
        @data = data
        @path = path
        @write = R19 ? 'w:utf-8' : 'w'
      end

      # Builds the EML document and writes it to <path>/eml.xml.
      # Returns nil.
      def create
        authors = @data[:authors] || []
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml( :packageId => "eml.1.1",
            :system => "knb",
            'xmlns:eml' => "eml://ecoinformatics.org/eml-2.1.0",
            'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:schemaLocation' => "eml://ecoinformatics.org/eml-2.1.0 eml.xsd" ) do
            xml.dataset(:id => @data[:id]) do
              xml.title(@data[:title])
              # Creators are numbered from 1; the same ids are reused for
              # the <contact><references> elements after the abstract.
              authors.each_with_index do |a, i|
                xml.creator(:id => i + 1, :scope => 'document') do
                  xml.individualName do
                    xml.givenName(a[:first_name])
                    xml.surName(a[:last_name])
                  end
                  xml.electronicMailAddress(a[:email])
                end
              end
              xml.abstract(@data[:abstract])
              1.upto(authors.size) do |creator_id|
                xml.contact { xml.references(creator_id) }
              end
            end
            xml.additionalMetadata do
              xml.metadata do
                xml.citation(@data[:citation])
              end
            end
            # Assign the first declared namespace to the root element.
            xml.parent.namespace = xml.parent.namespace_definitions.first
          end
        end
        # File.open with a block closes the handle even if the write fails.
        File.open(File.join(@path, 'eml.xml'), @write) { |f| f.write(builder.to_xml) }
        nil
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Writes meta.xml, the descriptor listing the core file, the extension
    # files and the term/column index of every field in them.
    class MetaXml
      # Row type written for the core and every extension element.
      # (The path segment is "terms"; it was previously misspelled "tems".)
      ROW_TYPE = "http://rs.tdwg.org/dwc/terms/Taxon".freeze

      # data - hash with :core (a hash with :fields, :ignoreHeaderLines and
      #        :location) and :extensions (an array of such hashes).
      # path - directory in which meta.xml is created.
      def initialize(data, path)
        @data = data
        @path = path
        @write = R19 ? 'w:utf-8' : 'w'
      end

      # Builds the descriptor and writes it to <path>/meta.xml.
      #
      # Returns nil. Raises GeneratorError when no core data were supplied
      # or when a file does not have exactly one taxonID field.
      def create
        core = @data[:core]
        # Fail with a descriptive error rather than NoMethodError on nil.
        raise GeneratorError, "Core data are required to generate meta.xml" unless core
        builder = Nokogiri::XML::Builder.new do |xml|
          opts = { :encoding => "UTF-8", :fieldsTerminatedBy => ",", :fieldsEnclosedBy => '"', :linesTerminatedBy => "\n", :rowType => ROW_TYPE }
          xml.starArchive(:xmlns => "http://rs.tdwg.org/dwc/terms/xsd/archive/",
            "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd",
            :fileRoot => ".") do
            xml.core(opts.merge(:ignoreHeaderLines => core[:ignoreHeaderLines])) do
              xml.files { xml.location(core[:location]) }
              taxon_id, fields = find_taxon_id(core[:fields])
              # The taxonID column becomes <id>; all other columns <field>.
              xml.id_(:term => taxon_id[0], :index => taxon_id[1])
              fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
            end
            (@data[:extensions] || []).each do |e|
              xml.extension(opts.merge(:ignoreHeaderLines => e[:ignoreHeaderLines])) do
                xml.files { xml.location(e[:location]) }
                taxon_id, fields = find_taxon_id(e[:fields])
                # In extensions the taxonID column links back to the core.
                xml.coreid(:term => taxon_id[0], :index => taxon_id[1])
                fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
              end
            end
          end
        end
        # File.open with a block closes the handle even if the write fails.
        File.open(File.join(@path, 'meta.xml'), @write) { |f| f.write(builder.to_xml) }
        nil
      end

      private

      # Pairs every term in +data+ with its column index and separates the
      # taxonID term (matched case-insensitively at the end of the URL)
      # from the rest.
      #
      # Returns [[taxon_id_term, index], [[term, index], ...]].
      # Raises GeneratorError unless exactly one taxonID term is present.
      def find_taxon_id(data)
        indexed = []
        data.each_with_index { |f, i| indexed << [f.strip, i] }
        taxon_id, fields = indexed.partition { |f| f[0].match(/\/taxonid$/i) }
        if taxon_id.size != 1
          raise GeneratorError, "Expected exactly one taxonID field, found #{taxon_id.size}"
        end
        [taxon_id[0], fields]
      end

    end
  end
end
|
51
|
+
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -24,12 +24,12 @@ class DarwinCore
|
|
24
24
|
private
|
25
25
|
# Appends +row+ to +result+ when its concatenated cells form valid UTF-8
# text, otherwise to +errors+. Returns the array the row was appended to.
def process_csv_row(result, errors, row)
  joined = row.join('')
  valid =
    if R19
      # Reinterpret the bytes as UTF-8, then check they are well formed.
      joined = joined.force_encoding('utf-8')
      joined.encoding.name == "UTF-8" && joined.valid_encoding?
    else
      # Without R19 the check relies on the UTF8RGX pattern, loaded on demand.
      require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
      UTF8RGX === joined
    end
  (valid ? result : errors) << row
end
|
35
35
|
|
data/lib/dwc-archive/metadata.rb
CHANGED
data/lib/ruby_extensions.rb
CHANGED
@@ -9,7 +9,7 @@ class Hash
|
|
9
9
|
result = Nokogiri::XML(xml_io)
|
10
10
|
return { result.root.name.to_sym => xml_node_to_hash(result.root)}
|
11
11
|
rescue Exception => e
|
12
|
-
|
12
|
+
raise e
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
@@ -53,8 +53,8 @@ class Hash
|
|
53
53
|
|
54
54
|
# Converts a string value into a richer Ruby value where possible:
# "true"/"false" (surrounding whitespace ignored) become booleans and
# strings that are exactly an integer's decimal form become integers.
# Anything that is not exactly a String is returned unchanged.
def prepare(data)
  return data unless data.class == String
  case data.strip
  when "true" then return true
  when "false" then return false
  end
  # Round-tripping through to_i rejects values such as "012" or " 12".
  number = data.to_i
  number.to_s == data ? number : data
end
|
60
60
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
8
|
- 3
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dmitry Mozzherin
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-07-12 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -64,8 +64,10 @@ files:
|
|
64
64
|
- README.rdoc
|
65
65
|
- Rakefile
|
66
66
|
- VERSION
|
67
|
-
- features/
|
68
|
-
- features/
|
67
|
+
- features/dwca-creator.feature
|
68
|
+
- features/dwca-reader.feature
|
69
|
+
- features/step_definitions/dwc-creator_steps.rb
|
70
|
+
- features/step_definitions/dwc-reader_steps.rb
|
69
71
|
- features/support/env.rb
|
70
72
|
- lib/dwc-archive.rb
|
71
73
|
- lib/dwc-archive/.expander.rb.swo
|
@@ -74,6 +76,9 @@ files:
|
|
74
76
|
- lib/dwc-archive/errors.rb
|
75
77
|
- lib/dwc-archive/expander.rb
|
76
78
|
- lib/dwc-archive/extension.rb
|
79
|
+
- lib/dwc-archive/generator.rb
|
80
|
+
- lib/dwc-archive/generator_eml_xml.rb
|
81
|
+
- lib/dwc-archive/generator_meta_xml.rb
|
77
82
|
- lib/dwc-archive/ingester.rb
|
78
83
|
- lib/dwc-archive/metadata.rb
|
79
84
|
- lib/dwc-archive/utf_regex_ruby18.rb
|