dwc-archive 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/VERSION +1 -1
- data/features/dwca-creator.feature +40 -0
- data/features/{dwc-archive.feature → dwca-reader.feature} +1 -1
- data/features/step_definitions/dwc-creator_steps.rb +105 -0
- data/features/step_definitions/{dwc-archive_steps.rb → dwc-reader_steps.rb} +0 -0
- data/lib/dwc-archive.rb +9 -3
- data/lib/dwc-archive/errors.rb +1 -0
- data/lib/dwc-archive/generator.rb +73 -0
- data/lib/dwc-archive/generator_eml_xml.rb +50 -0
- data/lib/dwc-archive/generator_meta_xml.rb +51 -0
- data/lib/dwc-archive/ingester.rb +4 -4
- data/lib/dwc-archive/metadata.rb +1 -1
- data/lib/ruby_extensions.rb +3 -3
- metadata +11 -6
data/.gitignore
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: Creating and writing a Darwin Core Archive
|
2
|
+
In order to communicate with DwCA compatible programs
|
3
|
+
A User should be able to
|
4
|
+
Save data from ruby objects into Darwin Core Archive file
|
5
|
+
|
6
|
+
Scenario: Creating Core File
|
7
|
+
Given an array of urls for Darwin Core or other terms
|
8
|
+
And arrays of data in the order correpsonding to order of terms
|
9
|
+
When User creates generator
|
10
|
+
And User sends this data to core generator
|
11
|
+
Then these data should be saved as "darwin_core.txt" file
|
12
|
+
|
13
|
+
Scenario: Creating Extensions
|
14
|
+
Given 2 sets of data with terms as urls in the header
|
15
|
+
When User creates generator
|
16
|
+
And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
|
17
|
+
Then data are saved as "vernacular.txt" and "synonyms.txt"
|
18
|
+
|
19
|
+
Scenario: Creating metadata.xml and eml.xml
|
20
|
+
Given an array of urls for Darwin Core or other terms
|
21
|
+
And arrays of data in the order correpsonding to order of terms
|
22
|
+
And 2 sets of data with terms as urls in the header
|
23
|
+
When User creates generator
|
24
|
+
And User sends this data to core generator
|
25
|
+
And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
|
26
|
+
And User generates meta.xml and eml.xml
|
27
|
+
Then there should be "meta.xml" file with core and extensions informations
|
28
|
+
And there should be "eml.xml" file with authoriship information
|
29
|
+
|
30
|
+
Scenario: Making DarwinCore Archive file
|
31
|
+
Given an array of urls for Darwin Core or other terms
|
32
|
+
And arrays of data in the order correpsonding to order of terms
|
33
|
+
And 2 sets of data with terms as urls in the header
|
34
|
+
When User creates generator
|
35
|
+
And User sends this data to core generator
|
36
|
+
And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
|
37
|
+
And User generates meta.xml and eml.xml
|
38
|
+
And generates archive
|
39
|
+
Then there should be a valid new archive file
|
40
|
+
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require 'ruby-debug'
|
3
|
+
|
4
|
+
|
5
|
+
# Step definitions for the "Creating and writing a Darwin Core Archive"
# feature. State is shared between steps through instance variables:
#   @rows        - term urls used as the header of the core file
#   @data        - core data rows (the header is prepended before sending)
#   @vernaculars - extension data set, header row first
#   @synonyms    - extension data set, header row first
#   @gen         - the DarwinCore::Generator under test

Given /^an array of urls for Darwin Core or other terms$/ do
  @rows = ["http://rs.tdwg.org/dwc/terms/taxonID", "http://rs.tdwg.org/dwc/terms/parentNameUsageID", "http://rs.tdwg.org/dwc/terms/scientificName", "http://rs.tdwg.org/dwc/terms/taxonRank"]
end

Given /^arrays of data in the order correpsonding to order of terms$/ do
  @data = [
    [1, 0, "Plantae", "kingdom"],
    [2, 1, "Betula", "genus"],
    [3, 2, "Betula verucosa", "species"]
  ]
end

When /^User sends this data to core generator$/ do
  # The generator expects the header (term urls) as the first row.
  @data = @data.unshift @rows
  @gen.add_core(@data, 'darwin_core.txt')
end

Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
  file = File.join(@gen.path, file_name)
  @gen.files.include?(file_name).should be_true
  # One header row plus three data rows; CSV.read closes the file itself.
  CSV.read(file).size.should == 4
end

Given /^2 sets of data with terms as urls in the header$/ do
  @vernaculars = [
    ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/vernacularName"],
    [1, "Plants"],
    [1, "Растения"],
    [2, "Birch"],
    [2, "Береза"],
    [3, "Wheeping Birch"],
    [3, "Береза плакучая"]
  ]
  @synonyms = [
    ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/scientificName", "http://rs.tdwg.org/dwc/terms/taxonomicStatus"],
    [1, "Betila Linnaeus, 1753", 'misspelling']
  ]
end

When /^User creates generator$/ do
  @gen = DarwinCore::Generator.new('/tmp/dwc.tar.gz')
end

When /^User adds extensions with file names "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
  @gen.add_extension(@vernaculars, file_name_1)
  @gen.add_extension(@synonyms, file_name_2)
end

Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
  [file_name_1, file_name_2].each do |file_name|
    file = File.join(@gen.path, file_name)
    @gen.files.include?(file_name).should be_true
    # More than just the header row must have been written.
    CSV.read(file).size.should > 1
  end
end

When /^User generates meta\.xml and eml\.xml$/ do
  @gen.add_meta_xml
  @gen.add_eml_xml({
    :id => '1234',
    :title => 'Test Classification',
    :authors => [
      { :first_name => 'John',
        :last_name => 'Doe',
        :email => 'jdoe@example.com' },
      { :first_name => 'Jane',
        :last_name => 'Doe',
        :email => 'jane@example.com' }
    ],
    :abstract => 'test classification',
    :citation => 'Test classification: Doe John, Doe Jane, Taxnonmy, 10, 1, 2010',
    :url => 'http://example.com'
  })
end

Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
  meta = File.join(@gen.path, file_name)
  @gen.files.include?(file_name).should be_true
  # Block form closes the handle once the document has been parsed.
  dom = File.open(meta) { |f| Nokogiri::XML(f) }
  dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
  dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
end

Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
  # Only the presence of the file is verified here, not its content.
  @gen.files.include?(file_name).should be_true
end

Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
  @dwca_file = file_name
end

When /^generates archive$/ do
  @gen.pack
end

Then /^there should be a valid new archive file$/ do
  dwc = DarwinCore.new('/tmp/dwc.tar.gz')
  dwc.archive.valid?.should be_true
end
|
105
|
+
|
File without changes
|
data/lib/dwc-archive.rb
CHANGED
@@ -1,13 +1,16 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
$:.unshift(File.dirname(__FILE__)) unless
|
3
3
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
4
|
+
R19 = RUBY_VERSION.split('.')[0..1].join('').to_i > 18
|
4
5
|
require 'ruby_extensions'
|
5
6
|
require 'fileutils'
|
6
|
-
|
7
|
+
require 'ostruct'
|
8
|
+
|
9
|
+
if R19
|
10
|
+
require 'csv'
|
11
|
+
else
|
7
12
|
require 'fastercsv'
|
8
13
|
CSV = FasterCSV
|
9
|
-
rescue LoadError
|
10
|
-
require 'csv'
|
11
14
|
end
|
12
15
|
require 'dwc-archive/ingester'
|
13
16
|
require 'dwc-archive/errors'
|
@@ -16,6 +19,9 @@ require 'dwc-archive/archive'
|
|
16
19
|
require 'dwc-archive/core'
|
17
20
|
require 'dwc-archive/extension'
|
18
21
|
require 'dwc-archive/metadata'
|
22
|
+
require 'dwc-archive/generator'
|
23
|
+
require 'dwc-archive/generator_meta_xml'
|
24
|
+
require 'dwc-archive/generator_eml_xml'
|
19
25
|
|
20
26
|
class DarwinCore
|
21
27
|
attr_reader :archive, :core, :metadata, :extensions
|
data/lib/dwc-archive/errors.rb
CHANGED
@@ -0,0 +1,73 @@
|
|
1
|
+
class DarwinCore
  # Assembles a Darwin Core Archive: data files, meta.xml and eml.xml are
  # written into a scratch directory and then packed with tar into +dwc_path+.
  class Generator
    # eml_xml_data - hash of EML metadata (replaced by #add_eml_xml)
    # path         - scratch directory in which the archive files are built
    attr_reader :eml_xml_data, :path

    #TODO refactor -- for now copying expander methods
    #
    # dwc_path - path of the tar.gz archive produced by #pack
    # tmp_dir  - parent directory for the scratch directory; the scratch
    #            directory itself gets a random name and is created here
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
      FileUtils.mkdir(@path)
      @meta_xml_data = { :extensions => [] }
      @eml_xml_data = { :id => nil, :title => nil, :authors => [], :abstract => nil, :citation => nil, :url => nil }
      @write = R19 ? 'w:utf-8' : 'w'
    end

    #TODO refactor!
    #
    # Removes the scratch directory together with everything in it.
    def clean
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Writes the core data file and registers it for meta.xml.
    #
    # data         - array of rows; the first row is the header and every
    #                header field must be a term url starting with http://
    # file_name    - name of the csv file created in the scratch directory
    # keep_headers - when true a header row made of the last url segment of
    #                each term is written as the first line of the file
    #
    # Raises GeneratorError when the header is missing or is not made of urls.
    # The +data+ argument is not modified.
    def add_core(data, file_name, keep_headers = true)
      @meta_xml_data[:core] = write_data_file('core', data, file_name, keep_headers)
      nil
    end

    # Writes an extension data file and registers it for meta.xml.
    # Parameters and errors are the same as for #add_core.
    def add_extension(data, file_name, keep_headers = true)
      @meta_xml_data[:extensions] << write_data_file('extension', data, file_name, keep_headers)
      nil
    end

    # Creates meta.xml from the files registered so far.
    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    # Stores +data+ as the EML metadata and creates eml.xml from it.
    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    # Returns the sorted names of the entries in the scratch directory, or
    # nil when the directory does not exist.
    def files
      return nil unless @path && File.exist?(@path)
      Dir.entries(@path).reject { |e| e == '.' || e == '..' }.sort
    end

    # Packs every entry of the scratch directory into the tar.gz archive.
    #
    # tar is invoked without a shell, so paths containing spaces or shell
    # metacharacters are passed through verbatim. As before, a relative
    # archive path is resolved against the scratch directory.
    #
    # Returns the result of Kernel#system, or false when the scratch
    # directory no longer exists (nothing is packed in that case).
    def pack
      entries = files
      return false if entries.nil?
      Dir.chdir(@path) { system('tar', '-zcf', @dwc_path, *entries) }
    end

    private

    # Validates the header, writes the rows as csv and returns the hash that
    # describes the written file for meta.xml.
    def write_data_file(kind, data, file_name, keep_headers)
      # Validate before opening so a bad header leaves no empty file behind.
      header = term_urls(kind, data)
      rows = data[1..-1]
      CSV.open(File.join(@path, file_name), @write) do |csv|
        csv << header.map { |f| f.split("/")[-1] } if keep_headers
        rows.each { |row| csv << row }
      end
      { :fields => header, :ignoreHeaderLines => keep_headers, :location => file_name }
    end

    # Returns stripped copies of the header fields of +data+, or raises
    # GeneratorError when there is no header or a field is not an http url.
    def term_urls(kind, data)
      message = "No header in #{kind} data, or header fields are not urls"
      raw = data.respond_to?(:first) ? data.first : nil
      raise GeneratorError, message unless raw.is_a?(Array) && !raw.empty?
      header = raw.map { |f| f.to_s.strip }
      raise GeneratorError, message unless header.all? { |f| f =~ %r{\Ahttp://} }
      header
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Writes the eml.xml metadata file of an archive.
    class EmlXml
      # data - hash read with the keys :id, :title, :authors, :abstract and
      #        :citation; every author is a hash read with the keys
      #        :first_name, :last_name and :email
      # path - directory in which eml.xml is created
      def initialize(data, path)
        @data = data
        @path = path
        @write = R19 ? 'w:utf-8' : 'w'
      end

      # Builds the EML document and saves it as eml.xml in the target
      # directory. A missing :authors entry is treated as an empty list.
      def create
        authors = @data[:authors] || []
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml( :packageId => "eml.1.1",
                   :system => "knb",
                   'xmlns:eml' => "eml://ecoinformatics.org/eml-2.1.0",
                   'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
                   'xsi:schemaLocation' => "eml://ecoinformatics.org/eml-2.1.0 eml.xsd" ) do
            xml.dataset(:id => @data[:id]) do
              xml.title(@data[:title])
              # Creator ids are 1-based positions in the author list; each
              # one is referenced again from a contact element below.
              contacts = []
              authors.each_with_index do |a, i|
                creator_id = i + 1
                contacts << creator_id
                xml.creator(:id => creator_id, :scope => 'document') do
                  xml.individualName do
                    xml.givenName(a[:first_name])
                    xml.surName(a[:last_name])
                  end
                  xml.electronicMailAddress(a[:email])
                end
              end
              xml.abstract(@data[:abstract])
              contacts.each do |contact|
                xml.contact { xml.references(contact) }
              end
            end
            xml.additionalMetadata do
              xml.metadata do
                xml.citation(@data[:citation])
              end
            end
            # Put the root element itself into the first namespace declared
            # on it.
            xml.parent.namespace = xml.parent.namespace_definitions.first
          end
        end
        # Block form closes the file even when writing fails.
        File.open(File.join(@path, 'eml.xml'), @write) { |f| f.write(builder.to_xml) }
        nil
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Writes the meta.xml descriptor that lists the core and extension data
    # files of an archive together with their term columns.
    class MetaXml
      # data - hash with a :core entry and an :extensions array; every entry
      #        is a hash read with the keys :fields (term urls),
      #        :ignoreHeaderLines and :location (file name)
      # path - directory in which meta.xml is created
      def initialize(data, path)
        @data = data
        @path = path
        @write = R19 ? 'w:utf-8' : 'w'
      end

      # Builds the descriptor and saves it as meta.xml in the target directory.
      #
      # Raises GeneratorError when no core was registered, or when a file
      # does not have exactly one taxon id column.
      def create
        core = @data[:core]
        raise GeneratorError, "Core data must be added before meta.xml is generated" unless core
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.starArchive(:xmlns => "http://rs.tdwg.org/dwc/terms/xsd/archive/",
                          "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
                          "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd",
                          :fileRoot => ".") do
            xml.core(file_attributes(core[:ignoreHeaderLines])) do
              xml.files { xml.location(core[:location]) }
              taxon_id, fields = find_taxon_id(core[:fields])
              # id_ with a trailing underscore is how the original spells the
              # builder call that emits the id element.
              xml.id_(:term => taxon_id[0], :index => taxon_id[1])
              fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
            end
            (@data[:extensions] || []).each do |e|
              xml.extension(file_attributes(e[:ignoreHeaderLines])) do
                xml.files { xml.location(e[:location]) }
                taxon_id, fields = find_taxon_id(e[:fields])
                xml.coreid(:term => taxon_id[0], :index => taxon_id[1])
                fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
              end
            end
          end
        end
        # Block form closes the file even when writing fails.
        File.open(File.join(@path, 'meta.xml'), @write) { |f| f.write(builder.to_xml) }
        nil
      end

      private

      # Attributes shared by the core and extension elements; only the
      # ignoreHeaderLines value differs between files.
      def file_attributes(ignore_header_lines)
        { :encoding => "UTF-8",
          :fieldsTerminatedBy => ",",
          :fieldsEnclosedBy => '"',
          :linesTerminatedBy => "\n",
          :rowType => "http://rs.tdwg.org/dwc/terms/Taxon",
          :ignoreHeaderLines => ignore_header_lines }
      end

      # Splits term urls into the taxon id column and all other columns.
      #
      # data - array of term urls in column order
      #
      # Returns [[taxon_id_url, index], [[url, index], ...]]. The taxon id
      # is the term whose url ends in "/taxonid" (case-insensitive).
      # Raises GeneratorError unless exactly one such term exists.
      def find_taxon_id(data)
        fields = []
        data.each_with_index { |f, i| fields << [f.strip, i] }
        taxon_id, fields = fields.partition { |f| f[0] =~ %r{/taxonid\z}i }
        if taxon_id.size != 1
          raise GeneratorError, "Expected exactly one taxon id term, found #{taxon_id.size}"
        end
        [taxon_id[0], fields]
      end

    end
  end
end
|
51
|
+
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -24,12 +24,12 @@ class DarwinCore
|
|
24
24
|
private
|
25
25
|
def process_csv_row(result, errors, row)
|
26
26
|
str = row.join('')
|
27
|
-
if
|
28
|
-
require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
|
29
|
-
UTF8RGX === str ? result << row : errors << row
|
30
|
-
else
|
27
|
+
if R19
|
31
28
|
str = str.force_encoding('utf-8')
|
32
29
|
str.encoding.name == "UTF-8" && str.valid_encoding? ? result << row : errors << row
|
30
|
+
else
|
31
|
+
require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
|
32
|
+
UTF8RGX === str ? result << row : errors << row
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
data/lib/dwc-archive/metadata.rb
CHANGED
data/lib/ruby_extensions.rb
CHANGED
@@ -9,7 +9,7 @@ class Hash
|
|
9
9
|
result = Nokogiri::XML(xml_io)
|
10
10
|
return { result.root.name.to_sym => xml_node_to_hash(result.root)}
|
11
11
|
rescue Exception => e
|
12
|
-
|
12
|
+
raise e
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
@@ -53,8 +53,8 @@ class Hash
|
|
53
53
|
|
54
54
|
def prepare(data)
|
55
55
|
return data if data.class != String
|
56
|
-
|
57
|
-
|
56
|
+
return true if data.strip == "true"
|
57
|
+
return false if data.strip == "false"
|
58
58
|
data.to_i.to_s == data ? data.to_i : data
|
59
59
|
end
|
60
60
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
8
|
- 3
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dmitry Mozzherin
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-07-12 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -64,8 +64,10 @@ files:
|
|
64
64
|
- README.rdoc
|
65
65
|
- Rakefile
|
66
66
|
- VERSION
|
67
|
-
- features/
|
68
|
-
- features/
|
67
|
+
- features/dwca-creator.feature
|
68
|
+
- features/dwca-reader.feature
|
69
|
+
- features/step_definitions/dwc-creator_steps.rb
|
70
|
+
- features/step_definitions/dwc-reader_steps.rb
|
69
71
|
- features/support/env.rb
|
70
72
|
- lib/dwc-archive.rb
|
71
73
|
- lib/dwc-archive/.expander.rb.swo
|
@@ -74,6 +76,9 @@ files:
|
|
74
76
|
- lib/dwc-archive/errors.rb
|
75
77
|
- lib/dwc-archive/expander.rb
|
76
78
|
- lib/dwc-archive/extension.rb
|
79
|
+
- lib/dwc-archive/generator.rb
|
80
|
+
- lib/dwc-archive/generator_eml_xml.rb
|
81
|
+
- lib/dwc-archive/generator_meta_xml.rb
|
77
82
|
- lib/dwc-archive/ingester.rb
|
78
83
|
- lib/dwc-archive/metadata.rb
|
79
84
|
- lib/dwc-archive/utf_regex_ruby18.rb
|