dwc-archive 0.2.3 → 0.3.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -20,3 +20,4 @@ pkg
20
20
  *.gemspec
21
21
 
22
22
  ## PROJECT::SPECIFIC
23
+ tags
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.3
1
+ 0.3.0
data/features/dwca-creator.feature ADDED
@@ -0,0 +1,40 @@
1
+ Feature: Creating and writing a Darwin Core Archive
2
+ In order to communicate with DwCA compatible programs
3
+ A User should be able to
4
+ Save data from ruby objects into Darwin Core Archive file
5
+
6
+ Scenario: Creating Core File
7
+ Given an array of urls for Darwin Core or other terms
8
+ And arrays of data in the order correpsonding to order of terms
9
+ When User creates generator
10
+ And User sends this data to core generator
11
+ Then these data should be saved as "darwin_core.txt" file
12
+
13
+ Scenario: Creating Extensions
14
+ Given 2 sets of data with terms as urls in the header
15
+ When User creates generator
16
+ And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
17
+ Then data are saved as "vernacular.txt" and "synonyms.txt"
18
+
19
+ Scenario: Creating metadata.xml and eml.xml
20
+ Given an array of urls for Darwin Core or other terms
21
+ And arrays of data in the order correpsonding to order of terms
22
+ And 2 sets of data with terms as urls in the header
23
+ When User creates generator
24
+ And User sends this data to core generator
25
+ And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
26
+ And User generates meta.xml and eml.xml
27
+ Then there should be "meta.xml" file with core and extensions informations
28
+ And there should be "eml.xml" file with authoriship information
29
+
30
+ Scenario: Making DarwinCore Archive file
31
+ Given an array of urls for Darwin Core or other terms
32
+ And arrays of data in the order correpsonding to order of terms
33
+ And 2 sets of data with terms as urls in the header
34
+ When User creates generator
35
+ And User sends this data to core generator
36
+ And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
37
+ And User generates meta.xml and eml.xml
38
+ And generates archive
39
+ Then there should be a valid new archive file
40
+
@@ -1,4 +1,4 @@
1
- Feature: Creation of a Darwing Core Archive
1
+ Feature: Reading of a Darwing Core Archive
2
2
  In order to start working with Darwin Core Archive file
3
3
  A user should be able initiate dwc object from a file
4
4
  So I want to implement handling of dwc object creation
data/features/step_definitions/dwc-creator_steps.rb ADDED
@@ -0,0 +1,105 @@
1
+ #encoding: utf-8
2
+ require 'ruby-debug'
3
+
4
+
5
+ Given /^an array of urls for Darwin Core or other terms$/ do
6
+ @rows = ["http://rs.tdwg.org/dwc/terms/taxonID", "http://rs.tdwg.org/dwc/terms/parentNameUsageID", "http://rs.tdwg.org/dwc/terms/scientificName", "http://rs.tdwg.org/dwc/terms/taxonRank"]
7
+ end
8
+
9
+ Given /^arrays of data in the order correpsonding to order of terms$/ do
10
+ @data = [
11
+ [1, 0, "Plantae", "kingdom"],
12
+ [2, 1, "Betula", "genus"],
13
+ [3, 2, "Betula verucosa", "species"]
14
+ ]
15
+ end
16
+
17
+ When /^User sends this data to core generator$/ do
18
+ @data = @data.unshift @rows
19
+ @gen.add_core(@data, 'darwin_core.txt')
20
+ end
21
+
22
+ Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
23
+ file = File.join(@gen.path, file_name)
24
+ @gen.files.include?(file_name).should be_true
25
+ csv = CSV.open(file).count.should == 4
26
+ end
27
+
28
+ Given /^2 sets of data with terms as urls in the header$/ do
29
+ @vernaculars = [
30
+ ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/vernacularName"],
31
+ [1, "Plants"],
32
+ [1, "Растения"],
33
+ [2, "Birch"],
34
+ [2, "Береза"],
35
+ [3, "Wheeping Birch"],
36
+ [3, "Береза плакучая"]
37
+ ]
38
+ @synonyms = [
39
+ ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/scientificName", "http://rs.tdwg.org/dwc/terms/taxonomicStatus"],
40
+ [1, "Betila Linnaeus, 1753", 'misspelling']
41
+ ]
42
+ end
43
+
44
+ When /^User creates generator$/ do
45
+ @gen = DarwinCore::Generator.new('/tmp/dwc.tar.gz')
46
+ end
47
+
48
+ When /^User adds extensions with file names "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
49
+ @gen.add_extension(@vernaculars, file_name_1)
50
+ @gen.add_extension(@synonyms, file_name_2)
51
+ end
52
+
53
+ Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
54
+ [file_name_1, file_name_2].each do |file_name|
55
+ file = File.join(@gen.path, file_name)
56
+ @gen.files.include?(file_name).should be_true
57
+ csv = CSV.open(file).count.should > 1
58
+ end
59
+ end
60
+
61
+ When /^User generates meta\.xml and eml.xml$/ do
62
+ @gen.add_meta_xml
63
+ @gen.add_eml_xml({
64
+ :id => '1234',
65
+ :title => 'Test Classification',
66
+ :authors => [
67
+ { :first_name => 'John',
68
+ :last_name => 'Doe',
69
+ :email => 'jdoe@example.com' },
70
+ { :first_name => 'Jane',
71
+ :last_name => 'Doe',
72
+ :email => 'jane@example.com' }
73
+ ],
74
+ :abstract => 'test classification',
75
+ :citation => 'Test classification: Doe John, Doe Jane, Taxnonmy, 10, 1, 2010',
76
+ :url => 'http://example.com'
77
+ })
78
+ end
79
+
80
+ Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
81
+ meta = File.join(@gen.path, file_name)
82
+ @gen.files.include?(file_name).should be_true
83
+ dom = Nokogiri::XML(open(File.join(@gen.path, file_name)))
84
+ dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
85
+ dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
86
+ end
87
+
88
+ Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
89
+ eml = File.join(@gen.path, file_name)
90
+ @gen.files.include?(file_name).should be_true
91
+ end
92
+
93
+ Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
94
+ @dwca_file = file_name
95
+ end
96
+
97
+ When /^generates archive$/ do
98
+ @gen.pack
99
+ end
100
+
101
+ Then /^there should be a valid new archive file$/ do
102
+ dwc = DarwinCore.new('/tmp/dwc.tar.gz')
103
+ dwc.archive.valid?.should be_true
104
+ end
105
+
data/lib/dwc-archive.rb CHANGED
@@ -1,13 +1,16 @@
1
1
  # encoding: UTF-8
2
2
  $:.unshift(File.dirname(__FILE__)) unless
3
3
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
4
+ R19 = RUBY_VERSION.split('.')[0..1].join('').to_i > 18
4
5
  require 'ruby_extensions'
5
6
  require 'fileutils'
6
- begin
7
+ require 'ostruct'
8
+
9
+ if R19
10
+ require 'csv'
11
+ else
7
12
  require 'fastercsv'
8
13
  CSV = FasterCSV
9
- rescue LoadError
10
- require 'csv'
11
14
  end
12
15
  require 'dwc-archive/ingester'
13
16
  require 'dwc-archive/errors'
@@ -16,6 +19,9 @@ require 'dwc-archive/archive'
16
19
  require 'dwc-archive/core'
17
20
  require 'dwc-archive/extension'
18
21
  require 'dwc-archive/metadata'
22
+ require 'dwc-archive/generator'
23
+ require 'dwc-archive/generator_meta_xml'
24
+ require 'dwc-archive/generator_eml_xml'
19
25
 
20
26
  class DarwinCore
21
27
  attr_reader :archive, :core, :metadata, :extensions
data/lib/dwc-archive/errors.rb CHANGED
@@ -5,4 +5,5 @@ class DarwinCore
5
5
  class InvalidArchiveError < Error; end
6
6
  class CoreFileError < Error; end
7
7
  class ExtensionFileError < Error; end
8
+ class GeneratorError < Error; end
8
9
  end
data/lib/dwc-archive/generator.rb ADDED
@@ -0,0 +1,73 @@
1
+ class DarwinCore
2
+ class Generator
3
+ attr_reader :eml_xml_data
4
+
5
+ #TODO refactor -- for now copying expander methods
6
+ def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
7
+ @dwc_path = dwc_path
8
+ @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
9
+ FileUtils.mkdir(@path)
10
+ @meta_xml_data = {:extensions => []}
11
+ @eml_xml_data = {:id => nil, :title => nil, :authors => [], :abstract => nil, :citation => nil, :url => nil}
12
+ @write = R19 ? 'w:utf-8' : 'w'
13
+ end
14
+
15
+ #TODO refactor!
16
+ def clean
17
+ FileUtils.rm_rf(@path) if FileTest.exists?(@path)
18
+ end
19
+
20
+ def add_core(data, file_name, keep_headers = true)
21
+ c = CSV.open(File.join(@path,file_name), @write)
22
+ header = data.shift
23
+ fields = header.map do |f|
24
+ f.strip!
25
+ raise GeneratorError("No header in core data, or header fields are not urls") unless f.match(/^http:\/\//)
26
+ f.split("/")[-1]
27
+ end
28
+ data.unshift(fields) if keep_headers
29
+ @meta_xml_data[:core] = {:fields => header, :ignoreHeaderLines => keep_headers, :location => file_name}
30
+ data.each {|d| c << d}
31
+ c.close
32
+ end
33
+
34
+ def add_extension(data, file_name, keep_headers = true)
35
+ c = CSV.open(File.join(@path,file_name), @write)
36
+ header = data.shift
37
+ fields = header.map do |f|
38
+ f.strip!
39
+ raise GeneratorError("No header in core data, or header fields are not urls") unless f.match(/^http:\/\//)
40
+ f.split("/")[-1]
41
+ end
42
+ data.unshift(fields) if keep_headers
43
+ @meta_xml_data[:extensions] << { :fields => header, :ignoreHeaderLines => keep_headers, :location => file_name }
44
+ data.each { |d| c << d }
45
+ c.close
46
+ end
47
+
48
+ def add_meta_xml
49
+ meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
50
+ meta.create
51
+ end
52
+
53
+ def add_eml_xml(data)
54
+ @eml_xml_data = data
55
+ eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
56
+ eml.create
57
+ end
58
+
59
+ def path
60
+ @path
61
+ end
62
+
63
+ def files
64
+ return nil unless @path && FileTest.exists?(@path)
65
+ Dir.entries(@path).select {|e| e !~ /[\.]{1,2}$/}.sort
66
+ end
67
+
68
+ def pack
69
+ a = "cd #{@path}; tar -zcf #{@dwc_path} *"
70
+ system(a)
71
+ end
72
+ end
73
+ end
data/lib/dwc-archive/generator_eml_xml.rb ADDED
@@ -0,0 +1,50 @@
1
+ class DarwinCore
2
+ class Generator
3
+ class EmlXml
4
+ def initialize(data, path)
5
+ @data = data
6
+ @path = path
7
+ @write = R19 ? 'w:utf-8' : 'w'
8
+ end
9
+ def create
10
+ builder = Nokogiri::XML::Builder.new do |xml|
11
+ xml.eml( :packageId => "eml.1.1",
12
+ :system => "knb",
13
+ 'xmlns:eml' => "eml://ecoinformatics.org/eml-2.1.0",
14
+ 'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
15
+ 'xsi:schemaLocation' => "eml://ecoinformatics.org/eml-2.1.0 eml.xsd" ) do
16
+ xml.dataset(:id => @data[:id]) do
17
+ xml.title(@data[:title])
18
+ contacts = []
19
+ @data[:authors].each_with_index do |a, i|
20
+ creator_id = i + 1
21
+ contacts << creator_id
22
+ xml.creator(:id => creator_id, :scope => 'document') do
23
+ xml.individualName do
24
+ xml.givenName(a[:first_name])
25
+ xml.surName(a[:last_name])
26
+ end
27
+ xml.electronicMailAddress(a[:email])
28
+ end
29
+ end
30
+ xml.abstract(@data[:abstract])
31
+ contacts.each do |contact|
32
+ xml.contact { xml.references(contact) }
33
+ end
34
+ end
35
+ xml.additionalMetadata do
36
+ xml.metadata do
37
+ xml.citation(@data[:citation])
38
+ end
39
+ end
40
+ xml.parent.namespace = xml.parent.namespace_definitions.first
41
+ end
42
+ end
43
+ data = builder.to_xml
44
+ f = open(File.join(@path, 'eml.xml'), @write)
45
+ f.write(data)
46
+ f.close
47
+ end
48
+ end
49
+ end
50
+ end
data/lib/dwc-archive/generator_meta_xml.rb ADDED
@@ -0,0 +1,51 @@
1
+ class DarwinCore
2
+ class Generator
3
+ class MetaXml
4
+ def initialize(data, path)
5
+ @data = data
6
+ @path = path
7
+ @write = R19 ? 'w:utf-8' : 'w'
8
+ end
9
+
10
+ def create
11
+ builder = Nokogiri::XML::Builder.new do |xml|
12
+ opts = { :encoding => "UTF-8", :fieldsTerminatedBy => ",", :fieldsEnclosedBy => '"', :linesTerminatedBy => "\n", :rowType => "http://rs.tdwg.org/dwc/tems/Taxon" }
13
+ xml.starArchive(:xmlns => "http://rs.tdwg.org/dwc/terms/xsd/archive/",
14
+ "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
15
+ "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd",
16
+ :fileRoot => ".") do
17
+ xml.core(opts.merge(:ignoreHeaderLines => @data[:core][:ignoreHeaderLines])) do
18
+ xml.files { xml.location(@data[:core][:location]) }
19
+ taxon_id, fields = find_taxon_id(@data[:core][:fields])
20
+ xml.id_(:term => taxon_id[0], :index => taxon_id[1])
21
+ fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
22
+ end
23
+ @data[:extensions].each do |e|
24
+ xml.extension(opts.merge(:ignoreHeaderLines => e[:ignoreHeaderLines])) do
25
+ xml.files { xml.location(e[:location]) }
26
+ taxon_id, fields = find_taxon_id(e[:fields])
27
+ xml.coreid(:term => taxon_id[0], :index => taxon_id[1])
28
+ fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
29
+ end
30
+ end
31
+ end
32
+ end
33
+ meta_xml_data = builder.to_xml
34
+ meta_file = open(File.join(@path, 'meta.xml'), @write)
35
+ meta_file.write(meta_xml_data)
36
+ meta_file.close
37
+ end
38
+
39
+ private
40
+ def find_taxon_id(data)
41
+ fields = []
42
+ data.each_with_index { |f, i| fields << [f.strip, i] }
43
+ taxon_id, fields = fields.partition { |f| f[0].match(/\/taxonid$/i) }
44
+ raise GeneratorError if taxon_id.size != 1
45
+ [taxon_id[0], fields]
46
+ end
47
+
48
+ end
49
+ end
50
+ end
51
+
@@ -24,12 +24,12 @@ class DarwinCore
24
24
  private
25
25
  def process_csv_row(result, errors, row)
26
26
  str = row.join('')
27
- if defined? FasterCSV
28
- require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
29
- UTF8RGX === str ? result << row : errors << row
30
- else
27
+ if R19
31
28
  str = str.force_encoding('utf-8')
32
29
  str.encoding.name == "UTF-8" && str.valid_encoding? ? result << row : errors << row
30
+ else
31
+ require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
32
+ UTF8RGX === str ? result << row : errors << row
33
33
  end
34
34
  end
35
35
 
data/lib/dwc-archive/metadata.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  class DarwinCore
2
2
  class Metadata
3
- def initialize(archive)
3
+ def initialize(archive = nil)
4
4
  @archive = archive
5
5
  @metadata = @archive.eml
6
6
  end
@@ -9,7 +9,7 @@ class Hash
9
9
  result = Nokogiri::XML(xml_io)
10
10
  return { result.root.name.to_sym => xml_node_to_hash(result.root)}
11
11
  rescue Exception => e
12
- # raise your custom exception here
12
+ raise e
13
13
  end
14
14
  end
15
15
 
@@ -53,8 +53,8 @@ class Hash
53
53
 
54
54
  def prepare(data)
55
55
  return data if data.class != String
56
- data = true if data.strip == "true"
57
- data = false if data.strip == "false"
56
+ return true if data.strip == "true"
57
+ return false if data.strip == "false"
58
58
  data.to_i.to_s == data ? data.to_i : data
59
59
  end
60
60
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc-archive
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
8
  - 3
10
- version: 0.2.3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dmitry Mozzherin
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-05-27 00:00:00 -04:00
18
+ date: 2010-07-12 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -64,8 +64,10 @@ files:
64
64
  - README.rdoc
65
65
  - Rakefile
66
66
  - VERSION
67
- - features/dwc-archive.feature
68
- - features/step_definitions/dwc-archive_steps.rb
67
+ - features/dwca-creator.feature
68
+ - features/dwca-reader.feature
69
+ - features/step_definitions/dwc-creator_steps.rb
70
+ - features/step_definitions/dwc-reader_steps.rb
69
71
  - features/support/env.rb
70
72
  - lib/dwc-archive.rb
71
73
  - lib/dwc-archive/.expander.rb.swo
@@ -74,6 +76,9 @@ files:
74
76
  - lib/dwc-archive/errors.rb
75
77
  - lib/dwc-archive/expander.rb
76
78
  - lib/dwc-archive/extension.rb
79
+ - lib/dwc-archive/generator.rb
80
+ - lib/dwc-archive/generator_eml_xml.rb
81
+ - lib/dwc-archive/generator_meta_xml.rb
77
82
  - lib/dwc-archive/ingester.rb
78
83
  - lib/dwc-archive/metadata.rb
79
84
  - lib/dwc-archive/utf_regex_ruby18.rb