dwc-archive 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -20,3 +20,4 @@ pkg
20
20
  *.gemspec
21
21
 
22
22
  ## PROJECT::SPECIFIC
23
+ tags
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.3
1
+ 0.3.0
@@ -0,0 +1,40 @@
1
+ Feature: Creating and writing a Darwin Core Archive
2
+ In order to communicate with DwCA compatible programs
3
+ A User should be able to
4
+ Save data from ruby objects into Darwin Core Archive file
5
+
6
+ Scenario: Creating Core File
7
+ Given an array of urls for Darwin Core or other terms
8
+ And arrays of data in the order correpsonding to order of terms
9
+ When User creates generator
10
+ And User sends this data to core generator
11
+ Then these data should be saved as "darwin_core.txt" file
12
+
13
+ Scenario: Creating Extensions
14
+ Given 2 sets of data with terms as urls in the header
15
+ When User creates generator
16
+ And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
17
+ Then data are saved as "vernacular.txt" and "synonyms.txt"
18
+
19
+ Scenario: Creating metadata.xml and eml.xml
20
+ Given an array of urls for Darwin Core or other terms
21
+ And arrays of data in the order correpsonding to order of terms
22
+ And 2 sets of data with terms as urls in the header
23
+ When User creates generator
24
+ And User sends this data to core generator
25
+ And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
26
+ And User generates meta.xml and eml.xml
27
+ Then there should be "meta.xml" file with core and extensions informations
28
+ And there should be "eml.xml" file with authoriship information
29
+
30
+ Scenario: Making DarwinCore Archive file
31
+ Given an array of urls for Darwin Core or other terms
32
+ And arrays of data in the order correpsonding to order of terms
33
+ And 2 sets of data with terms as urls in the header
34
+ When User creates generator
35
+ And User sends this data to core generator
36
+ And User adds extensions with file names "vernacular.txt" and "synonyms.txt"
37
+ And User generates meta.xml and eml.xml
38
+ And generates archive
39
+ Then there should be a valid new archive file
40
+
@@ -1,4 +1,4 @@
1
- Feature: Creation of a Darwing Core Archive
1
+ Feature: Reading of a Darwing Core Archive
2
2
  In order to start working with Darwin Core Archive file
3
3
  A user should be able initiate dwc object from a file
4
4
  So I want to implement handling of dwc object creation
@@ -0,0 +1,105 @@
1
+ #encoding: utf-8
2
+ require 'ruby-debug'
3
+
4
+
5
# Step definitions for the "Creating and writing a Darwin Core Archive"
# feature. State is shared between steps through instance variables:
#   @rows        - header row of term urls for the core file
#   @data        - core data rows (the header is prepended before generation)
#   @vernaculars - extension data, header row first
#   @synonyms    - extension data, header row first
#   @gen         - the DarwinCore::Generator under test

Given /^an array of urls for Darwin Core or other terms$/ do
  @rows = [
    "http://rs.tdwg.org/dwc/terms/taxonID",
    "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
    "http://rs.tdwg.org/dwc/terms/scientificName",
    "http://rs.tdwg.org/dwc/terms/taxonRank"
  ]
end

Given /^arrays of data in the order correpsonding to order of terms$/ do
  @data = [
    [1, 0, "Plantae", "kingdom"],
    [2, 1, "Betula", "genus"],
    [3, 2, "Betula verucosa", "species"]
  ]
end

When /^User sends this data to core generator$/ do
  # The generator expects the header (term urls) as the first row.
  @data = @data.unshift @rows
  @gen.add_core(@data, 'darwin_core.txt')
end

Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
  @gen.files.include?(file_name).should be_true
  # CSV.read closes the file itself; header line + 3 data rows are expected.
  CSV.read(File.join(@gen.path, file_name)).size.should == 4
end

Given /^2 sets of data with terms as urls in the header$/ do
  @vernaculars = [
    ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/vernacularName"],
    [1, "Plants"],
    [1, "Растения"],
    [2, "Birch"],
    [2, "Береза"],
    [3, "Wheeping Birch"],
    [3, "Береза плакучая"]
  ]
  @synonyms = [
    ["http://rs.tdwg.org/dwc/terms/TaxonID", "http://rs.tdwg.org/dwc/terms/scientificName", "http://rs.tdwg.org/dwc/terms/taxonomicStatus"],
    [1, "Betila Linnaeus, 1753", 'misspelling']
  ]
end

When /^User creates generator$/ do
  @gen = DarwinCore::Generator.new('/tmp/dwc.tar.gz')
end

When /^User adds extensions with file names "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
  @gen.add_extension(@vernaculars, file_name_1)
  @gen.add_extension(@synonyms, file_name_2)
end

Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
  [file_name_1, file_name_2].each do |file_name|
    @gen.files.include?(file_name).should be_true
    # More than one line means at least one data row besides the header.
    CSV.read(File.join(@gen.path, file_name)).size.should > 1
  end
end

# Both dots are escaped so that only the literal file names match.
When /^User generates meta\.xml and eml\.xml$/ do
  @gen.add_meta_xml
  @gen.add_eml_xml({
    :id => '1234',
    :title => 'Test Classification',
    :authors => [
      { :first_name => 'John',
        :last_name => 'Doe',
        :email => 'jdoe@example.com' },
      { :first_name => 'Jane',
        :last_name => 'Doe',
        :email => 'jane@example.com' }
    ],
    :abstract => 'test classification',
    :citation => 'Test classification: Doe John, Doe Jane, Taxnonmy, 10, 1, 2010',
    :url => 'http://example.com'
  })
end

Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
  @gen.files.include?(file_name).should be_true
  # Block form closes the file as soon as the document is parsed.
  dom = File.open(File.join(@gen.path, file_name)) { |f| Nokogiri::XML(f) }
  dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
  dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
end

# NOTE(review): only the presence of the file is verified; its authorship
# content is not inspected yet.
Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
  @gen.files.include?(file_name).should be_true
end

# NOTE(review): @dwca_file is not read by any other step in this file.
Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
  @dwca_file = file_name
end

When /^generates archive$/ do
  @gen.pack
end

Then /^there should be a valid new archive file$/ do
  dwc = DarwinCore.new('/tmp/dwc.tar.gz')
  dwc.archive.valid?.should be_true
end
+
data/lib/dwc-archive.rb CHANGED
@@ -1,13 +1,16 @@
1
1
  # encoding: UTF-8
2
2
  $:.unshift(File.dirname(__FILE__)) unless
3
3
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
4
+ R19 = RUBY_VERSION.split('.')[0..1].join('').to_i > 18
4
5
  require 'ruby_extensions'
5
6
  require 'fileutils'
6
- begin
7
+ require 'ostruct'
8
+
9
+ if R19
10
+ require 'csv'
11
+ else
7
12
  require 'fastercsv'
8
13
  CSV = FasterCSV
9
- rescue LoadError
10
- require 'csv'
11
14
  end
12
15
  require 'dwc-archive/ingester'
13
16
  require 'dwc-archive/errors'
@@ -16,6 +19,9 @@ require 'dwc-archive/archive'
16
19
  require 'dwc-archive/core'
17
20
  require 'dwc-archive/extension'
18
21
  require 'dwc-archive/metadata'
22
+ require 'dwc-archive/generator'
23
+ require 'dwc-archive/generator_meta_xml'
24
+ require 'dwc-archive/generator_eml_xml'
19
25
 
20
26
  class DarwinCore
21
27
  attr_reader :archive, :core, :metadata, :extensions
@@ -5,4 +5,5 @@ class DarwinCore
5
5
  class InvalidArchiveError < Error; end
6
6
  class CoreFileError < Error; end
7
7
  class ExtensionFileError < Error; end
8
+ class GeneratorError < Error; end
8
9
  end
@@ -0,0 +1,73 @@
1
class DarwinCore
  # Assembles a Darwin Core Archive: data files, meta.xml and eml.xml are
  # written into a private working directory, which #pack then compresses
  # into the tar.gz file given to the constructor.
  class Generator
    attr_reader :eml_xml_data, :path

    # Header cells must be term urls; anchored to the start of the string.
    TERM_URL = %r{\Ahttp://}

    #TODO refactor -- for now copying expander methods
    #
    # dwc_path - path of the archive file that #pack creates
    # tmp_dir  - directory in which the working directory is created
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
      FileUtils.mkdir(@path)
      @meta_xml_data = { :extensions => [] }
      @eml_xml_data = { :id => nil, :title => nil, :authors => [], :abstract => nil, :citation => nil, :url => nil }
      @write = R19 ? 'w:utf-8' : 'w'
    end

    #TODO refactor!
    # Removes the working directory with everything generated in it.
    def clean
      # File.exist? replaces FileTest.exists?, which newer rubies removed.
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Writes the core data file and remembers its description for meta.xml.
    #
    # data         - array of rows; the first row holds the term urls
    # file_name    - name of the file created in the working directory
    # keep_headers - when true, a header line of short term names is written
    #
    # Raises GeneratorError when the header is missing or a cell is not a url.
    def add_core(data, file_name, keep_headers = true)
      @meta_xml_data[:core] = write_data_file(data, file_name, keep_headers, 'core')
      nil
    end

    # Writes an extension data file and remembers its description for
    # meta.xml. Arguments and errors are the same as for #add_core.
    def add_extension(data, file_name, keep_headers = true)
      @meta_xml_data[:extensions] << write_data_file(data, file_name, keep_headers, 'extension')
      nil
    end

    # Creates meta.xml from the files registered so far.
    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    # Creates eml.xml from the given hash (see the keys of the default
    # @eml_xml_data set in the constructor).
    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    # Returns the sorted names of the entries in the working directory, or
    # nil when the directory does not exist (for example after #clean).
    def files
      return nil unless @path && File.exist?(@path)
      # Only the two directory pseudo-entries are skipped; a name that merely
      # ends with a dot is a real file and stays in the list.
      Dir.entries(@path).reject { |e| e == '.' || e == '..' }.sort
    end

    # Compresses the non-hidden entries of the working directory into the
    # archive file. Returns what Kernel#system returns.
    def pack
      # No shell is involved, so spaces or metacharacters in paths are safe.
      # A relative archive path is resolved against the working directory,
      # as the former `cd <dir>; tar ...` command line did.
      archive = File.expand_path(@dwc_path, @path)
      names = Dir.entries(@path).reject { |e| e =~ /\A\./ }.sort
      system('tar', '-C', @path, '-zcf', archive, '--', *names)
    end

    private

    # Validates the header, writes the rows as CSV and returns the hash that
    # describes the file for meta.xml. The caller's arrays and strings are
    # left untouched.
    def write_data_file(data, file_name, keep_headers, kind)
      header, *rows = data
      terms = checked_terms(header, kind)
      rows.unshift(terms.map { |t| t.split("/")[-1] }) if keep_headers
      # The file is opened only after validation, so bad input leaves no
      # empty file behind; the block form closes it even if a write fails.
      CSV.open(File.join(@path, file_name), @write) do |csv|
        rows.each { |row| csv << row }
      end
      { :fields => terms, :ignoreHeaderLines => keep_headers, :location => file_name }
    end

    # Returns stripped copies of the header cells, or raises GeneratorError
    # when there is no header or a cell does not start with http://.
    def checked_terms(header, kind)
      terms = header.respond_to?(:map) ? header.map { |f| f.to_s.strip } : []
      if terms.empty? || !terms.all? { |t| t =~ TERM_URL }
        raise GeneratorError, "No header in #{kind} data, or header fields are not urls"
      end
      terms
    end
  end
end
@@ -0,0 +1,50 @@
1
class DarwinCore
  class Generator
    # Writes eml.xml, the dataset description of the archive, into the
    # generator's working directory.
    class EmlXml
      # data - hash; the keys read are :id, :title, :authors (array of
      #        hashes with :first_name, :last_name, :email), :abstract and
      #        :citation
      # path - directory in which eml.xml is created
      #
      # NOTE(review): callers also pass :url, but nothing here writes it.
      def initialize(data, path)
        @data = data
        @path = path
        @write = R19 ? 'w:utf-8' : 'w'
      end

      # Builds the document and saves it as eml.xml. Returns nil.
      def create
        # The xml is built first so that a builder error cannot leave a
        # truncated file; the block form closes the file even on failure.
        xml = build_xml
        File.open(File.join(@path, 'eml.xml'), @write) { |f| f.write(xml) }
        nil
      end

      private

      # Returns the eml document as a string.
      def build_xml
        authors = @data[:authors] || []
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml( :packageId => "eml.1.1",
            :system => "knb",
            'xmlns:eml' => "eml://ecoinformatics.org/eml-2.1.0",
            'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:schemaLocation' => "eml://ecoinformatics.org/eml-2.1.0 eml.xsd" ) do
            xml.dataset(:id => @data[:id]) do
              xml.title(@data[:title])
              # Creators are numbered from 1 in the order given.
              authors.each_with_index do |a, i|
                xml.creator(:id => i + 1, :scope => 'document') do
                  xml.individualName do
                    xml.givenName(a[:first_name])
                    xml.surName(a[:last_name])
                  end
                  xml.electronicMailAddress(a[:email])
                end
              end
              xml.abstract(@data[:abstract])
              # Each creator is also listed as a contact, by reference to
              # the creator id assigned above.
              authors.each_index do |i|
                xml.contact { xml.references(i + 1) }
              end
            end
            xml.additionalMetadata do
              xml.metadata do
                xml.citation(@data[:citation])
              end
            end
            # Assign the first namespace declared on the root element to the
            # root element itself.
            xml.parent.namespace = xml.parent.namespace_definitions.first
          end
        end
        builder.to_xml
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
class DarwinCore
  class Generator
    # Writes meta.xml, the descriptor that lists the core file, the
    # extension files and the term/column mapping of each of them.
    class MetaXml
      # Row type written for every described file.
      # NOTE(review): extensions receive this Taxon row type as well;
      # confirm whether they need their own row types.
      TAXON_ROW_TYPE = "http://rs.tdwg.org/dwc/terms/Taxon"

      # data - hash with :core (a file description) and :extensions (array
      #        of file descriptions); a file description has the keys
      #        :fields, :ignoreHeaderLines and :location
      # path - directory in which meta.xml is created
      def initialize(data, path)
        @data = data
        @path = path
        @write = R19 ? 'w:utf-8' : 'w'
      end

      # Builds the document and saves it as meta.xml. Returns nil.
      #
      # Raises GeneratorError when no core file was registered or when a
      # file does not have exactly one taxon id column.
      def create
        # Built before the file is opened, so a failure leaves no empty
        # meta.xml; the block form closes the file even if writing fails.
        xml = build_xml
        File.open(File.join(@path, 'meta.xml'), @write) { |f| f.write(xml) }
        nil
      end

      private

      # Returns the meta.xml document as a string.
      def build_xml
        core = @data[:core]
        raise GeneratorError, "Core data have to be added before meta.xml is generated" unless core
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.starArchive(:xmlns => "http://rs.tdwg.org/dwc/terms/xsd/archive/",
            "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
            "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd",
            :fileRoot => ".") do
            xml.core(file_attributes(core)) do
              # id_ produces an <id> element.
              add_file_contents(xml, core) { |id_attributes| xml.id_(id_attributes) }
            end
            (@data[:extensions] || []).each do |extension|
              xml.extension(file_attributes(extension)) do
                add_file_contents(xml, extension) { |id_attributes| xml.coreid(id_attributes) }
              end
            end
          end
        end
        builder.to_xml
      end

      # Attributes shared by the core element and the extension elements.
      def file_attributes(file)
        { :encoding => "UTF-8",
          :fieldsTerminatedBy => ",",
          :fieldsEnclosedBy => '"',
          :linesTerminatedBy => "\n",
          :rowType => TAXON_ROW_TYPE,
          :ignoreHeaderLines => file[:ignoreHeaderLines] }
      end

      # Emits the location, the id element and the field elements of a file.
      # The id element differs between core and extensions, so the caller
      # creates it in the block, which receives the :term/:index attributes.
      def add_file_contents(xml, file)
        xml.files { xml.location(file[:location]) }
        taxon_id, fields = find_taxon_id(file[:fields])
        yield(:term => taxon_id[0], :index => taxon_id[1])
        fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
      end

      # Pairs every term with its column index and separates the taxon id
      # column (term ending in "/taxonid", any letter case) from the rest.
      #
      # Returns [[taxon_id_term, index], [[term, index], ...]].
      # Raises GeneratorError unless exactly one taxon id column exists.
      def find_taxon_id(data)
        indexed = []
        data.each_with_index { |f, i| indexed << [f.strip, i] }
        taxon_id, fields = indexed.partition { |f| f[0] =~ %r{/taxonid\z}i }
        if taxon_id.size != 1
          raise GeneratorError, "Exactly one taxon id field is required, found #{taxon_id.size}"
        end
        [taxon_id[0], fields]
      end
    end
  end
end
51
+
@@ -24,12 +24,12 @@ class DarwinCore
24
24
  private
25
25
  def process_csv_row(result, errors, row)
26
26
  str = row.join('')
27
- if defined? FasterCSV
28
- require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
29
- UTF8RGX === str ? result << row : errors << row
30
- else
27
+ if R19
31
28
  str = str.force_encoding('utf-8')
32
29
  str.encoding.name == "UTF-8" && str.valid_encoding? ? result << row : errors << row
30
+ else
31
+ require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
32
+ UTF8RGX === str ? result << row : errors << row
33
33
  end
34
34
  end
35
35
 
@@ -1,6 +1,6 @@
1
1
  class DarwinCore
2
2
  class Metadata
3
- def initialize(archive)
3
+ def initialize(archive = nil)
4
4
  @archive = archive
5
5
  @metadata = @archive.eml
6
6
  end
@@ -9,7 +9,7 @@ class Hash
9
9
  result = Nokogiri::XML(xml_io)
10
10
  return { result.root.name.to_sym => xml_node_to_hash(result.root)}
11
11
  rescue Exception => e
12
- # raise your custom exception here
12
+ raise e
13
13
  end
14
14
  end
15
15
 
@@ -53,8 +53,8 @@ class Hash
53
53
 
54
54
  def prepare(data)
55
55
  return data if data.class != String
56
- data = true if data.strip == "true"
57
- data = false if data.strip == "false"
56
+ return true if data.strip == "true"
57
+ return false if data.strip == "false"
58
58
  data.to_i.to_s == data ? data.to_i : data
59
59
  end
60
60
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc-archive
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
8
  - 3
10
- version: 0.2.3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dmitry Mozzherin
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-05-27 00:00:00 -04:00
18
+ date: 2010-07-12 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -64,8 +64,10 @@ files:
64
64
  - README.rdoc
65
65
  - Rakefile
66
66
  - VERSION
67
- - features/dwc-archive.feature
68
- - features/step_definitions/dwc-archive_steps.rb
67
+ - features/dwca-creator.feature
68
+ - features/dwca-reader.feature
69
+ - features/step_definitions/dwc-creator_steps.rb
70
+ - features/step_definitions/dwc-reader_steps.rb
69
71
  - features/support/env.rb
70
72
  - lib/dwc-archive.rb
71
73
  - lib/dwc-archive/.expander.rb.swo
@@ -74,6 +76,9 @@ files:
74
76
  - lib/dwc-archive/errors.rb
75
77
  - lib/dwc-archive/expander.rb
76
78
  - lib/dwc-archive/extension.rb
79
+ - lib/dwc-archive/generator.rb
80
+ - lib/dwc-archive/generator_eml_xml.rb
81
+ - lib/dwc-archive/generator_meta_xml.rb
77
82
  - lib/dwc-archive/ingester.rb
78
83
  - lib/dwc-archive/metadata.rb
79
84
  - lib/dwc-archive/utf_regex_ruby18.rb