taxonifi 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/.document +5 -0
  2. data/Gemfile +18 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +155 -0
  6. data/Rakefile +53 -0
  7. data/VERSION +1 -0
  8. data/lib/assessor/assessor.rb +31 -0
  9. data/lib/assessor/base.rb +17 -0
  10. data/lib/assessor/row_assessor.rb +131 -0
  11. data/lib/export/export.rb +9 -0
  12. data/lib/export/format/base.rb +43 -0
  13. data/lib/export/format/species_file.rb +341 -0
  14. data/lib/lumper/lumper.rb +334 -0
  15. data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
  16. data/lib/models/author_year.rb +39 -0
  17. data/lib/models/base.rb +73 -0
  18. data/lib/models/collection.rb +92 -0
  19. data/lib/models/generic_object.rb +15 -0
  20. data/lib/models/geog.rb +59 -0
  21. data/lib/models/geog_collection.rb +28 -0
  22. data/lib/models/name.rb +206 -0
  23. data/lib/models/name_collection.rb +149 -0
  24. data/lib/models/person.rb +49 -0
  25. data/lib/models/ref.rb +85 -0
  26. data/lib/models/ref_collection.rb +106 -0
  27. data/lib/models/species_name.rb +85 -0
  28. data/lib/splitter/builder.rb +26 -0
  29. data/lib/splitter/lexer.rb +70 -0
  30. data/lib/splitter/parser.rb +54 -0
  31. data/lib/splitter/splitter.rb +45 -0
  32. data/lib/splitter/tokens.rb +322 -0
  33. data/lib/taxonifi.rb +36 -0
  34. data/test/file_fixtures/Lygaeoidea.csv +801 -0
  35. data/test/helper.rb +38 -0
  36. data/test/test_exporter.rb +32 -0
  37. data/test/test_lumper_geogs.rb +59 -0
  38. data/test/test_lumper_hierarchical_collection.rb +88 -0
  39. data/test/test_lumper_names.rb +119 -0
  40. data/test/test_lumper_parent_child_name_collection.rb +41 -0
  41. data/test/test_lumper_refs.rb +91 -0
  42. data/test/test_parser.rb +34 -0
  43. data/test/test_splitter.rb +27 -0
  44. data/test/test_splitter_tokens.rb +403 -0
  45. data/test/test_taxonifi.rb +11 -0
  46. data/test/test_taxonifi_accessor.rb +61 -0
  47. data/test/test_taxonifi_geog.rb +51 -0
  48. data/test/test_taxonifi_name.rb +186 -0
  49. data/test/test_taxonifi_name_collection.rb +158 -0
  50. data/test/test_taxonifi_ref.rb +90 -0
  51. data/test/test_taxonifi_ref_collection.rb +69 -0
  52. data/test/test_taxonifi_species_name.rb +95 -0
  53. metadata +167 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
# Gemfile for taxonifi.
#
# Gems are fetched over HTTPS rather than plain HTTP so downloads are
# protected in transit.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# gem 'geokit'

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  # gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "> 1.0.0"
  gem "jeweler", "~> 1.8.3"
  gem "debugger"
  # gem "ruby-debug19"
  # gem "simplecov", ">= 0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,30 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ columnize (0.3.6)
5
+ debugger (1.1.1)
6
+ columnize (>= 0.3.1)
7
+ debugger-linecache (~> 1.1)
8
+ debugger-ruby_core_source (~> 1.1)
9
+ debugger-linecache (1.1.1)
10
+ debugger-ruby_core_source (>= 1.1.1)
11
+ debugger-ruby_core_source (1.1.1)
12
+ git (1.2.5)
13
+ jeweler (1.8.3)
14
+ bundler (~> 1.0)
15
+ git (>= 1.2.5)
16
+ rake
17
+ rdoc
18
+ json (1.6.6)
19
+ rake (0.9.2.2)
20
+ rdoc (3.12)
21
+ json (~> 1.4)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ bundler (> 1.0.0)
28
+ debugger
29
+ jeweler (~> 1.8.3)
30
+ rdoc (~> 3.12)
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Illinois Natural History Survey
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,155 @@
1
+ = taxonifi
2
+ There will always be "legacy" taxonomic data that needs shuffling around. The taxonifi gem is a suite of general purpose tools that act as a middle layer for data-conversion purposes (e.g. migrating legacy taxonomic databases). Its first application was to convert DwC-style data downloaded from EoL into a Species File. The code is well documented in unit tests; poke around to see if it might be useful. In particular, if you've considered building a collection of regular expressions particular to biodiversity data, look at the Tokens code and related tests.
3
+
4
+ Overall, the goal is to provide well documented (and unit-tested) code that is broadly useful, and vanilla enough to encourage others to fork and hack on their own.
5
+
6
+ == Source
7
+ Source is available at https://github.com/SpeciesFile/taxonifi. The rdoc API is also viewable at http://taxonifi.speciesfile.org, (though those docs may lag behind commits to github).
8
+
9
+ == What's next?
10
+
11
+ Before you jump on board you should also check out similar code from the Global Names team at https://github.com/GlobalNamesArchitecture. Future integration and merging of shared functionality is planned. Code will be released in an "early-and-often" approach.
12
+
13
+ = Getting started
14
+ taxonifi is coded for Ruby 1.9.3, it has not been tested on earlier versions (though it will certainly not work with 1.8.7).
15
+ Using Ruby Version Manager (RVM, https://rvm.io/ ) is highly recommended. You can test your version of Ruby by running "ruby -v" in your terminal.
16
+
17
+ Taxonifi is presently coded for convenience, not speed (though it's not necessarily slow). It assumes that conversion processes are typically one-offs that can afford to run over a longer period of time (read minutes rather than seconds). Reading, and fully parsing into objects, around 25k rows of nomenclature (class to species, inc. author year, = ~45k names) into memory as Taxonifi objects benchmarks at around 2 minutes. Faster indexing is planned as needed, likely using Redis (see GNA link above).
18
+
19
+
20
+ To install:
21
+
22
+ gem install taxonifi
23
+
24
+ In your script
25
+
26
+ require 'taxonifi'
27
+
28
+
29
+ = Use
30
+ == Quick start
31
+
32
+ Write some code:
33
+
34
+ require 'taxonifi'
35
+
36
+ headers = ["a", "B", "c"]
37
+ csv_string = CSV.generate() do |csv|
38
+ csv << headers
39
+ csv << %w{a b c}
40
+ end
41
+
42
+ csv = CSV.parse(csv_string, {headers: true, header_converters: :downcase})
43
+
44
+ # Taxonifi can create generic hierarchical collections based on column headers
45
+ c = Taxonifi::Lumper.create_hierarchical_collection(csv, %w{a b c}) # => a Taxonifi::Model::Collection
46
+ c.collection.first # => Taxonifi::Model::GenericObject
47
+ c.collection.first.name # => "a"
48
+ c.collection.last.name # => "c"
49
+ c.collection.last.parent.name # => "b"
50
+ c.collection.first.row_number # => 0
51
+ c.collection.first.rank # => "a"
52
+
53
+ # Header order is important:
54
+ c = Taxonifi::Lumper.create_hierarchical_collection(csv, %w{c a b}) # => a Taxonifi::Model::Collection
55
+ c.collection.first.name # => "c"
56
+ c.collection.last.rank # => "c"
57
+ c.collection.last.name # => "b"
58
+ c.collection.last.parent.name # => "a"
59
+
60
+ # Collections of GenericObjects (and some other Taxonifi::Collection based objects like TaxonifiNameCollection) only include
61
+ # unique names, i.e. if a name has a shared parent lineage only the name itself is created, not its parents.
62
+ # For example, for:
63
+ # a b c
64
+ # a d nil
65
+ # b nil d
66
+ # The collection consists of objects with names a,b,c,d,b,d respectively.
67
+ # This makes it very useful for handling not only nomenclatural but other nested data as well.
68
+
69
+ There are collections of specific types (e.g. taxonomic names, geographic names):
70
+
71
+ string = CSV.generate() do |csv|
72
+ csv << %w{family genus species author_year}
73
+ csv << ["Fooidae", "Foo", "bar", "Smith, 1854"]
74
+ csv << ["Fooidae", "Foo", "foo", "(Smith, 1854)"]
75
+ end
76
+
77
+ csv = CSV.parse(string, {headers: true})
78
+
79
+ nc = Taxonifi::Lumper.create_name_collection(csv) # => Taxonifi::Model::NameCollection
80
+
81
+ nc.collection.first # => Taxonifi::Model::Name
82
+ nc.collection.first.name # => "Fooidae"
83
+ nc.collection.first.rank # => "family"
84
+ nc.collection.first.year # => nil
85
+ nc.collection.first.author # => []
86
+ nc.collection.last.rank # => "species"
87
+ nc.collection.last.name # => "foo"
88
+ nc.collection.last.author.first.last_name # => "Smith"
89
+ nc.collection.last.year # => "1854"
90
+
91
+ Parent/child style nomenclature is also parseable.
92
+
93
+ There are *lots* more examples of code use in the test suite.
94
+
95
+ == Export/conversion
96
+
97
+ The following is an example that translates a DwC style input format as exported by EOL into tables importable to SpeciesFile. The input file has id, parent, child, vernacular, synonym columns. Data are exported by default to the user's home folder in a taxonifi directory. The export creates 6 tables that can be imported into Species File directly.
98
+
99
+ require 'taxonifi'
100
+ file = File.expand_path(File.join(File.dirname(__FILE__), 'file_fixtures/Lygaeoidea-csv.tsv'))
101
+
102
+ csv = CSV.read(file, {
103
+ headers: true,
104
+ col_sep: "\t",
105
+ header_converters: :downcase
106
+ } )
107
+
108
+ nc = Taxonifi::Lumper::Lumps::ParentChildNameCollection.name_collection(csv)
109
+ e = Taxonifi::Export::SpeciesFile.new(:nc => nc, :authorized_user_id => 1)
110
+ e.export
111
+
112
+ You should be able to relatively quickly use the export framework to code new output formats.
113
+
114
+ == Reading files
115
+
116
+ taxonifi feeds on Ruby's CSV. Read your files with headers: true and downcased headers, e.g.:
117
+
118
+ csv = CSV.read('input/my_data.tab', {
119
+ headers: true,
120
+ header_converters: :downcase,
121
+ col_sep: "\t" } )
122
+
123
+ == Code organization
124
+
125
+ test # unit tests, quite a few of them
126
+ lib # the main libraries
127
+ lib/assessor # libraries to assess the properties of incoming data
128
+ lib/export # export wrappers
129
+ lib/export/format # one module for each export type
130
+ lumper # code that builds Taxonifi objects
131
+ models # Taxonifi objects
132
+ splitter # a parser/lexer/token suite for breaking down data
133
+
134
+ = Contributing to taxonifi
135
+
136
+ (this is generic)
137
+
138
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
139
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
140
+ * Fork the project.
141
+ * Start a feature/bugfix branch.
142
+ * Commit and push until you are happy with your contribution.
143
+ * Write unit test for your code. Changes are good, just as long as tests run clean.
144
+ * All pull requests should test clean.
145
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
146
+
147
+ = About
148
+
149
+ taxonifi is coded by Matt Yoder in consultation with the Species File Group at University of Illinois.
150
+
151
+ = Copyright
152
+
153
+ Copyright (c) 2012 Illinois Natural History Survey. See LICENSE.txt for
154
+ further details.
155
+
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
# encoding: utf-8

# Rake tasks for taxonifi: gem packaging (via jeweler), the unit-test
# runner (the default task) and rdoc generation.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the missing gems and exit with Bundler's own status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "taxonifi"
  gem.homepage = "http://github.com/mjy/taxonifi"
  gem.license = "MIT"
  gem.summary = %Q{A general purpose framework for scripted handling of taxonomic names}
  gem.description = %Q{Taxonifi contains simple models and utilties of use in for parsing lists of taxonomic name (life) related metadata}
  gem.email = "diapriid@gmail.com"
  gem.authors = ["mjy"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# require 'rcov/rcovtask'
# Rcov::RcovTask.new do |test|
#   test.libs << 'test'
#   test.pattern = 'test/**/test_*.rb'
#   test.verbose = true
#   test.rcov_opts << '--exclude "gems/*"'
# end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up
  # inside the rdoc title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "taxonifi #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,31 @@
1
+ module Taxonifi
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), 'base')) # sibling file 'base' in this directory
4
+ require File.expand_path(File.join(File.dirname(__FILE__), 'row_assessor')) # sibling file 'row_assessor' in this directory
5
+
6
+ class AssessorError < StandardError; end # defined here as Taxonifi::AssessorError; not raised in this file
7
+
8
+ # The assessor assesses!
9
+ #
10
+ # A work in progress. The idea is to provide
11
+ # a mechanism to assess incoming data and determine
12
+ # which outputs (or other operations)
13
+ # it can support. Both constants below are placeholders with empty column/output lists.
14
+ module Assessor
15
+
16
+ INPUTS = { # keyed by input type; each value describes the columns for that input
17
+ name_and_heirarchy: { # NOTE(review): "heirarchy" spelling is part of the key; OUTPUTS uses the same spelling
18
+ complete_match: true,
19
+ require_columns: [ ],
20
+ optional_columns: [ ],
21
+ }
22
+ }
23
+
24
+ # INPUTS key => /lumper/class (list is currently empty)
25
+ OUTPUTS = {
26
+ name_and_heirarchy: [ ]
27
+ }
28
+
29
+ end # end Assessor module
30
+ end # Taxonifi module
31
+
@@ -0,0 +1,17 @@
1
+ module Taxonifi
2
+ module Assessor
3
+
4
+ # Stub base class for assessors (RowAssessor::RowAssessor in row_assessor.rb inherits from it). It only declares readers; nothing here assigns @outputs or @inputs.
5
+ class Base
6
+ attr_reader :outputs, :inputs
7
+
8
+ # def _assess_inputs (placeholder, not implemented)
9
+ # end
10
+
11
+ # def _assess_outputs (placeholder, not implemented)
12
+ # end
13
+ end
14
+
15
+ end # end Assessor module
16
+ end # Taxonifi module
17
+
@@ -0,0 +1,131 @@
1
+ module Taxonifi
2
+ module Assessor
3
+
4
+ # Code to assess the metadata properties of a csv row.
5
+ #
6
+ # !! Note that there are various
7
+ # !! CSV methods for returning row columns
8
+ # !! that have particular attributes
9
+ module RowAssessor
10
+
11
+ class RowAssessorError < StandardError; end
12
+
13
# Records which lumps a single CSV row can supply.
#
# Pass a CSV (require "csv") row as read with the following
# parameters:
#   headers: true
#   header_converters: :symbol
class RowAssessor < Taxonifi::Assessor::Base
  # The lumps present in this row, as returned by
  # Taxonifi::Lumper.available_lumps.
  attr_reader :lumps

  # csv_row - a row whose #entries yields [header, value] pairs.
  #
  # Only headers whose value is non-nil are handed to the Lumper.
  # (This method was previously misspelled "initialze", so it never ran
  # and RowAssessor.new(row) raised ArgumentError.)
  def initialize(csv_row)
    cols = csv_row.entries.reject { |_header, value| value.nil? }.map { |pair| pair[0] }
    @lumps = Taxonifi::Lumper.available_lumps(cols)
  end
end
25
+
26
# Return the first column with data, scoped by lump if provided.
#
# csv_row - responds to #entries (yielding [header, value] pairs) and #[].
# lump    - optional ordered list of column names; each is looked up by
#           its String form. When nil, every column of the row is tried.
#
# Returns a [column, value] pair, or [nil, nil] when no candidate column
# holds a non-nil value (previously the iterated collection itself leaked
# out as the return value in that case).
def self.first_available(csv_row, lump = nil)
  if lump.nil?
    csv_row.entries.each do |column, value|
      return [column, value] unless csv_row[column].nil?
    end
  else
    lump.each do |column|
      value = csv_row[column.to_s]
      return [column, value] unless value.nil?
    end
  end
  [nil, nil]
end
38
+
39
# Return the last column with data, scoped by lump if provided.
#
# csv_row - responds to #entries (yielding [header, value] pairs) and #[].
# lump    - optional ordered list of column names; each is looked up by
#           its String form. When nil, every column of the row is tried.
#
# Returns a [column, value] pair, or [nil, nil] when no candidate column
# holds a non-nil value (previously the reversed collection itself leaked
# out as the return value in that case).
def self.last_available(csv_row, lump = nil)
  if lump.nil?
    csv_row.entries.reverse_each do |column, value|
      return [column, value] unless csv_row[column].nil?
    end
  else
    lump.reverse_each do |column|
      value = csv_row[column.to_s]
      return [column, value] unless value.nil?
    end
  end
  [nil, nil]
end
51
+
52
# Return the rank (symbol) of the taxon name rank.
#
# csv_row - responds to #headers and #[].
#
# Raises RowAssessorError if no name is detected, i.e. the row has
# neither the :species nor the :genera lump and none of the
# Taxonifi::Lumper::LUMPS[:higher] columns holds data.
def self.lump_name_rank(csv_row)
  lumps = Taxonifi::Lumper.available_lumps(csv_row.headers)

  # NOTE(review): the sub-rank columns are read with Symbol keys here while
  # last_available reads columns by String key - confirm which header
  # converter callers use.
  if lumps.include?(:species) # has to be a species name
    return csv_row[:subspecies].nil? ? :species : :subspecies
  end

  if lumps.include?(:genera)
    return csv_row[:subgenus].nil? ? :genus : :subgenus
  end

  found = Taxonifi::Assessor::RowAssessor.last_available(csv_row, Taxonifi::Lumper::LUMPS[:higher])
  rank = Array(found).first

  # Verify the column really holds data, so a "nothing found" result from
  # last_available can never be mistaken for a rank. The bare constant
  # resolves lexically to the error class defined in the enclosing
  # RowAssessor module.
  raise RowAssessorError if rank.nil? || csv_row[rank.to_s].nil?

  rank.to_sym
end
75
+
76
# Return the column representing the parent of the name
# represented in this row.
#
# The row's own rank is the last Taxonifi::RANKS column with data; the
# parent is the last column with data among the ranks strictly before it.
#
# Returns what last_available returns for those ranks, or [nil, nil] when
# the row's rank is the first rank (nothing can precede it) or no rank
# column could be identified.
def self.parent_taxon_column(csv_row)
  ranks = Taxonifi::RANKS
  child_rank, = last_available(csv_row, ranks)
  child_index = ranks.index(child_rank)

  # An inclusive 0..(index - 1) range becomes 0..-1 - i.e. every rank - when
  # the index is 0, which reported the name as its own parent.
  return [nil, nil] if child_index.nil? || child_index.zero?

  last_available(csv_row, ranks[0...child_index])
end
83
+
84
+ # Return the entries of Taxonifi::RANKS that also occur in headers (an Array#& intersection; presumably RANKS is an Array of rank names defined elsewhere in the gem - confirm).
85
+ def self.rank_headers(headers)
86
+ Taxonifi::RANKS & headers
87
+ end
88
+
89
+ # Return the entries of Taxonifi::Lumper::LUMPS[:basic_geog] that also occur in headers (an Array#& intersection; presumably that lump lists the geographic column names - confirm against the Lumper).
90
+ def self.geog_headers(headers)
91
+ Taxonifi::Lumper::LUMPS[:basic_geog] & headers
92
+ end
93
+
94
# Return lumps for which at least one column has data.
#
# row          - responds to #[]; a column has data when its value is
#                neither nil nor empty.
# lumps_to_try - keys of Taxonifi::Lumper::LUMPS to consider; defaults to
#                all of them.
#
# Returns an Array of the qualifying lump keys, in the order tried.
def self.intersecting_lumps_with_data(row, lumps_to_try = nil)
  lumps_to_try ||= Taxonifi::Lumper::LUMPS.keys
  lumps_to_try.select do |lump|
    Taxonifi::Lumper::LUMPS[lump].any? do |column|
      !row[column].nil? && !row[column].empty?
    end
  end
end
110
+
111
# Return lumps that have data for all columns.
#
# row          - responds to #[]; a column has data when its value is
#                neither nil nor empty.
# lumps_to_try - keys of Taxonifi::Lumper::LUMPS to consider; defaults to
#                all of them.
#
# Returns an Array of the qualifying lump keys, in the order tried. A lump
# that lists no columns qualifies trivially.
def self.lumps_with_data(row, lumps_to_try = nil)
  lumps_to_try ||= Taxonifi::Lumper::LUMPS.keys
  lumps_to_try.select do |lump|
    Taxonifi::Lumper::LUMPS[lump].all? do |column|
      !row[column].nil? && !row[column].empty?
    end
  end
end
127
+
128
+ end
129
+ end
130
+ end
131
+
@@ -0,0 +1,9 @@
1
module Taxonifi
  # Export related functionality.
  module Export
    # Raised when an export cannot be carried out.
    class ExportError < StandardError; end

    # Load every export format found in the format/ directory next to this
    # file. The list is sorted so the load order is the same on every
    # platform: before Ruby 3.0 Dir.glob returns entries in filesystem
    # order, and a format that inherits from the Base class in
    # format/base.rb needs that file loaded first.
    Dir.glob(File.expand_path(File.join(File.dirname(__FILE__), "format/*.rb"))).sort.each do |file|
      require file
    end
  end
end
@@ -0,0 +1,43 @@
1
require 'fileutils' # FileUtils.mkdir_p is used by configure_folders

module Taxonifi
  module Export

    # All export classes inherit from Taxonifi::Export::Base
    class Base
      # Default root for exports: <home directory>/taxonifi/export
      EXPORT_BASE = File.expand_path(File.join(Dir.home(), 'taxonifi', 'export'))
      attr_accessor :base_export_path, :export_folder

      # options - Hash, recognised keys:
      #   :base_export_path - root directory for exports (default EXPORT_BASE)
      #   :export_folder    - folder below the root to write to (default '.')
      def initialize(options = {})
        opts = {
          :base_export_path => EXPORT_BASE,
          :export_folder => '.'
        }.merge(options)

        @base_export_path = opts[:base_export_path]
        @export_folder = opts[:export_folder]
      end

      # Return the path to which exported files will be written.
      def export_path
        File.expand_path(File.join(@base_export_path, @export_folder))
      end

      # Subclassed models expand on this method, typically writing files
      # to the folders created here.
      def export
        configure_folders
      end

      # Create the export path, including any missing parent directories.
      def configure_folders
        FileUtils.mkdir_p export_path
      end

      # Write the string (followed by a newline, via puts) to a file in the
      # export path, replacing any existing content.
      #
      # filename - name of the file, relative to export_path
      # string   - content to write
      #
      # Raises ExportError when string is nil or empty.
      def write_file(filename = 'foo', string = nil)
        # Double quotes so the file name is actually interpolated.
        raise ExportError, "Nothing to export for #{filename}." if string.nil? || string == ""

        # The block form closes the file even if the write raises.
        File.open(File.expand_path(File.join(export_path, filename)), 'w+') do |f|
          f.puts string
        end
      end

    end
  end
end