taxonifi 0.2.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +59 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +5 -17
  5. data/Gemfile.lock +22 -40
  6. data/README.md +192 -0
  7. data/Rakefile +35 -26
  8. data/lib/export/format/base.rb +1 -1
  9. data/lib/export/format/species_file.rb +154 -152
  10. data/lib/lumper/clump.rb +1 -1
  11. data/lib/lumper/lumper.rb +22 -18
  12. data/lib/lumper/lumps/parent_child_name_collection.rb +1 -2
  13. data/lib/lumper/name_index.rb +21 -0
  14. data/lib/{models → model}/author_year.rb +2 -2
  15. data/lib/{models → model}/base.rb +35 -5
  16. data/lib/{models → model}/collection.rb +8 -1
  17. data/lib/{models → model}/name.rb +128 -36
  18. data/lib/{models → model}/name_collection.rb +134 -33
  19. data/lib/{models → model}/person.rb +1 -1
  20. data/lib/{models → model}/ref.rb +4 -2
  21. data/lib/model/ref_collection.rb +171 -0
  22. data/lib/{models → model}/species_name.rb +24 -3
  23. data/lib/splitter/builder.rb +1 -1
  24. data/lib/splitter/parser.rb +5 -0
  25. data/lib/splitter/tokens.rb +54 -9
  26. data/lib/taxonifi/version.rb +3 -0
  27. data/lib/taxonifi.rb +5 -9
  28. data/taxonifi.gemspec +29 -99
  29. data/test/helper.rb +1 -1
  30. data/test/test_exporter.rb +1 -1
  31. data/test/test_lumper_names.rb +9 -9
  32. data/test/test_lumper_refs.rb +4 -4
  33. data/test/test_parser.rb +97 -26
  34. data/test/test_splitter_tokens.rb +25 -4
  35. data/test/test_taxonifi_base.rb +1 -1
  36. data/test/test_taxonifi_geog.rb +1 -1
  37. data/test/test_taxonifi_name.rb +13 -14
  38. data/test/test_taxonifi_name_collection.rb +11 -5
  39. data/test/test_taxonifi_ref.rb +1 -1
  40. data/test/test_taxonifi_ref_collection.rb +40 -3
  41. data/test/test_taxonifi_species_name.rb +51 -1
  42. data/travis/before_install.sh +2 -0
  43. metadata +96 -66
  44. data/README.rdoc +0 -154
  45. data/VERSION +0 -1
  46. data/lib/models/ref_collection.rb +0 -107
  47. /data/lib/{models → model}/generic_object.rb +0 -0
  48. /data/lib/{models → model}/geog.rb +0 -0
  49. /data/lib/{models → model}/geog_collection.rb +0 -0
  50. /data/lib/{models → model}/shared_class_methods.rb +0 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7df9c89aa9370f5b09c35f4164eb9238be2b8a0d
4
+ data.tar.gz: 78c4de112c6e40cdf1e2197402f8f47e9dea6fa0
5
+ SHA512:
6
+ metadata.gz: 6de2acff969064aad7a34e73759298630e3640430dff1c8ad3e189c373e0c5fe22c84cb8d56b71910d4711cf5446f344048a8bc4cbcb5c8ea8faec78a1cb8ac4
7
+ data.tar.gz: ff6a30b825545b759819540d468d37c68ffe700a531023875ed3e5a917dff6cd14859838da84081c87aefa36962d1398ffb6472c9662a47803d03a78d2144152
data/.gitignore ADDED
@@ -0,0 +1,59 @@
1
+
2
+ # ignore in development script work
3
+ sf
4
+
5
+ vendor
6
+
7
+ *.xlsx
8
+
9
+ # rcov generated
10
+ coverage
11
+ coverage.data
12
+
13
+ # rdoc generated
14
+ rdoc
15
+
16
+ # yard generated
17
+ doc
18
+ .yardoc
19
+
20
+ # bundler
21
+ .bundle
22
+
23
+ # jeweler generated
24
+ pkg
25
+
26
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
27
+ #
28
+ # * Create a file at ~/.gitignore
29
+ # * Include files you want ignored
30
+ # * Run: git config --global core.excludesfile ~/.gitignore
31
+ #
32
+ # After doing this, these files will be ignored in all your git projects,
33
+ # saving you from having to 'pollute' every project you touch with them
34
+ #
35
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember, remove the leading # of the line)
36
+ #
37
+ # For MacOS:
38
+ #
39
+ .DS_Store
40
+
41
+ .swo
42
+
43
+ # For TextMate
44
+ #*.tmproj
45
+ #tmtags
46
+
47
+ # For emacs:
48
+ *~
49
+ #\#*
50
+ #.\#*
51
+
52
+ # For vim:
53
+ *.swp
54
+
55
+ # For redcar:
56
+ #.redcar
57
+
58
+ # For rubinius:
59
+ #*.rbc
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.5
4
+ bundler_args: --without development
5
+ before_install: ./travis/before_install.sh
6
+ branches:
7
+ only:
8
+ - master
9
+ email:
10
+ - diapriid@gmail.com
11
+ on_failure: change
data/Gemfile CHANGED
@@ -1,19 +1,7 @@
1
- source "http://rubygems.org"
2
- # Add dependencies required to use your gem here.
3
- # Example:
4
- # gem "activesupport", ">= 2.3.5"
5
1
 
6
- # gem 'geokit'
2
+ source 'https://rubygems.org'
3
+
4
+ # Specify your gem's dependencies in gemspec
5
+ gemspec
6
+
7
7
 
8
- # Add dependencies to develop your gem here.
9
- # Include everything needed to run rake, tests, features, etc.
10
- group :development do
11
- # gem "shoulda", ">= 0"
12
- gem "rdoc", "~> 3.12"
13
- gem "bundler", "> 1.0.0"
14
- gem "jeweler", "~> 1.8.3"
15
- gem "activerecord", "3.2.8"
16
- gem "debugger"
17
- # gem "ruby-debug19"
18
- # gem "simplecov", ">= 0"
19
- end
data/Gemfile.lock CHANGED
@@ -1,47 +1,29 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ taxonifi (0.3.2)
5
+ rake (~> 10.4)
6
+
1
7
  GEM
2
- remote: http://rubygems.org/
8
+ remote: https://rubygems.org/
3
9
  specs:
4
- activemodel (3.2.8)
5
- activesupport (= 3.2.8)
6
- builder (~> 3.0.0)
7
- activerecord (3.2.8)
8
- activemodel (= 3.2.8)
9
- activesupport (= 3.2.8)
10
- arel (~> 3.0.2)
11
- tzinfo (~> 0.3.29)
12
- activesupport (3.2.8)
13
- i18n (~> 0.6)
14
- multi_json (~> 1.0)
15
- arel (3.0.2)
16
- builder (3.0.0)
17
- columnize (0.3.6)
18
- debugger (1.2.0)
19
- columnize (>= 0.3.1)
20
- debugger-linecache (~> 1.1.1)
21
- debugger-ruby_core_source (~> 1.1.3)
22
- debugger-linecache (1.1.2)
23
- debugger-ruby_core_source (>= 1.1.1)
24
- debugger-ruby_core_source (1.1.3)
25
- git (1.2.5)
26
- i18n (0.6.1)
27
- jeweler (1.8.4)
28
- bundler (~> 1.0)
29
- git (>= 1.2.5)
30
- rake
31
- rdoc
32
- json (1.7.5)
33
- multi_json (1.3.6)
34
- rake (0.9.2.2)
35
- rdoc (3.12)
36
- json (~> 1.4)
37
- tzinfo (0.3.33)
10
+ awesome_print (1.6.1)
11
+ builder (3.2.2)
12
+ byebug (4.0.5)
13
+ columnize (= 0.9.0)
14
+ columnize (0.9.0)
15
+ did_you_mean (0.9.8)
16
+ interception
17
+ interception (0.5)
18
+ rake (10.4.2)
38
19
 
39
20
  PLATFORMS
40
21
  ruby
41
22
 
42
23
  DEPENDENCIES
43
- activerecord (= 3.2.8)
44
- bundler (> 1.0.0)
45
- debugger
46
- jeweler (~> 1.8.3)
47
- rdoc (~> 3.12)
24
+ awesome_print (~> 1.6)
25
+ builder (~> 3.2)
26
+ bundler (~> 1.9)
27
+ byebug (~> 4.0)
28
+ did_you_mean (~> 0.9)
29
+ taxonifi!
data/README.md ADDED
@@ -0,0 +1,192 @@
1
+
2
+ [![Continuous Integration Status][1]][2]
3
+ [![Dependency Status][7]][8]
4
+
5
+
6
+ taxonifi
7
+ ========
8
+ There will always be "legacy" taxonomic data that needs shuffling around. The taxonifi gem is a suite of general purpose tools that act as a middle layer for data-conversion purposes (e.g. migrating legacy taxonomic databases). Its first application was to convert DwC-style data downloaded from EoL into a Species File. The code is well documented in unit tests; poke around to see if it might be useful. In particular, if you've considered building a collection of regular expressions particular to biodiversity data, look at the Tokens code and related tests.
9
+
10
+ Overall, the goal is to provide well documented (and unit-tested) code that is broadly useful, and vanilla enough to encourage others to fork and hack on their own.
11
+
12
+ Source
13
+ ------
14
+ Source is available at https://github.com/SpeciesFile/taxonifi . The rdoc API is also viewable at http://taxonifi.speciesfile.org , (though those docs may lag behind commits to github).
15
+
16
+ What's next?
17
+ ------------
18
+
19
+ Before you jump on board you should also check out similar code from the Global Names team at https://github.com/GlobalNamesArchitecture. Future integration and merging of shared functionality is planned. Code will be released in an "early-and-often" approach.
20
+
21
+ Taxonifi is presently coded for convenience, not speed (though it's not necessarily slow). It assumes that conversion processes are typically one-offs that can afford to run over a longer period of time (read minutes rather than seconds). Reading, and fully parsing into objects, around 25k rows of nomenclature (class to species, inc. author year, = ~45k names) into memory as Taxonifi objects benchmarks at around 2 minutes. Faster indexing is planned as needed, likely using Redis (see GNA link above).
22
+
23
+ Getting started
24
+ ---------------
25
+ taxonifi is coded for Ruby 1.9.3, it has not been tested on earlier versions (though it will certainly not work with 1.8.7).
26
+ Using Ruby Version Manager (RVM, https://rvm.io/ ) is highly recommended. You can check your version of Ruby by running "ruby -v" in your terminal.
27
+
28
+ To install:
29
+
30
+ ```
31
+ gem install taxonifi
32
+ ```
33
+
34
+ In your script:
35
+
36
+ ```
37
+ require 'taxonifi'
38
+ ```
39
+
40
+ Use
41
+ ===
42
+
43
+ Quick start
44
+ -----------
45
+
46
+ Write some code:
47
+
48
+ ```
49
+ require 'taxonifi'
50
+
51
+ headers = ["a", "B", "c"]
52
+ csv_string = CSV.generate() do |csv|
53
+ csv << @headers
54
+ csv << %w{a b c}
55
+ end
56
+
57
+ csv = CSV.parse(csv_string, {headers: true, header_converters: :downcase})
58
+
59
+ # Taxonifi can create generic hierarchical collections based on column headers
60
+ c = Taxonifi::Lumper.create_hierarchical_collection(csv, %w{a b c}) # => a Taxonifi::Model::Collection
61
+ c.collection.first # => Taxonifi::Model::GenericObject
62
+ c.collection.first.name # => "a"
63
+ c.collection.last.name # => "c"
64
+ c.collection.last.parent.name # => "b"
65
+ c.collection.first.row_number # => 0
66
+ c.collection.first.rank # => "a"
67
+
68
+ # Header order is important:
69
+ c = Taxonifi::Lumper.create_hierarchical_collection(csv, %w{c a b}) # => a Taxonifi::Model::Collection
70
+ c.collection.first.name # => "c"
71
+ c.collection.last.rank # => "c"
72
+ c.collection.last.name # => "b"
73
+ c.collection.last.parent.name # => "a"
74
+
75
+ # Collections of GenericObjects (and some other Taxonifi::Collection based objects like TaxonifiNameCollection) only include
76
+ # unique names, i.e. if a name has a shared parent lineage only the name itself is created, not its parents.
77
+ # For example, for:
78
+ # a b c
79
+ # a d nil
80
+ # b nil d
81
+ # The collection consists of objects with names a,b,c,d,b,d respectively.
82
+ # This makes it very useful for handling not only nomenclatural but other nested data as well.
83
+ ```
84
+
85
+ There are collections of specific types (e.g. taxonomic names, geographic names):
86
+
87
+ ```
88
+ string = CSV.generate() do |csv|
89
+ csv << %w{family genus species author_year}
90
+ csv << ["Fooidae", "Foo", "bar", "Smith, 1854"]
91
+ csv << ["Fooidae", "Foo", "foo", "(Smith, 1854)"]
92
+ end
93
+
94
+ csv = CSV.parse(string, {headers: true})
95
+
96
+ nc = Taxonifi::Lumper.create_name_collection(:csv => csv) # => Taxonifi::Model::NameCollection
97
+
98
+ nc.collection.first # => Taxonifi::Model::Name
99
+ nc.collection.first.name # => "Fooidae"
100
+ nc.collection.first.rank # => "family"
101
+ nc.collection.first.year # => nil
102
+ nc.collection.first.author # => []
103
+ nc.collection.last.rank # => "species"
104
+ nc.collection.last.name # => "foo"
105
+ nc.collection.last.author.first.last_name # => "Smith"
106
+ nc.collection.last.year # => "1854"
107
+ ```
108
+
109
+ Parent/child style nomenclature is also parseable.
110
+
111
+ There are *lots* more examples of code use in the test suite.
112
+
113
+ Export/conversion
114
+ -----------------
115
+
116
+ The following is an example that translates a DwC style input format as exported by EOL into tables importable to SpeciesFile. The input file has id, parent, child, vernacular, synonym columns. Data are exported by default to the user's home folder in a taxonifi directory. The export creates 6 tables that can be imported into Species File directly.
117
+
118
+ ```
119
+ require 'taxonifi'
120
+ file = File.expand_path(File.join(File.dirname(__FILE__), 'file_fixtures/Lygaeoidea-csv.tsv'))
121
+
122
+ csv = CSV.read(file, {
123
+ headers: true,
124
+ col_sep: "\t",
125
+ header_converters: :downcase
126
+ } )
127
+
128
+ nc = Taxonifi::Lumper::Lumps::ParentChildNameCollection.name_collection(csv)
129
+ e = Taxonifi::Export::SpeciesFile.new(:nc => nc, :authorized_user_id => 1)
130
+ e.export
131
+ ```
132
+
133
+ You should be able to relatively quickly use the export framework to code new output formats.
134
+
135
+ Reading files
136
+ -------------
137
+
138
+ taxonifi feeds on Ruby's CSV. Read your files with headers set to true and downcased, e.g.:
139
+
140
+ ```
141
+ csv = CSV.read('input/my_data.tab', {
142
+ headers: true,
143
+ header_converters: :downcase,
144
+ col_sep: "\t" } )
145
+ ```
146
+
147
+ Code organization
148
+ -----------------
149
+
150
+ ```
151
+ test # unit tests, quite a few of them
152
+ lib # the main libraries
153
+ lib/assessor # libraries to assess the properties of incoming data
154
+ lib/export # export wrappers
155
+ lib/export/format # one module for each export type
156
+ lib/lumper # code that builds Taxonifi objects
157
+ lib/model # Taxonifi objects
158
+ lib/splitter # a parser/lexer/token suite for breaking down data
159
+ ```
160
+
161
+ Contributing to taxonifi
162
+ ------------------------
163
+
164
+ (this is generic)
165
+
166
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
167
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
168
+ * Fork the project.
169
+ * Start a feature/bugfix branch.
170
+ * Commit and push until you are happy with your contribution.
171
+ * Write unit tests for your code. Changes are good, just as long as tests run clean.
172
+ * All pull requests should test clean.
173
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
174
+
175
+ About
176
+ -----
177
+
178
+ taxonifi is coded by Matt Yoder in consultation with the Species File Group at University of Illinois.
179
+
180
+ Copyright
181
+ ---------
182
+
183
+ Copyright (c) 2012 Illinois Natural History Survey. See LICENSE.txt for
184
+ further details.
185
+
186
+
187
+ [1]: https://secure.travis-ci.org/SpeciesFileGroup/taxonifi.png?branch=master
188
+ [2]: http://travis-ci.org/SpeciesFileGroup/taxonifi?branch=master
189
+ [7]: https://gemnasium.com/SpeciesFileGroup/taxonifi.png?branch=master
190
+ [8]: https://gemnasium.com/SpeciesFileGroup/taxonifi?branch=master
191
+
192
+
data/Rakefile CHANGED
@@ -1,31 +1,38 @@
1
- # encodang: utf-8
2
-
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
1
+ #!/usr/bin/env rake
2
+
3
+ require 'bundler/gem_tasks'
12
4
  require 'rake'
5
+ require 'rake/testtask'
6
+ require 'taxonifi/version'
7
+
8
+ # require 'rubygems'
9
+ # require 'bundler'
10
+
11
+ # begin
12
+ # Bundler.setup(:default, :development)
13
+ # rescue Bundler::BundlerError => e
14
+ # $stderr.puts e.message
15
+ # $stderr.puts "Run `bundle install` to install missing gems"
16
+ # exit e.status_code
17
+ # end
18
+
19
+ #require 'jeweler'
20
+
21
+ # Jeweler::Tasks.new do |gem|
22
+ # # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
23
+ # gem.name = "taxonifi"
24
+ # gem.homepage = "http://github.com/SpeciesFile/taxonifi"
25
+ # gem.license = "MIT"
26
+ # gem.summary = %Q{A general purpose framework for scripted handling of taxonomic names}
27
+ # gem.description = %Q{Taxonifi contains simple models and utilties of use in for parsing lists of taxonomic name (life) related metadata}
28
+ # gem.email = "diapriid@gmail.com"
29
+ # gem.authors = ["mjy"]
30
+ # # dependencies defined in Gemfile
31
+ # end
32
+
33
+ # Jeweler::RubygemsDotOrgTasks.new
13
34
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "taxonifi"
18
- gem.homepage = "http://github.com/SpeciesFile/taxonifi"
19
- gem.license = "MIT"
20
- gem.summary = %Q{A general purpose framework for scripted handling of taxonomic names}
21
- gem.description = %Q{Taxonifi contains simple models and utilties of use in for parsing lists of taxonomic name (life) related metadata}
22
- gem.email = "diapriid@gmail.com"
23
- gem.authors = ["mjy"]
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
35
 
28
- require 'rake/testtask'
29
36
  Rake::TestTask.new(:test) do |test|
30
37
  test.libs << 'lib' << 'test'
31
38
  test.pattern = 'test/**/test_*.rb'
@@ -43,11 +50,13 @@ end
43
50
  task :default => :test
44
51
 
45
52
  require 'rdoc/task'
53
+
46
54
  Rake::RDocTask.new do |rdoc|
47
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
55
+ version = Taxonifi::VERSION
48
56
 
49
57
  rdoc.rdoc_dir = 'rdoc'
50
58
  rdoc.title = "taxonifi #{version}"
51
59
  rdoc.rdoc_files.include('README*')
52
60
  rdoc.rdoc_files.include('lib/**/*.rb')
61
+
53
62
  end
@@ -7,7 +7,7 @@ module Taxonifi::Export
7
7
  # See https://phenoscape.svn.sourceforge.net/svnroot/phenoscape/trunk/vocab/taxonomic_rank.obo
8
8
  # Site: https://www.phenoscape.org/wiki/Taxonomic_Rank_Vocabulary
9
9
  # Values of -1 have no correspondance in that ontology.
10
- # Nt all values are supported. Not all values are included.
10
+ # Not all values are supported. Not all values are included.
11
11
  TAXRANKS = {
12
12
  'taxonomic_rank' => 0,
13
13
  'variety' => 16,