taxonifi 0.2.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +59 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +5 -17
  5. data/Gemfile.lock +22 -40
  6. data/README.md +192 -0
  7. data/Rakefile +35 -26
  8. data/lib/export/format/base.rb +1 -1
  9. data/lib/export/format/species_file.rb +154 -152
  10. data/lib/lumper/clump.rb +1 -1
  11. data/lib/lumper/lumper.rb +22 -18
  12. data/lib/lumper/lumps/parent_child_name_collection.rb +1 -2
  13. data/lib/lumper/name_index.rb +21 -0
  14. data/lib/{models → model}/author_year.rb +2 -2
  15. data/lib/{models → model}/base.rb +35 -5
  16. data/lib/{models → model}/collection.rb +8 -1
  17. data/lib/{models → model}/name.rb +128 -36
  18. data/lib/{models → model}/name_collection.rb +134 -33
  19. data/lib/{models → model}/person.rb +1 -1
  20. data/lib/{models → model}/ref.rb +4 -2
  21. data/lib/model/ref_collection.rb +171 -0
  22. data/lib/{models → model}/species_name.rb +24 -3
  23. data/lib/splitter/builder.rb +1 -1
  24. data/lib/splitter/parser.rb +5 -0
  25. data/lib/splitter/tokens.rb +54 -9
  26. data/lib/taxonifi/version.rb +3 -0
  27. data/lib/taxonifi.rb +5 -9
  28. data/taxonifi.gemspec +29 -99
  29. data/test/helper.rb +1 -1
  30. data/test/test_exporter.rb +1 -1
  31. data/test/test_lumper_names.rb +9 -9
  32. data/test/test_lumper_refs.rb +4 -4
  33. data/test/test_parser.rb +97 -26
  34. data/test/test_splitter_tokens.rb +25 -4
  35. data/test/test_taxonifi_base.rb +1 -1
  36. data/test/test_taxonifi_geog.rb +1 -1
  37. data/test/test_taxonifi_name.rb +13 -14
  38. data/test/test_taxonifi_name_collection.rb +11 -5
  39. data/test/test_taxonifi_ref.rb +1 -1
  40. data/test/test_taxonifi_ref_collection.rb +40 -3
  41. data/test/test_taxonifi_species_name.rb +51 -1
  42. data/travis/before_install.sh +2 -0
  43. metadata +96 -66
  44. data/README.rdoc +0 -154
  45. data/VERSION +0 -1
  46. data/lib/models/ref_collection.rb +0 -107
  47. /data/lib/{models → model}/generic_object.rb +0 -0
  48. /data/lib/{models → model}/geog.rb +0 -0
  49. /data/lib/{models → model}/geog_collection.rb +0 -0
  50. /data/lib/{models → model}/shared_class_methods.rb +0 -0
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: taxonifi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
5
- prerelease:
4
+ version: 0.3.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - mjy
@@ -12,101 +11,106 @@ cert_chain: []
12
11
  date: 2013-03-27 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
- name: rdoc
14
+ name: rake
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
- version: '3.12'
22
- type: :development
19
+ version: '10.4'
20
+ type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ~>
24
+ - - "~>"
28
25
  - !ruby/object:Gem::Version
29
- version: '3.12'
26
+ version: '10.4'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: bundler
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>'
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.9'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.9'
41
+ - !ruby/object:Gem::Dependency
42
+ name: awesome_print
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
36
46
  - !ruby/object:Gem::Version
37
- version: 1.0.0
47
+ version: '1.6'
38
48
  type: :development
39
49
  prerelease: false
40
50
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
51
  requirements:
43
- - - ! '>'
52
+ - - "~>"
44
53
  - !ruby/object:Gem::Version
45
- version: 1.0.0
54
+ version: '1.6'
46
55
  - !ruby/object:Gem::Dependency
47
- name: jeweler
56
+ name: did_you_mean
48
57
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
58
  requirements:
51
- - - ~>
59
+ - - "~>"
52
60
  - !ruby/object:Gem::Version
53
- version: 1.8.3
61
+ version: '0.9'
54
62
  type: :development
55
63
  prerelease: false
56
64
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
65
  requirements:
59
- - - ~>
66
+ - - "~>"
60
67
  - !ruby/object:Gem::Version
61
- version: 1.8.3
68
+ version: '0.9'
62
69
  - !ruby/object:Gem::Dependency
63
- name: activerecord
70
+ name: byebug
64
71
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
72
  requirements:
67
- - - '='
73
+ - - "~>"
68
74
  - !ruby/object:Gem::Version
69
- version: 3.2.8
75
+ version: '4.0'
70
76
  type: :development
71
77
  prerelease: false
72
78
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
79
  requirements:
75
- - - '='
80
+ - - "~>"
76
81
  - !ruby/object:Gem::Version
77
- version: 3.2.8
82
+ version: '4.0'
78
83
  - !ruby/object:Gem::Dependency
79
- name: debugger
84
+ name: builder
80
85
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
86
  requirements:
83
- - - ! '>='
87
+ - - "~>"
84
88
  - !ruby/object:Gem::Version
85
- version: '0'
89
+ version: '3.2'
86
90
  type: :development
87
91
  prerelease: false
88
92
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
93
  requirements:
91
- - - ! '>='
94
+ - - "~>"
92
95
  - !ruby/object:Gem::Version
93
- version: '0'
96
+ version: '3.2'
94
97
  description: Taxonifi contains simple models and utilities of use for parsing lists
95
- of taxonomic name (life) related metadata
98
+ of taxonomic name (life) related metadata or other hierarchically defined data.
96
99
  email: diapriid@gmail.com
97
100
  executables: []
98
101
  extensions: []
99
102
  extra_rdoc_files:
100
103
  - LICENSE.txt
101
- - README.rdoc
104
+ - README.md
102
105
  files:
103
- - .document
106
+ - ".document"
107
+ - ".gitignore"
108
+ - ".travis.yml"
104
109
  - Gemfile
105
110
  - Gemfile.lock
106
111
  - LICENSE.txt
107
- - README.rdoc
112
+ - README.md
108
113
  - Rakefile
109
- - VERSION
110
114
  - lib/assessor/assessor.rb
111
115
  - lib/assessor/base.rb
112
116
  - lib/assessor/row_assessor.rb
@@ -118,25 +122,27 @@ files:
118
122
  - lib/lumper/clump.rb
119
123
  - lib/lumper/lumper.rb
120
124
  - lib/lumper/lumps/parent_child_name_collection.rb
121
- - lib/models/author_year.rb
122
- - lib/models/base.rb
123
- - lib/models/collection.rb
124
- - lib/models/generic_object.rb
125
- - lib/models/geog.rb
126
- - lib/models/geog_collection.rb
127
- - lib/models/name.rb
128
- - lib/models/name_collection.rb
129
- - lib/models/person.rb
130
- - lib/models/ref.rb
131
- - lib/models/ref_collection.rb
132
- - lib/models/shared_class_methods.rb
133
- - lib/models/species_name.rb
125
+ - lib/lumper/name_index.rb
126
+ - lib/model/author_year.rb
127
+ - lib/model/base.rb
128
+ - lib/model/collection.rb
129
+ - lib/model/generic_object.rb
130
+ - lib/model/geog.rb
131
+ - lib/model/geog_collection.rb
132
+ - lib/model/name.rb
133
+ - lib/model/name_collection.rb
134
+ - lib/model/person.rb
135
+ - lib/model/ref.rb
136
+ - lib/model/ref_collection.rb
137
+ - lib/model/shared_class_methods.rb
138
+ - lib/model/species_name.rb
134
139
  - lib/splitter/builder.rb
135
140
  - lib/splitter/lexer.rb
136
141
  - lib/splitter/parser.rb
137
142
  - lib/splitter/splitter.rb
138
143
  - lib/splitter/tokens.rb
139
144
  - lib/taxonifi.rb
145
+ - lib/taxonifi/version.rb
140
146
  - lib/utils/array.rb
141
147
  - lib/utils/hash.rb
142
148
  - taxonifi.gemspec
@@ -165,32 +171,56 @@ files:
165
171
  - test/test_taxonifi_ref.rb
166
172
  - test/test_taxonifi_ref_collection.rb
167
173
  - test/test_taxonifi_species_name.rb
174
+ - travis/before_install.sh
168
175
  homepage: http://github.com/SpeciesFile/taxonifi
169
176
  licenses:
170
177
  - MIT
178
+ metadata: {}
171
179
  post_install_message:
172
180
  rdoc_options: []
173
181
  require_paths:
174
182
  - lib
175
183
  required_ruby_version: !ruby/object:Gem::Requirement
176
- none: false
177
184
  requirements:
178
- - - ! '>='
185
+ - - ">="
179
186
  - !ruby/object:Gem::Version
180
187
  version: '0'
181
- segments:
182
- - 0
183
- hash: -2473283969605789743
184
188
  required_rubygems_version: !ruby/object:Gem::Requirement
185
- none: false
186
189
  requirements:
187
- - - ! '>='
190
+ - - ">="
188
191
  - !ruby/object:Gem::Version
189
192
  version: '0'
190
193
  requirements: []
191
194
  rubyforge_project:
192
- rubygems_version: 1.8.25
195
+ rubygems_version: 2.4.5
193
196
  signing_key:
194
- specification_version: 3
195
- summary: A general purpose framework for scripted handling of taxonomic names
196
- test_files: []
197
+ specification_version: 4
198
+ summary: A general purpose framework for scripted handling of taxonomic names or other
199
+ hierarchical metadata.
200
+ test_files:
201
+ - test/file_fixtures/Fossil.csv
202
+ - test/file_fixtures/Lygaeoidea.csv
203
+ - test/file_fixtures/names.csv
204
+ - test/helper.rb
205
+ - test/test_export_prolog.rb
206
+ - test/test_exporter.rb
207
+ - test/test_lumper_clump.rb
208
+ - test/test_lumper_geogs.rb
209
+ - test/test_lumper_hierarchical_collection.rb
210
+ - test/test_lumper_names.rb
211
+ - test/test_lumper_parent_child_name_collection.rb
212
+ - test/test_lumper_refs.rb
213
+ - test/test_obo_nomenclature.rb
214
+ - test/test_parser.rb
215
+ - test/test_splitter.rb
216
+ - test/test_splitter_tokens.rb
217
+ - test/test_taxonifi.rb
218
+ - test/test_taxonifi_accessor.rb
219
+ - test/test_taxonifi_base.rb
220
+ - test/test_taxonifi_geog.rb
221
+ - test/test_taxonifi_name.rb
222
+ - test/test_taxonifi_name_collection.rb
223
+ - test/test_taxonifi_ref.rb
224
+ - test/test_taxonifi_ref_collection.rb
225
+ - test/test_taxonifi_species_name.rb
226
+ has_rdoc:
data/README.rdoc DELETED
@@ -1,154 +0,0 @@
1
- = taxonifi
2
- There will always be "legacy" taxonomic data that needs shuffling around. The taxonifi gem is a suite of general purpose tools that act as a middle layer for data-conversion purposes (e.g. migrating legacy taxonomic databases). Its first application was to convert DwC-style data downloaded from EoL into a Species File. The code is well documented in unit tests; poke around to see if it might be useful. In particular, if you've considered building a collection of regular expressions particular to biodiversity data, look at the Tokens code and related tests.
3
-
4
- Overall, the goal is to provide well-documented (and unit-tested) code that is broadly useful, and vanilla enough to encourage others to fork and hack on their own.
5
-
6
- == Source
7
- Source is available at https://github.com/SpeciesFile/taxonifi . The rdoc API is also viewable at http://taxonifi.speciesfile.org , (though those docs may lag behind commits to github).
8
-
9
- == What's next?
10
-
11
- Before you jump on board you should also check out similar code from the Global Names team at https://github.com/GlobalNamesArchitecture. Future integration and merging of shared functionality is planned. Code will be released in an "early-and-often" approach.
12
-
13
- Taxonifi is presently coded for convenience, not speed (though it's not necessarily slow). It assumes that conversion processes are typically one-offs that can afford to run over a longer period of time (read minutes rather than seconds). Reading, and fully parsing into objects, around 25k rows of nomenclature (class to species, inc. author year, = ~45k names) into memory as Taxonifi objects benchmarks at around 2 minutes. Faster indexing is planned as needed, likely using Redis (see GNA link above).
14
-
15
- = Getting started
16
- taxonifi is coded for Ruby 1.9.3, it has not been tested on earlier versions (though it will certainly not work with 1.8.7).
17
- Using Ruby Version Manager (RVM, https://rvm.io/ ) is highly recommended. You can check your version of Ruby by running "ruby -v" in your terminal.
18
-
19
- To install:
20
-
21
- gem install taxonifi
22
-
23
- In your script
24
-
25
- require 'taxonifi'
26
-
27
-
28
- = Use
29
- == Quick start
30
-
31
- Write some code:
32
-
33
- require 'taxonifi'
34
-
35
- headers = ["a", "B", "c"]
36
- csv_string = CSV.generate() do |csv|
37
- csv << headers
38
- csv << %w{a b c}
39
- end
40
-
41
- csv = CSV.parse(csv_string, {headers: true, header_converters: :downcase})
42
-
43
- # Taxonifi can create generic hierarchical collections based on column headers
44
- c = Taxonifi::Lumper.create_hierarchical_collection(csv, %w{a b c}) # => a Taxonifi::Model::Collection
45
- c.collection.first # => Taxonifi::Model::GenericObject
46
- c.collection.first.name # => "a"
47
- c.collection.last.name # => "c"
48
- c.collection.last.parent.name # => "b"
49
- c.collection.first.row_number # => 0
50
- c.collection.first.rank # => "a"
51
-
52
- # Header order is important:
53
- c = Taxonifi::Lumper.create_hierarchical_collection(csv, %w{c a b}) # => a Taxonifi::Model::Collection
54
- c.collection.first.name # => "c"
55
- c.collection.last.rank # => "c"
56
- c.collection.last.name # => "b"
57
- c.collection.last.parent.name # => "a"
58
-
59
- # Collections of GenericObjects (and some other Taxonifi::Model::Collection based objects like Taxonifi::Model::NameCollection) only include
60
- # unique names, i.e. if a name has a shared parent lineage only the name itself is created, not its parents.
61
- # For example, for:
62
- # a b c
63
- # a d nil
64
- # b nil d
65
- # The collection consists of objects with names a,b,c,d,b,d respectively.
66
- # This makes it very useful for handling not only nomenclatural but other nested data as well.
67
-
68
- There are collections of specific types (e.g. taxonomic names, geographic names):
69
-
70
- string = CSV.generate() do |csv|
71
- csv << %w{family genus species author_year}
72
- csv << ["Fooidae", "Foo", "bar", "Smith, 1854"]
73
- csv << ["Fooidae", "Foo", "foo", "(Smith, 1854)"]
74
- end
75
-
76
- csv = CSV.parse(string, {headers: true})
77
-
78
- nc = Taxonifi::Lumper.create_name_collection(:csv => csv) # => Taxonifi::Model::NameCollection
79
-
80
- nc.collection.first # => Taxonifi::Model::Name
81
- nc.collection.first.name # => "Fooidae"
82
- nc.collection.first.rank # => "family"
83
- nc.collection.first.year # => nil
84
- nc.collection.first.author # => []
85
- nc.collection.last.rank # => "species"
86
- nc.collection.last.name # => "foo"
87
- nc.collection.last.author.first.last_name # => "Smith"
88
- nc.collection.last.year # => "1854"
89
-
90
- Parent/child style nomenclature is also parseable.
91
-
92
- There are *lots* more examples of code use in the test suite.
93
-
94
- == Export/conversion
95
-
96
- The following is an example that translates a DwC style input format as exported by EOL into tables importable to SpeciesFile. The input file has id, parent, child, vernacular, synonym columns. Data are exported by default to a taxonifi directory in the user's home folder. The export creates 6 tables that can be imported into Species File directly.
97
-
98
- require 'taxonifi'
99
- file = File.expand_path(File.join(File.dirname(__FILE__), 'file_fixtures/Lygaeoidea-csv.tsv'))
100
-
101
- csv = CSV.read(file, {
102
- headers: true,
103
- col_sep: "\t",
104
- header_converters: :downcase
105
- } )
106
-
107
- nc = Taxonifi::Lumper::Lumps::ParentChildNameCollection.name_collection(csv)
108
- e = Taxonifi::Export::SpeciesFile.new(:nc => nc, :authorized_user_id => 1)
109
- e.export
110
-
111
- You should be able to use the export framework to code new output formats relatively quickly.
112
-
113
- == Reading files
114
-
115
- taxonifi feeds on Ruby's CSV. Read your files with headers set to true and header names downcased, e.g.:
116
-
117
- csv = CSV.read('input/my_data.tab', {
118
- headers: true,
119
- header_converters: :downcase,
120
- col_sep: "\t" } )
121
-
122
- == Code organization
123
-
124
- test # unit tests, quite a few of them
125
- lib # the main libraries
126
- lib/assessor # libraries to assess the properties of incoming data
127
- lib/export # export wrappers
128
- lib/export/format # one module for each export type
129
- lib/lumper # code that builds Taxonifi objects
129
- lib/models # Taxonifi objects
130
- lib/splitter # a parser/lexer/token suite for breaking down data
132
-
133
- = Contributing to taxonifi
134
-
135
- (this is generic)
136
-
137
- * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
138
- * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
139
- * Fork the project.
140
- * Start a feature/bugfix branch.
141
- * Commit and push until you are happy with your contribution.
142
- * Write unit tests for your code. Changes are good, just as long as tests run clean.
143
- * All pull requests should test clean.
144
- * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
145
-
146
- = About
147
-
148
- taxonifi is coded by Matt Yoder in consultation with the Species File Group at University of Illinois.
149
-
150
- = Copyright
151
-
152
- Copyright (c) 2012 Illinois Natural History Survey. See LICENSE.txt for
153
- further details.
154
-
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.2.0
@@ -1,107 +0,0 @@
1
- module Taxonifi
2
- class RefCollectionError < StandardError; end
3
-
4
- module Model
5
-
6
- # A collection of references.
7
- class RefCollection < Taxonifi::Model::Collection
8
-
9
- # An options index when there is one reference per row.
10
- attr_accessor :row_index
11
-
12
- # Points a Ref#id to an array of Person#ids.
13
- # Built on request.
14
- attr_accessor :author_index
15
-
16
- def initialize(options = {})
17
- super
18
- @row_index = []
19
- @author_index = {}
20
- true
21
- end
22
-
23
- # The instance collection class.
24
- def object_class
25
- Taxonifi::Model::Ref
26
- end
27
-
28
- # The object at a given row.
29
- # TODO: inherit from Collection?
30
- def object_from_row(row_number)
31
- return nil if row_number.nil?
32
- @row_index[row_number]
33
- end
34
-
35
- # Incrementally (re-)assigns the id of every associated author (Person)
36
- # This is only useful if you assume every author is unique.
37
- def enumerate_authors(initial_id = 0)
38
- i = initial_id
39
- collection.each do |r|
40
- r.authors.each do |a|
41
- a.id = i
42
- i += 1
43
- end
44
- end
45
- end
46
-
47
- # Finds unique authors, and combines them, then
48
- # rebuilds author lists using references to the new unique set.
49
- def uniquify_authors(initial_id = 0)
50
- auth_index = {}
51
- unique_authors.each_with_index do |a, i|
52
- a.id = i + initial_id
53
- auth_index.merge!(a.compact_string => a)
54
- end
55
-
56
- collection.each do |r|
57
- new_authors = []
58
- r.authors.inject(new_authors){|ary, a| ary.push(auth_index[a.compact_string])}
59
- r.authors = new_authors
60
- end
61
- true
62
- end
63
-
64
- # Build the author index.
65
- # {Ref#id => [a1#id, ... an#id]}
66
- def build_author_index
67
- collection.each do |r|
68
- @author_index.merge!(r.id => r.authors.collect{|a| a.id ? a.id : -1})
69
- end
70
- end
71
-
72
- # Return an array of the unique author strings in this collection.
73
- def unique_author_strings
74
- auths = {}
75
- collection.each do |r|
76
- r.authors.each do |a|
77
- auths.merge!(a.display_name => nil)
78
- end
79
- end
80
- auths.keys.sort
81
- end
82
-
83
- # Returns Array of Taxonifi::Model::Person
84
- # Will need better indexing on big lists?
85
- def unique_authors
86
- auths = []
87
- collection.each do |r|
88
- r.authors.each do |a|
89
- found = false
90
- auths.each do |x|
91
- if a.identical?(x)
92
- found = true
93
- next
94
- end
95
- end
96
- if not found
97
- auths.push a.clone
98
- end
99
- end
100
- end
101
- auths
102
- end
103
-
104
- end
105
- end
106
-
107
- end
File without changes
File without changes
File without changes
File without changes