taxonifi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +155 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/assessor/assessor.rb +31 -0
- data/lib/assessor/base.rb +17 -0
- data/lib/assessor/row_assessor.rb +131 -0
- data/lib/export/export.rb +9 -0
- data/lib/export/format/base.rb +43 -0
- data/lib/export/format/species_file.rb +341 -0
- data/lib/lumper/lumper.rb +334 -0
- data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
- data/lib/models/author_year.rb +39 -0
- data/lib/models/base.rb +73 -0
- data/lib/models/collection.rb +92 -0
- data/lib/models/generic_object.rb +15 -0
- data/lib/models/geog.rb +59 -0
- data/lib/models/geog_collection.rb +28 -0
- data/lib/models/name.rb +206 -0
- data/lib/models/name_collection.rb +149 -0
- data/lib/models/person.rb +49 -0
- data/lib/models/ref.rb +85 -0
- data/lib/models/ref_collection.rb +106 -0
- data/lib/models/species_name.rb +85 -0
- data/lib/splitter/builder.rb +26 -0
- data/lib/splitter/lexer.rb +70 -0
- data/lib/splitter/parser.rb +54 -0
- data/lib/splitter/splitter.rb +45 -0
- data/lib/splitter/tokens.rb +322 -0
- data/lib/taxonifi.rb +36 -0
- data/test/file_fixtures/Lygaeoidea.csv +801 -0
- data/test/helper.rb +38 -0
- data/test/test_exporter.rb +32 -0
- data/test/test_lumper_geogs.rb +59 -0
- data/test/test_lumper_hierarchical_collection.rb +88 -0
- data/test/test_lumper_names.rb +119 -0
- data/test/test_lumper_parent_child_name_collection.rb +41 -0
- data/test/test_lumper_refs.rb +91 -0
- data/test/test_parser.rb +34 -0
- data/test/test_splitter.rb +27 -0
- data/test/test_splitter_tokens.rb +403 -0
- data/test/test_taxonifi.rb +11 -0
- data/test/test_taxonifi_accessor.rb +61 -0
- data/test/test_taxonifi_geog.rb +51 -0
- data/test/test_taxonifi_name.rb +186 -0
- data/test/test_taxonifi_name_collection.rb +158 -0
- data/test/test_taxonifi_ref.rb +90 -0
- data/test/test_taxonifi_ref_collection.rb +69 -0
- data/test/test_taxonifi_species_name.rb +95 -0
- metadata +167 -0
data/lib/models/name_collection.rb
ADDED
@@ -0,0 +1,149 @@
module Taxonifi
  class NameCollectionError < StandardError; end
  module Model

    # A collection of taxonomic names.
    class NameCollection < Taxonifi::Model::Collection

      attr_accessor :by_name_index
      attr_accessor :ref_collection

      def initialize(options = {})
        super
        @collection = []
        @by_name_index = {} # "foo => [1,2,3]"
        Taxonifi::RANKS.inject(@by_name_index){|hsh, v| hsh.merge!(v => {})}
        @by_name_index['unknown'] = {} # unranked names get dumped in here
        @ref_collection = nil
        true
      end

      def object_class
        Taxonifi::Model::Name
      end

      # Return the highest RANK for which there is no
      # name in this collection.
      def encompassing_rank
        highest = RANKS.size
        @collection.each do |n|
          h = RANKS.index(n.rank)
          highest = h if h < highest
        end
        RANKS[highest - 1]
      end

      # The name objects in the collection at a given rank.
      # TODO: Should index this on add_object
      def names_at_rank(rank)
        raise if !RANKS.include?(rank)
        names = []
        @collection.each do |n|
          names << n if n.rank == rank
        end
        names
      end

      # Returns the id of a matching existing name,
      # or false if there is no match.
      # Matches against name (string) and parents ("identity").
      def name_exists?(name = Taxonifi::Model::Name)
        # Does the name (string) exist?
        rank = name.rank.downcase
        rank ||= 'unknown'
        if by_name_index[rank][name.name]
          # Yes, check to see if parents match
          by_name_index[rank][name.name].each do |id|
            vector = parent_id_vector(id)
            vector.pop
            if vector == parent_id_vector(name.parent.id)
              exists = true
              return id
            end
          end
        end
        false
      end

      # Add an individual name object, indexing it.
      def add_object(obj)
        super
        index_by_name(obj)
        obj
      end

      # Add an individual name object, without indexing it.
      def add_object_pre_indexed(obj)
        super
        index_by_name(obj)
        obj
      end

      # Add the names in a Taxonifi::Model::SpeciesName object
      # as individual Name objects.
      def add_species_name(sn)
        raise "Failed trying to load [#{sn.display_name}]. SpeciesName#genus#parent must be set before using add_species_name." if sn.genus.parent.nil?
        current_parent_id = sn.genus.parent.id
        sn.names.each do |o|
          o.parent = object_by_id(current_parent_id)
          if id = name_exists?(o)
            cp_id = id
          else
            add_object(o)
            cp_id = o.id
          end
          current_parent_id = cp_id
        end
        current_parent_id # return the id of the last name created
      end

      # As #add_species_name, but do
      # not assign ids to the incoming names.
      # TODO: deprecate?
      def add_species_name_unindexed(sn)
        sn.names.each do |o|
          if !name_exists?(o)
            add_object(o)
          end
        end
      end

      # Take the author/years of these names and generate a reference collection.
      # Start the ids assigned to the references with initial_id.
      def generate_ref_collection(initial_id = 0)
        rc = Taxonifi::Model::RefCollection.new(:initial_id => initial_id)
        if collection.size > 0
          uniques = collection.inject({}){|hsh, n| hsh.merge!(n.author_year_string => nil)}.keys.compact
          if uniques.size > 0
            uniques.sort.each_with_index do |r, i|
              next if r.size == 0
              ref = Taxonifi::Model::Ref.new(:author_year => r)
              rc.add_object(ref)
            end
          end
        end
        @ref_collection = rc
      end

      # Assign a reference collection to this name collection.
      # !! Overwrites an existing reference collection, including ones built
      # using generate_ref_collection.
      def ref_collection=(ref_collection)
        @ref_collection = ref_collection if ref_collection.class == Taxonifi::Model::RefCollection
      end

      protected

      # Index the object by name into the
      # @by_name_index variable (this looks like:
      # {"Foo bar" => [1,2,93]})
      def index_by_name(obj)
        rank = obj.rank
        rank ||= 'unknown'
        by_name_index[rank][obj.name] ||= []
        by_name_index[rank][obj.name].push obj.id
      end

    end
  end
end
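
For orientation, a minimal usage sketch of the NameCollection API above (not taken from the gem itself). It assumes that `require 'taxonifi'` loads the models, that Taxonifi::Model::Name accepts :name and :rank options (as the splitter parser later in this diff does), and that id assignment happens in the base Collection class, which is not reproduced in this section.

require 'taxonifi'

nc = Taxonifi::Model::NameCollection.new

genus = Taxonifi::Model::Name.new(name: 'Lygaeus', rank: 'genus')
nc.add_object(genus)                   # adds, then indexes by rank/name in @by_name_index

species = Taxonifi::Model::Name.new(name: 'alboornatus', rank: 'species')
species.parent = genus
nc.add_object(species)

nc.names_at_rank('genus').map(&:name)  # => ["Lygaeus"], per names_at_rank above
nc.generate_ref_collection(1)          # RefCollection built from the names' author/year strings
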
data/lib/models/person.rb
ADDED
@@ -0,0 +1,49 @@
require File.expand_path(File.join(File.dirname(__FILE__), "../models/base.rb"))

module Taxonifi
  module Model

    # Simple Person class.
    # You can store multiple initials and suffixes.
    class Person < Taxonifi::Model::Base
      ATTRIBUTES = [
        :first_name,
        :last_name,
        :initials, # an Array, no periods.
        :suffix    # an Array
      ]

      ATTRIBUTES.each do |a|
        attr_accessor a
      end

      def initialize(options = {})
        opts = {
        }.merge!(options)
        # Check for valid opts prior to building
        build(ATTRIBUTES, opts)
        true
      end

      # Returns a string with data delimited by pipes.
      # Used in identity comparisons.
      def compact_string
        s = [ATTRIBUTES.sort.collect{|a| send(a)}].join("|").downcase.gsub(/\s/, '')
      end

      # Nothing fancy, just the data.
      def display_name
        [@last_name, @first_name, @initials, @suffix].compact.flatten.join(" ")
      end

      # Return a string representing the initials, periods added.
      def initials_string
        if @initials.nil?
          nil
        else
          @initials.join(".") + "."
        end
      end
    end
  end
end
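
A small illustrative example (not from the gem) of the Person model, assuming Base#build simply assigns the listed ATTRIBUTES from the options hash, as the other models in this diff do.

p = Taxonifi::Model::Person.new(last_name: 'Slater', initials: ['J', 'A'])
p.display_name     # => "Slater J A"
p.initials_string  # => "J.A."
p.compact_string   # lower-cased, pipe-delimited identity string (see above)
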
data/lib/models/ref.rb
ADDED
@@ -0,0 +1,85 @@
module Taxonifi
  class RefError < StandardError; end
  module Model

    # A basic reference object.
    class Ref < Taxonifi::Model::Base

      # These attributes are set automatically on #new()
      ATTRIBUTES = [
        :authors,
        :title,
        :year,
        :publication,
        :volume,
        :number,
        :pages,
        :pg_start,
        :pg_end,
        :cited_page,
        :full_citation
      ]

      # Array of Taxonifi::Model::Person
      attr_accessor :authors
      # String
      attr_accessor :title
      # String
      attr_accessor :year
      # String
      attr_accessor :publication
      # String
      attr_accessor :volume
      # String
      attr_accessor :number
      # String. Anything that doesn't fit in a page range.
      attr_accessor :pages
      # String
      attr_accessor :pg_start
      # String
      attr_accessor :pg_end
      # String. Some specific page(s) of note.
      attr_accessor :cited_page
      # String. The full text of the citation, as read from input or assigned, not computed from individual components.
      attr_accessor :full_citation

      # String. Computed index based on existing Ref#authors and Ref#year
      attr_accessor :author_year_index

      # If :author_year is passed it is broken down into People + year.
      def initialize(options = {})
        opts = {
        }.merge!(options)
        @parent = nil
        build(ATTRIBUTES, opts)
        @authors = [] if @authors.nil?
        raise Taxonifi::RefError, 'If :author_year is provided then authors and year must not be.' if opts[:author_year] && (!opts[:year].nil? || !opts[:authors].nil?)
        add_author_year(opts[:author_year]) if !opts[:author_year].nil? && opts[:author_year].size > 0
        true
      end

      def add_author_year(string)
        auth_yr = Taxonifi::Splitter::Builder.build_author_year(string)
        @year = auth_yr.year
        @authors = auth_yr.people
      end

      # Returns a pipe delimited representation of the reference.
      def compact_string
        s = [authors.collect{|a| a.compact_string}.join, year, self.title, publication, volume, number, pages, pg_start, pg_end, cited_page].join("|").downcase.gsub(/\s/, '')
        s
      end

      # Return a by author_year index.
      def author_year_index
        @author_year_index ||= generate_author_year_index
      end

      # (re-) generate the author year index.
      def generate_author_year_index
        @author_year_index = Taxonifi::Model::AuthorYear.new(people: @authors, year: @year).compact_index
      end

    end
  end
end
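
An illustrative sketch (not from the gem) of Ref#new with the :author_year shortcut. The actual parse depends on the token definitions in data/lib/splitter/tokens.rb, which this section does not show, so the input format used here is an assumption.

ref = Taxonifi::Model::Ref.new(
  author_year: 'Smith, 1920',
  title:       'On some Lygaeoidea',
  publication: 'Journal of Natural History',
  volume:      '12'
)
ref.year            # 1920, parsed from :author_year (assuming the AuthorYear token accepts this form)
ref.authors         # Array of Taxonifi::Model::Person built by the same call
ref.compact_string  # pipe-delimited identity string used for de-duplication
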
data/lib/models/ref_collection.rb
ADDED
@@ -0,0 +1,106 @@
module Taxonifi
  class RefCollectionError < StandardError; end

  module Model

    # A collection of references.
    class RefCollection < Taxonifi::Model::Collection

      # An optional index, used when there is one reference per row.
      attr_accessor :row_index

      # Points a Ref#id to an array of Person#ids.
      # Built on request.
      attr_accessor :author_index

      def initialize(options = {})
        super
        @row_index = []
        @author_index = {}
        true
      end

      # The instance collection class.
      def object_class
        Taxonifi::Model::Ref
      end

      # The object at a given row.
      # TODO: inherit from Collection?
      def object_from_row(row_number)
        @row_index[row_number]
      end

      # Incrementally (re-)assigns the id of every associated author (Person).
      # This is only really useful if you assume every author is unique.
      def enumerate_authors(initial_id = 0)
        i = initial_id
        collection.each do |r|
          r.authors.each do |a|
            a.id = i
            i += 1
          end
        end
      end

      # Finds unique authors, and combines them, then
      # rebuilds author lists using references to the new unique set.
      def uniquify_authors(initial_id = 0)
        auth_index = {}
        unique_authors.each_with_index do |a, i|
          a.id = i + initial_id
          auth_index.merge!(a.compact_string => a)
        end

        collection.each do |r|
          new_authors = []
          r.authors.inject(new_authors){|ary, a| ary.push(auth_index[a.compact_string])}
          r.authors = new_authors
        end
        true
      end

      # Build the author index.
      # {Ref#id => [a1#id, ... an#id]}
      def build_author_index
        collection.each do |r|
          @author_index.merge!(r.id => r.authors.collect{|a| a.id ? a.id : -1})
        end
      end

      # Return an array of the unique author strings in this collection.
      def unique_author_strings
        auths = {}
        collection.each do |r|
          r.authors.each do |a|
            auths.merge!(a.display_name => nil)
          end
        end
        auths.keys.sort
      end

      # Returns an Array of Taxonifi::Model::Person.
      # Will need better indexing on big lists?
      def unique_authors
        auths = []
        collection.each do |r|
          r.authors.each do |a|
            found = false
            auths.each do |x|
              if a.identical?(x)
                found = true
                next
              end
            end
            if not found
              auths.push a.clone
            end
          end
        end
        auths
      end

    end
  end

end
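
A hedged sketch of author de-duplication with the collection above. It assumes Collection#add_object (defined in the base class, not shown in this section) appends and assigns ids, and that Person#identical? compares people by their compact_string.

rc = Taxonifi::Model::RefCollection.new
rc.add_object(Taxonifi::Model::Ref.new(author_year: 'Smith, 1920'))
rc.add_object(Taxonifi::Model::Ref.new(author_year: 'Smith, 1922'))

rc.unique_author_strings  # one display string shared by both refs
rc.uniquify_authors(1)    # de-duplicates Person objects, ids starting at 1
rc.build_author_index     # fills @author_index with Ref#id => [Person#id, ...]
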
data/lib/models/species_name.rb
ADDED
@@ -0,0 +1,85 @@
module Taxonifi
  class SpeciesNameError < StandardError; end
  module Model

    # The species name model is just a pointer to 5 Taxonifi::Model::Names.
    # The various metadata (author, year, original combination) is stored with the individual
    # instances of those names.
    # Taxonifi::Model::Names have no ids!

    class SpeciesName < Taxonifi::Model::Base
      ATTRIBUTES = [:genus, :subgenus, :species, :subspecies, :parent]
      ATTRIBUTES.each do |a|
        attr_accessor a
      end

      def initialize(options = {})
        opts = {
        }.merge!(options)
        build(ATTRIBUTES, opts)
        true
      end

      # Set the genus name.
      def genus=(genus)
        @genus = genus
      end

      # Set the subgenus name.
      def subgenus=(subgenus)
        raise Taxonifi::SpeciesNameError, "Species name must have a Genus name before subgenus can be assigned" if @genus.nil?
        @subgenus = subgenus
        @subgenus.parent = @genus
      end

      # Set the species name.
      def species=(species)
        raise Taxonifi::SpeciesNameError, "Species name must have a Genus name before species can be assigned" if @genus.nil?
        @species = species
        @species.parent = (@subgenus ? @subgenus : @genus)
      end

      # Set the subspecies name.
      def subspecies=(subspecies)
        raise Taxonifi::SpeciesNameError, "Subspecies name must have a species name before species can be assigned" if @species.nil?
        @subspecies = subspecies
        @subspecies.parent = @species
      end

      # Set the parent name.
      def parent=(parent)
        if parent.class != Taxonifi::Model::Name
          raise SpeciesNameError, "Parent is not a Taxonifi::Model::Name."
        end

        if parent.rank.nil? || (Taxonifi::RANKS.index('genus') <= Taxonifi::RANKS.index(parent.rank))
          raise Taxonifi::SpeciesNameError, "Parents of SpeciesNames must have rank higher than Genus."
        end

        @parent = parent
      end

      # Return an array of Name objects.
      def names
        ATTRIBUTES.collect{|a| self.send(a)}.compact
      end

      # Return a string representation of the species name.
      def display_name
        strs = []
        self.names.each do |n|
          case n.rank
          when 'subgenus'
            strs.push "(#{n.name})"
          else
            strs.push n.name
          end
        end
        strs.push self.names.last.author_year
        txt = strs.compact.join(" ")
        txt
      end
    end
  end
end
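
A hedged assembly example (not from the gem). Order matters: the setters above require a genus before subgenus/species and a species before subspecies, because each setter wires the parent of the incoming Name. It assumes Name accepts :name and :rank options, as in the splitter parser below.

sn = Taxonifi::Model::SpeciesName.new
sn.genus   = Taxonifi::Model::Name.new(name: 'Lygaeus', rank: 'genus')
sn.species = Taxonifi::Model::Name.new(name: 'alboornatus', rank: 'species')

sn.names.map(&:rank)  # => ["genus", "species"]
sn.display_name       # "Lygaeus alboornatus", plus the species author/year when set
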
data/lib/splitter/builder.rb
ADDED
@@ -0,0 +1,26 @@
# Builder functionality for parsing/lexing framework.
module Taxonifi::Splitter::Builder

  # Load all builders (= models)
  # TODO: perhaps use a different scope that doesn't require loading all at once
  Dir.glob( File.expand_path(File.join(File.dirname(__FILE__), "../models/*.rb") )) do |file|
    require file
  end

  # Build and return Taxonifi::Model::AuthorYear from a string.
  def self.build_author_year(text)
    lexer = Taxonifi::Splitter::Lexer.new(text)
    builder = Taxonifi::Model::AuthorYear.new
    Taxonifi::Splitter::Parser.new(lexer, builder).parse_author_year
    builder
  end

  # Build and return Taxonifi::Model::SpeciesName from a string.
  def self.build_species_name(text)
    lexer = Taxonifi::Splitter::Lexer.new(text, :species_name)
    builder = Taxonifi::Model::SpeciesName.new
    Taxonifi::Splitter::Parser.new(lexer, builder).parse_species_name
    builder
  end

end
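
A brief usage sketch of the two Builder entry points. What the lexer accepts is governed by data/lib/splitter/tokens.rb (not reproduced in this section), so the example strings are assumptions about the supported formats.

ay = Taxonifi::Splitter::Builder.build_author_year('Smith, 1920')
ay.year    # Integer year set by Parser#parse_author_year
ay.people  # Array of Taxonifi::Model::Person

sn = Taxonifi::Splitter::Builder.build_species_name('Lygaeus alboornatus Smith, 1920')
sn.display_name
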
data/lib/splitter/lexer.rb
ADDED
@@ -0,0 +1,70 @@
#
# Lexer taken verbatim from OboParser and other mjy gems.
#
class Taxonifi::Splitter::Lexer
  attr_reader :input, :token_list
  def initialize(input, token_list = nil)

    raise Taxonifi::Splitter::SplitterError, "Invalid token list passed to Lexer." if (!token_list.nil? && !Taxonifi::Splitter::TOKEN_LISTS.include?(token_list) )
    token_list = :global_token_list if token_list.nil?

    @input = input
    @token_list = token_list
    @next_token = nil
  end

  # Checks whether the next token is of the specified class.
  def peek(token_class, token_list = nil)
    token = read_next_token(token_class)
    return token.class == token_class
  end

  # Return (and delete) the next token from the input stream, or raise an exception
  # if the next token is not of the given class.
  def pop(token_class)
    token = read_next_token(token_class)
    @next_token = nil
    if token.class != token_class
      raise(Taxonifi::Splitter::SplitterError, "expected #{token_class.to_s} but received #{token.class.to_s} at #{@input[0..10]}...", caller)
    else
      return token
    end
  end

  private

  # Read (and store) the next token from the input, if it has not already been read.
  def read_next_token(token_class)
    if @next_token
      return @next_token
    else
      # check for a match on the specified class first
      if match(token_class)
        return @next_token
      else
        # now check all the tokens for a match
        Taxonifi::Splitter::Tokens.send(@token_list).each {|t|
          return @next_token if match(t)
        }
      end
      # no match, either end of string or lex-error
      if @input != ''
        raise(Taxonifi::Splitter::SplitterError, "Lexer Error, unknown token at |#{@input[0..20]}...", caller)
      else
        return nil
      end
    end
  end

  # Match a token to the input.
  def match(token_class)
    if (m = token_class.regexp.match(@input))
      @next_token = token_class.new(m[1])
      @input = @input[m.end(0)..-1]
      return true
    else
      return false
    end
  end

end
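
A minimal peek/pop example. The token classes come from data/lib/splitter/tokens.rb, which is not included in this section; Tokens::AuthorYear is used here because the parser below pops it the same way, and the input string is only an assumption about what that token matches.

lexer = Taxonifi::Splitter::Lexer.new('Smith, 1920')
if lexer.peek(Taxonifi::Splitter::Tokens::AuthorYear)
  t = lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)  # consumes the matched input
  t.year
end
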
data/lib/splitter/parser.rb
ADDED
@@ -0,0 +1,54 @@
#
# Parser pattern taken from OboParser and other mjy gems.
#
# The parser takes a builder and a lexer and does the actual breakdown.
#
class Taxonifi::Splitter::Parser
  def initialize(lexer, builder )
    @lexer = lexer
    @builder = builder
  end

  # parse out an author year combination.
  # TODO: This is only indirectly tested in lumper code
  def parse_author_year
    t = @lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)

    lexer = Taxonifi::Splitter::Lexer.new(t.authors)
    authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)

    # TODO: A people collection?
    authors.names.each do |a|
      n = Taxonifi::Model::Person.new()
      n.last_name = a[:last_name]
      n.initials = a[:initials]
      @builder.people.push n
    end

    @builder.year = t.year.to_i
    @builder.parens = t.parens
  end

  # Parse a species name
  def parse_species_name
    t = @lexer.pop(Taxonifi::Splitter::Tokens::Quadrinomial)
    ranks = %w{genus subgenus species subspecies}
    names = {}
    last_parent = nil
    ranks.each do |r|
      names.merge!(r: nil)
      @builder.send("#{r}=", Taxonifi::Model::Name.new(:name => t.send(r), rank: r) ) if t.send(r)
    end

    if @lexer.peek(Taxonifi::Splitter::Tokens::AuthorYear)
      t = @lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)
      @builder.names.last.author = t.authors
      @builder.names.last.year = t.year
      @builder.names.last.parens = !t.parens
      @builder.names.last.derive_authors_year
    end

    @builder
  end

end
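
For completeness, a sketch of driving the Parser directly instead of going through Splitter::Builder, mirroring Builder.build_species_name above. The :species_name token list and the Quadrinomial token are defined in data/lib/splitter/tokens.rb, which is not shown here.

lexer   = Taxonifi::Splitter::Lexer.new('Lygaeus alboornatus', :species_name)
builder = Taxonifi::Model::SpeciesName.new
Taxonifi::Splitter::Parser.new(lexer, builder).parse_species_name
builder.species.name  # => "alboornatus"
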
data/lib/splitter/splitter.rb
ADDED
@@ -0,0 +1,45 @@
module Taxonifi

  # An implementation of the parser/lexer/token pattern by Krishna Dole which in turn was based on
  # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library, which has evolved
  # into mjy's obo_parser/nexus_parser libraries.
  module Splitter

    TOKEN_LISTS = [
      :global_token_list,
      :volume_number,
      :pages,
      :species_name
    ]

    class SplitterError < StandardError; end

    require File.expand_path(File.join(File.dirname(__FILE__), 'tokens'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'parser'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'lexer'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'builder'))


    # stub, we might not need
    class Splitter
      def initialize
        true
      end
    end

  end # end Splitter module
end # Taxonifi module


#= Implementation

def do_bar(input)
  @input = input
  raise(Taxonifi::Splitter::SplitterError, "Nothing passed to parse!") if !@input || @input.size == 0

  builder = Taxonifi::Splitter::SplitterBuilder.new
  lexer = Taxonifi::Splitter::Lexer.new(@input)
  Taxonifi::Splitter::Parser.new(lexer, builder).foo
  return builder.bar
end