taxonifi 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +155 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/assessor/assessor.rb +31 -0
- data/lib/assessor/base.rb +17 -0
- data/lib/assessor/row_assessor.rb +131 -0
- data/lib/export/export.rb +9 -0
- data/lib/export/format/base.rb +43 -0
- data/lib/export/format/species_file.rb +341 -0
- data/lib/lumper/lumper.rb +334 -0
- data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
- data/lib/models/author_year.rb +39 -0
- data/lib/models/base.rb +73 -0
- data/lib/models/collection.rb +92 -0
- data/lib/models/generic_object.rb +15 -0
- data/lib/models/geog.rb +59 -0
- data/lib/models/geog_collection.rb +28 -0
- data/lib/models/name.rb +206 -0
- data/lib/models/name_collection.rb +149 -0
- data/lib/models/person.rb +49 -0
- data/lib/models/ref.rb +85 -0
- data/lib/models/ref_collection.rb +106 -0
- data/lib/models/species_name.rb +85 -0
- data/lib/splitter/builder.rb +26 -0
- data/lib/splitter/lexer.rb +70 -0
- data/lib/splitter/parser.rb +54 -0
- data/lib/splitter/splitter.rb +45 -0
- data/lib/splitter/tokens.rb +322 -0
- data/lib/taxonifi.rb +36 -0
- data/test/file_fixtures/Lygaeoidea.csv +801 -0
- data/test/helper.rb +38 -0
- data/test/test_exporter.rb +32 -0
- data/test/test_lumper_geogs.rb +59 -0
- data/test/test_lumper_hierarchical_collection.rb +88 -0
- data/test/test_lumper_names.rb +119 -0
- data/test/test_lumper_parent_child_name_collection.rb +41 -0
- data/test/test_lumper_refs.rb +91 -0
- data/test/test_parser.rb +34 -0
- data/test/test_splitter.rb +27 -0
- data/test/test_splitter_tokens.rb +403 -0
- data/test/test_taxonifi.rb +11 -0
- data/test/test_taxonifi_accessor.rb +61 -0
- data/test/test_taxonifi_geog.rb +51 -0
- data/test/test_taxonifi_name.rb +186 -0
- data/test/test_taxonifi_name_collection.rb +158 -0
- data/test/test_taxonifi_ref.rb +90 -0
- data/test/test_taxonifi_ref_collection.rb +69 -0
- data/test/test_taxonifi_species_name.rb +95 -0
- metadata +167 -0
@@ -0,0 +1,149 @@
|
|
1
|
+
module Taxonifi
  class NameCollectionError < StandardError; end
  module Model

    # A collection of taxonomic names.
    class NameCollection < Taxonifi::Model::Collection

      # Hash. Rank (String) => {name (String) => [Name#ids]}. See #index_by_name.
      attr_accessor :by_name_index
      # A Taxonifi::Model::RefCollection (or nil). See #ref_collection=.
      attr_accessor :ref_collection

      def initialize(options = {})
        super
        @collection = []
        @by_name_index = {} # e.g. {"genus" => {"Foo" => [1,2,3]}}
        Taxonifi::RANKS.inject(@by_name_index) { |hsh, v| hsh.merge!(v => {}) }
        @by_name_index['unknown'] = {} # unranked names get dumped in here
        @ref_collection = nil
        true
      end

      # The class of object this collection contains.
      def object_class
        Taxonifi::Model::Name
      end

      # Return the highest RANK for which there is no
      # name in this collection.
      def encompassing_rank
        highest = RANKS.size
        @collection.each do |n|
          h = RANKS.index(n.rank)
          highest = h if h < highest
        end
        RANKS[highest - 1]
      end

      # The name objects in the collection at a given rank.
      # TODO: Should index this on add_object
      def names_at_rank(rank)
        raise if !RANKS.include?(rank)
        names = []
        @collection.each do |n|
          names << n if n.rank == rank
        end
        names
      end

      # Returns the id of a matching existing name,
      # or false if there is no match.
      # Matches against name (string) and parents ("identity").
      def name_exists?(name = Taxonifi::Model::Name)
        # Unranked names are indexed under 'unknown'. Guard nil before
        # #downcase — previously `name.rank.downcase` raised on a nil rank
        # and the `rank ||= 'unknown'` fallback was unreachable.
        rank = name.rank ? name.rank.downcase : 'unknown'
        if by_name_index[rank][name.name]
          # The name string exists; check whether the parent id vectors match too.
          by_name_index[rank][name.name].each do |id|
            vector = parent_id_vector(id)
            vector.pop # drop the candidate's own id, leaving its ancestry
            return id if vector == parent_id_vector(name.parent.id)
          end
        end
        false
      end

      # Add an individual name object, indexing it by name.
      def add_object(obj)
        super
        index_by_name(obj)
        obj
      end

      # Add an individual name object that already carries an id,
      # still indexing it by name here.
      # NOTE(review): "pre_indexed" presumably refers to the id assignment
      # done in Collection#add_object_pre_indexed — confirm against base class.
      def add_object_pre_indexed(obj)
        super
        index_by_name(obj)
        obj
      end

      # Add the names of a Taxonifi::Model::SpeciesName
      # as individual objects, chaining parents.
      # Returns the id of the last (lowest) name added.
      def add_species_name(sn)
        raise "Failed trying to load [#{sn.display_name}]. SpeciesName#genus#parent must be set before using add_species_name." if sn.genus.parent.nil?
        current_parent_id = sn.genus.parent.id
        sn.names.each do |o|
          o.parent = object_by_id(current_parent_id)
          if (id = name_exists?(o))
            cp_id = id # reuse the existing, identical name
          else
            add_object(o)
            cp_id = o.id
          end
          current_parent_id = cp_id
        end
        current_parent_id # return the id of the last name created
      end

      # As #add_species_name but do
      # not assign ids to the incoming names.
      # TODO: deprecate?
      def add_species_name_unindexed(sn)
        sn.names.each do |o|
          if !name_exists?(o)
            add_object(o)
          end
        end
      end

      # Take the author/years of these names and generate a reference collection.
      # Start the ids assigned to the references with initial_id.
      def generate_ref_collection(initial_id = 0)
        rc = Taxonifi::Model::RefCollection.new(:initial_id => initial_id)
        if collection.size > 0
          # Unique, non-nil author_year strings across the collection.
          uniques = collection.inject({}) { |hsh, n| hsh.merge!(n.author_year_string => nil) }.keys.compact
          if uniques.size > 0
            uniques.sort.each do |r|
              next if r.size == 0
              ref = Taxonifi::Model::Ref.new(:author_year => r)
              rc.add_object(ref)
            end
          end
        end
        @ref_collection = rc
      end

      # Assign a reference collection to this name collection.
      # !! Overwrites existing reference collection, including ones built
      # using generate_ref_collection. Silently ignores non-RefCollection values.
      def ref_collection=(ref_collection)
        @ref_collection = ref_collection if ref_collection.class == Taxonifi::Model::RefCollection
      end

      protected

      # Index the object by rank and name into the
      # @by_name_index variable (which looks like:
      # {"genus" => {"Foo" => [1,2,93]}})
      def index_by_name(obj)
        rank = obj.rank
        rank ||= 'unknown'
        by_name_index[rank][obj.name] ||= []
        by_name_index[rank][obj.name].push obj.id
      end

    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "../models/base.rb"))

module Taxonifi
  module Model

    # A simple model of a person (e.g. an author).
    # Multiple initials and suffixes can be stored.
    class Person < Taxonifi::Model::Base
      ATTRIBUTES = [
        :first_name,
        :last_name,
        :initials, # an Array, no periods.
        :suffix    # an Array
      ]

      ATTRIBUTES.each { |attribute| attr_accessor attribute }

      def initialize(options = {})
        opts = {}.merge!(options)
        # Check for valid opts prior to building
        build(ATTRIBUTES, opts)
        true
      end

      # A pipe-delimited, lower-cased, whitespace-free rendering of all
      # attributes; used in identity comparisons.
      def compact_string
        ATTRIBUTES.sort.collect { |attribute| send(attribute) }.join("|").downcase.gsub(/\s/, '')
      end

      # Nothing fancy, just the data, space separated.
      def display_name
        parts = [@last_name, @first_name, @initials, @suffix]
        parts.compact.flatten.join(" ")
      end

      # The initials as a single string with periods added,
      # or nil when no initials are stored.
      def initials_string
        return nil if @initials.nil?
        "#{@initials.join('.')}."
      end
    end
  end
end
|
data/lib/models/ref.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
module Taxonifi
  class RefError < StandardError; end
  module Model

    # A basic reference (citation) object.
    class Ref < Taxonifi::Model::Base

      # These attributes are set automatically on #new().
      ATTRIBUTES = [
        :authors,
        :title,
        :year,
        :publication,
        :volume,
        :number,
        :pages,
        :pg_start,
        :pg_end,
        :cited_page,
        :full_citation
      ]

      # Array of Taxonifi::Model::Person.
      attr_accessor :authors
      # String.
      attr_accessor :title
      # String.
      attr_accessor :year
      # String.
      attr_accessor :publication
      # String.
      attr_accessor :volume
      # String.
      attr_accessor :number
      # String. Anything that doesn't fit in a page range.
      attr_accessor :pages
      # String.
      attr_accessor :pg_start
      # String.
      attr_accessor :pg_end
      # String. Some specific page(s) of note.
      attr_accessor :cited_page
      # String. The full text of the citation as read from input or assigned,
      # not computed from the individual components.
      attr_accessor :full_citation

      # String. Computed index based on existing Ref#authors and Ref#year.
      attr_accessor :author_year_index

      # If :author_year is passed it is broken down into People + year;
      # passing :author_year together with :authors or :year raises RefError.
      def initialize(options = {})
        opts = {}.merge!(options)
        @parent = nil
        build(ATTRIBUTES, opts)
        @authors = [] if @authors.nil?
        if opts[:author_year] && !(opts[:year].nil? && opts[:authors].nil?)
          raise Taxonifi::RefError, 'If :author_year is provided then authors and year must not be.'
        end
        ay = opts[:author_year]
        add_author_year(ay) unless ay.nil? || ay.size == 0
        true
      end

      # Split an author/year string and populate @authors (People) and @year from it.
      def add_author_year(string)
        parsed = Taxonifi::Splitter::Builder.build_author_year(string)
        @year = parsed.year
        @authors = parsed.people
      end

      # Returns a pipe-delimited, lower-cased, whitespace-free
      # representation of the reference.
      def compact_string
        [authors.collect { |a| a.compact_string }.join,
         year, self.title, publication, volume, number,
         pages, pg_start, pg_end, cited_page].join("|").downcase.gsub(/\s/, '')
      end

      # Return the by-author_year index, generating it on first use.
      def author_year_index
        @author_year_index ||= generate_author_year_index
      end

      # (Re-)generate the author/year index.
      def generate_author_year_index
        @author_year_index = Taxonifi::Model::AuthorYear.new(people: @authors, year: @year).compact_index
      end

    end
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
module Taxonifi
  class RefCollectionError < StandardError; end

  module Model

    # A collection of references.
    class RefCollection < Taxonifi::Model::Collection

      # An optional index when there is one reference per row.
      attr_accessor :row_index

      # Points a Ref#id to an array of Person#ids.
      # Built on request via #build_author_index.
      attr_accessor :author_index

      def initialize(options = {})
        super
        @row_index = []
        @author_index = {}
        true
      end

      # The instance collection class.
      def object_class
        Taxonifi::Model::Ref
      end

      # The object at a given row.
      # TODO: inherit from Collection?
      def object_from_row(row_number)
        @row_index[row_number]
      end

      # Incrementally (re-)assigns the id of every associated author (Person).
      # This is only really useful if you assume every author is unique.
      def enumerate_authors(initial_id = 0)
        i = initial_id
        collection.each do |r|
          r.authors.each do |a|
            a.id = i
            i += 1
          end
        end
      end

      # Finds unique authors and combines them, then
      # rebuilds each reference's author list using the new unique set.
      def uniquify_authors(initial_id = 0)
        # compact_string => unique Person, with freshly assigned ids.
        auth_index = {}
        unique_authors.each_with_index do |a, i|
          a.id = i + initial_id
          auth_index[a.compact_string] = a
        end

        collection.each do |r|
          r.authors = r.authors.collect { |a| auth_index[a.compact_string] }
        end
        true
      end

      # Build the author index ({Ref#id => [Person#id, ...]});
      # authors without an id are recorded as -1.
      def build_author_index
        collection.each do |r|
          @author_index.merge!(r.id => r.authors.collect { |a| a.id ? a.id : -1 })
        end
      end

      # Return a sorted array of the unique author display names
      # in this collection.
      def unique_author_strings
        auths = {}
        collection.each do |r|
          r.authors.each do |a|
            auths[a.display_name] = nil
          end
        end
        auths.keys.sort
      end

      # Returns Array of Taxonifi::Model::Person, one clone per distinct
      # identity (Person#identical?).
      # Will need better indexing on big lists?
      def unique_authors
        auths = []
        collection.each do |r|
          r.authors.each do |a|
            # any? short-circuits on the first match; the original loop
            # used `next` where `break` was intended and scanned the
            # whole list even after finding a match.
            auths.push(a.clone) unless auths.any? { |x| a.identical?(x) }
          end
        end
        auths
      end

    end
  end

end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module Taxonifi
  class SpeciesNameError < StandardError; end
  module Model

    # The species name model is just a pointer to 5 Taxonifi::Model::Names
    # (genus, subgenus, species, subspecies, parent).
    # The various metadata (author, year, original combination) is stored with the individual
    # instances of those names.
    # Taxonifi::Model::Names have no ids!

    class SpeciesName < Taxonifi::Model::Base
      ATTRIBUTES = [:genus, :subgenus, :species, :subspecies, :parent]
      ATTRIBUTES.each do |a|
        attr_accessor a
      end

      def initialize(options = {})
        opts = {
        }.merge!(options)
        build(ATTRIBUTES, opts)
        true
      end

      # Set the genus name.
      def genus=(genus)
        @genus = genus
      end

      # Set the subgenus name; genus must already be set.
      def subgenus=(subgenus)
        raise Taxonifi::SpeciesNameError, "Species name must have a Genus name before subgenus can be assigned" if @genus.nil?
        @subgenus = subgenus
        @subgenus.parent = @genus
      end

      # Set the species name; genus must already be set.
      # Parent is the subgenus when present, otherwise the genus.
      def species=(species)
        raise Taxonifi::SpeciesNameError, "Species name must have a Genus name before species can be assigned" if @genus.nil?
        @species = species
        @species.parent = (@subgenus ? @subgenus : @genus)
      end

      # Set the subspecies name; species must already be set.
      def subspecies=(subspecies)
        # Message fixed: it previously read "... before species can be assigned".
        raise Taxonifi::SpeciesNameError, "Subspecies name must have a species name before subspecies can be assigned" if @species.nil?
        @subspecies = subspecies
        @subspecies.parent = @species
      end

      # Set the parent name: must be a Taxonifi::Model::Name
      # ranked above genus.
      def parent=(parent)
        if parent.class != Taxonifi::Model::Name
          raise SpeciesNameError, "Parent is not a Taxonifi::Model::Name."
        end

        if parent.rank.nil? || (Taxonifi::RANKS.index('genus') <= Taxonifi::RANKS.index(parent.rank))
          raise Taxonifi::SpeciesNameError, "Parents of SpeciesNames must have rank higher than Genus."
        end

        @parent = parent
      end

      # Return an array of the Name objects set on this instance.
      # NOTE(review): ATTRIBUTES includes :parent, so a set parent is
      # returned last here and therefore rendered by #display_name —
      # confirm this is intended.
      def names
        ATTRIBUTES.collect{|a| self.send(a)}.compact
      end

      # Return a string representation of the species name,
      # e.g. "Genus (Subgenus) species subspecies Author, year".
      def display_name
        strs = []
        self.names.each do |n|
          case n.rank
          when 'subgenus'
            strs.push "(#{n.name})"
          else
            strs.push n.name
          end
        end
        strs.push self.names.last.author_year
        txt = strs.compact.join(" ")
        txt
      end
    end
  end
end
|
85
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Builder functionality for the parsing/lexing framework.
module Taxonifi::Splitter::Builder

  # Load all builders (= models).
  # TODO: perhaps use a different scope that doesn't require loading all at once
  Dir.glob(File.expand_path(File.join(File.dirname(__FILE__), "../models/*.rb"))) do |model_file|
    require model_file
  end

  # Lex `text` and return a populated Taxonifi::Model::AuthorYear.
  def self.build_author_year(text)
    author_year = Taxonifi::Model::AuthorYear.new
    Taxonifi::Splitter::Parser.new(Taxonifi::Splitter::Lexer.new(text), author_year).parse_author_year
    author_year
  end

  # Lex `text` (with the :species_name token list) and return a
  # populated Taxonifi::Model::SpeciesName.
  def self.build_species_name(text)
    species_name = Taxonifi::Model::SpeciesName.new
    Taxonifi::Splitter::Parser.new(Taxonifi::Splitter::Lexer.new(text, :species_name), species_name).parse_species_name
    species_name
  end

end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#
# Lexer taken verbatim from OboParser and other mjy gems.
#
class Taxonifi::Splitter::Lexer
  attr_reader :input, :token_list

  # input      - the String to lex.
  # token_list - Symbol naming one of Taxonifi::Splitter::TOKEN_LISTS;
  #              defaults to :global_token_list. Raises SplitterError on
  #              an unrecognized list.
  def initialize(input, token_list = nil)
    raise Taxonifi::Splitter::SplitterError, "Invalid token list passed to Lexer." if (!token_list.nil? && !Taxonifi::Splitter::TOKEN_LISTS.include?(token_list) )
    token_list = :global_token_list if token_list.nil?

    @input = input
    @token_list = token_list
    @next_token = nil # cache of the most recently matched, unconsumed token
  end

  # Checks whether the next token is of the specified class,
  # without consuming it.
  # (The second parameter was never used; renamed with a leading
  # underscore but kept for call-site compatibility.)
  def peek(token_class, _token_list = nil)
    token = read_next_token(token_class)
    return token.class == token_class
  end

  # Return (and delete) the next token from the input stream, or raise an exception
  # if the next token is not of the given class.
  def pop(token_class)
    token = read_next_token(token_class)
    @next_token = nil # consume the cached token
    if token.class != token_class
      raise(Taxonifi::Splitter::SplitterError, "expected #{token_class.to_s} but received #{token.class.to_s} at #{@input[0..10]}...", caller)
    else
      return token
    end
  end

  private

  # Read (and store) the next token from the input, if it has not already been read.
  # Returns nil at end of input; raises SplitterError when no token matches.
  def read_next_token(token_class)
    if @next_token
      return @next_token
    else
      # check for a match on the specified class first
      if match(token_class)
        return @next_token
      else
        # now check all the tokens of the current list for a match
        Taxonifi::Splitter::Tokens.send(@token_list).each {|t|
          return @next_token if match(t)
        }
      end
      # no match, either end of string or lex-error
      if @input != ''
        raise(Taxonifi::Splitter::SplitterError, "Lexer Error, unknown token at |#{@input[0..20]}...", caller)
      else
        return nil
      end
    end
  end

  # Try to match token_class at the head of the input; on success cache
  # the new token in @next_token and advance @input past the match.
  def match(token_class)
    if (m = token_class.regexp.match(@input))
      @next_token = token_class.new(m[1])
      @input = @input[m.end(0)..-1]
      return true
    else
      return false
    end
  end

end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
#
# Parser pattern taken from OboParser and other mjy gems.
#
# The parser takes a builder and a lexer and does the actual breakdown.
#
class Taxonifi::Splitter::Parser
  def initialize(lexer, builder )
    @lexer = lexer
    @builder = builder
  end

  # Parse out an author/year combination, populating the builder with
  # Person objects, the (Integer) year, and the parens flag.
  # TODO: This is only indirectly tested in lumper code
  def parse_author_year
    t = @lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)

    # Re-lex just the author substring to split out the individual names.
    lexer = Taxonifi::Splitter::Lexer.new(t.authors)
    authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)

    # TODO: A people collection?
    authors.names.each do |a|
      n = Taxonifi::Model::Person.new()
      n.last_name = a[:last_name]
      n.initials = a[:initials]
      @builder.people.push n
    end

    @builder.year = t.year.to_i
    @builder.parens = t.parens
  end

  # Parse a species name (up to a quadrinomial, plus an optional trailing
  # author/year) into the builder; returns the builder.
  def parse_species_name
    t = @lexer.pop(Taxonifi::Splitter::Tokens::Quadrinomial)
    # (Dropped dead locals from the original: a `names` hash that was
    # built with the literal key :r rather than each rank, and an unused
    # `last_parent`.)
    %w{genus subgenus species subspecies}.each do |r|
      @builder.send("#{r}=", Taxonifi::Model::Name.new(:name => t.send(r), rank: r) ) if t.send(r)
    end

    if @lexer.peek(Taxonifi::Splitter::Tokens::AuthorYear)
      t = @lexer.pop(Taxonifi::Splitter::Tokens::AuthorYear)
      @builder.names.last.author = t.authors
      @builder.names.last.year = t.year
      # NOTE(review): the negation here (parens = !t.parens) is the
      # opposite of #parse_author_year's direct assignment — confirm
      # which convention Name#parens expects.
      @builder.names.last.parens = !t.parens
      @builder.names.last.derive_authors_year
    end

    @builder
  end

end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Taxonifi

  # An implementation of the parser/lexer/token pattern by Krishna Dole which in turn was based on
  # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library, which has evolved
  # into mjy's obo_parser/nexus_parser libraries.
  module Splitter

    # The token lists a Lexer may be scoped to.
    TOKEN_LISTS = [
      :global_token_list,
      :volume_number,
      :pages,
      :species_name
    ]

    class SplitterError < StandardError; end

    require File.expand_path(File.join(File.dirname(__FILE__), 'tokens'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'parser'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'lexer'))
    require File.expand_path(File.join(File.dirname(__FILE__), 'builder'))

    # stub, we might not need
    class Splitter
      def initialize
        true
      end
    end

  end # end Splitter module
end # Taxonifi module

#= Implementation

# NOTE(review): apparent template cruft — SplitterBuilder, Parser#foo and
# Builder#bar are not defined anywhere visible, so this method cannot
# currently succeed. Kept as-is apart from the misspelling fix below.
def do_bar(input)
  @input = input
  raise(Taxonifi::Splitter::SplitterError, "Nothing passed to parse!") if !@input || @input.size == 0

  builder = Taxonifi::Splitter::SplitterBuilder.new
  lexer = Taxonifi::Splitter::Lexer.new(@input)
  # Fixed misspelled constant: was `Taxonfi::Splitter::Parser`.
  Taxonifi::Splitter::Parser.new(lexer, builder).foo
  return builder.bar
end
|
45
|
+
|