taxonifi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
 - data/Gemfile +18 -0
 - data/Gemfile.lock +30 -0
 - data/LICENSE.txt +20 -0
 - data/README.rdoc +155 -0
 - data/Rakefile +53 -0
 - data/VERSION +1 -0
 - data/lib/assessor/assessor.rb +31 -0
 - data/lib/assessor/base.rb +17 -0
 - data/lib/assessor/row_assessor.rb +131 -0
 - data/lib/export/export.rb +9 -0
 - data/lib/export/format/base.rb +43 -0
 - data/lib/export/format/species_file.rb +341 -0
 - data/lib/lumper/lumper.rb +334 -0
 - data/lib/lumper/lumps/parent_child_name_collection.rb +84 -0
 - data/lib/models/author_year.rb +39 -0
 - data/lib/models/base.rb +73 -0
 - data/lib/models/collection.rb +92 -0
 - data/lib/models/generic_object.rb +15 -0
 - data/lib/models/geog.rb +59 -0
 - data/lib/models/geog_collection.rb +28 -0
 - data/lib/models/name.rb +206 -0
 - data/lib/models/name_collection.rb +149 -0
 - data/lib/models/person.rb +49 -0
 - data/lib/models/ref.rb +85 -0
 - data/lib/models/ref_collection.rb +106 -0
 - data/lib/models/species_name.rb +85 -0
 - data/lib/splitter/builder.rb +26 -0
 - data/lib/splitter/lexer.rb +70 -0
 - data/lib/splitter/parser.rb +54 -0
 - data/lib/splitter/splitter.rb +45 -0
 - data/lib/splitter/tokens.rb +322 -0
 - data/lib/taxonifi.rb +36 -0
 - data/test/file_fixtures/Lygaeoidea.csv +801 -0
 - data/test/helper.rb +38 -0
 - data/test/test_exporter.rb +32 -0
 - data/test/test_lumper_geogs.rb +59 -0
 - data/test/test_lumper_hierarchical_collection.rb +88 -0
 - data/test/test_lumper_names.rb +119 -0
 - data/test/test_lumper_parent_child_name_collection.rb +41 -0
 - data/test/test_lumper_refs.rb +91 -0
 - data/test/test_parser.rb +34 -0
 - data/test/test_splitter.rb +27 -0
 - data/test/test_splitter_tokens.rb +403 -0
 - data/test/test_taxonifi.rb +11 -0
 - data/test/test_taxonifi_accessor.rb +61 -0
 - data/test/test_taxonifi_geog.rb +51 -0
 - data/test/test_taxonifi_name.rb +186 -0
 - data/test/test_taxonifi_name_collection.rb +158 -0
 - data/test/test_taxonifi_ref.rb +90 -0
 - data/test/test_taxonifi_ref_collection.rb +69 -0
 - data/test/test_taxonifi_species_name.rb +95 -0
 - metadata +167 -0
 
| 
         @@ -0,0 +1,322 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #
         
     | 
| 
      
 2 
     | 
    
         
            +
            # Tokens are simple classes that return a regular expression (pattern to match).
         
     | 
| 
      
 3 
     | 
    
         
            +
            # You should write a test in test_splitter_tokens.rb before defining a token.
         
     | 
| 
      
 4 
     | 
    
         
            +
            # Remember to register tokens in lists at the bottom of this file.
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            module Taxonifi::Splitter::Tokens
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
              # Base class for every token. A subclass stores its pattern in a
              # class-level instance variable (@regexp), read through Token.regexp.
              class Token
                attr_reader :value, :flag

                # Returns the pattern assigned to @regexp in the receiving class body
                # (nil when none was assigned). Using a class instance variable gives
                # each subclass its own pattern without needing a class variable.
                def self.regexp
                  @regexp
                end

                # str - the text this token wraps; stored unchanged and exposed as #value.
                # Nothing in this class assigns @flag, so #flag is nil here.
                def initialize(str)
                  @value = str
                end
              end
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
              # Four consecutive digits at the start of the string, with optional
              # whitespace on either side; the digits are capture group 1.
              class Year < Token
                @regexp = /\A\s*(\d\d\d\d)\s*/i
              end
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
              # An opening parenthesis at the start of the string, with optional
              # whitespace on either side; the parenthesis is capture group 1.
              class LeftParen < Token
                @regexp = /\A\s*(\()\s*/i
              end
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
              # A closing parenthesis at the start of the string, with optional
              # whitespace on either side; the parenthesis is capture group 1.
              class RightParen < Token
                @regexp = /\A\s*(\))\s*/i
              end
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
              # A comma at the start of the string, with optional whitespace on
              # either side; the comma is capture group 1.
              class Comma < Token
                @regexp = /\A\s*(\,)\s*/i
              end
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
              # A token to match an author/year combination. Breaks the string into
              # three parts: #authors (String), #year (Integer, or nil when the text
              # does not end in four digits) and #parens (true when the text contains
              # a parenthesised section, false otherwise).
              #
              # Does not call super, so Token#value is not set.
              class AuthorYear < Token
                attr_reader :authors, :year, :parens
                # This is going to hit just about everything; it should only be used
                # as a one-off when you know you have that kind of string.
                @regexp = /\A\s*(\(?[^\+\d)]+(\d\d\d\d)?\)?)\s*/i

                # str - String such as "Smith, 1920" or "(Smith, 1920)".
                #       It is copied before parsing, so the caller's string is never
                #       modified (frozen strings are therefore accepted too).
                def initialize(str)
                  w = str.strip # String#strip returns a new string

                  # check for parens; when present, only the text inside them is parsed
                  if w =~ /\((.*)\)/
                    w = $1
                    @parens = true
                  else
                    @parens = false
                  end

                  # check for a trailing year and remove it from the working text
                  if w =~ /(\d\d\d\d)\Z/
                    @year = $1.to_i
                    w = w.gsub(/\d\d\d\d\Z/, "").strip
                  end

                  # drop the comma that separated the authors from the year
                  @authors = w.gsub(/,\s*\Z/, '').strip
                end
              end
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
      
 66 
     | 
    
         
            +
              # Complex breakdown of author strings. Handles a wide variety of formats;
              # see test_splitter_tokens.rb for scope. As with AuthorYear, the class-level
              # pattern (any run of non-digits) will match just about anything when used
              # alone. Add exceptions at will, but cover them with a test.
              #
              # After construction #names holds one Hash per author, in input order, with
              #   :last_name - String
              #   :initials  - Array of Strings (only present when initials were found)
              #   :suffix    - String (only present when a known suffix was found)
              #
              # Does not call super, so Token#value is not set.
              #
              # TODO: Unicode the [a-z] bits?
              class Authors < Token
                attr_reader :names
                @regexp = /\A\s*([^\d]+)\s*/i

                # input - String holding one or more author names. It is copied before
                #         parsing, so the caller's string is never modified.
                #
                # Raises RuntimeError when more than 100 passes are needed to pick the
                # individual authors off the string.
                def initialize(input)
                  str = input.strip # new string; every slice!/strip! below works on this copy
                  @names = []
                  naked_and = false # look for the pattern 'Foo, Bar and Smith', i.e. no initials
                  individuals = []
                  last_individual = nil

                  # We can simplify if there is an "and" or &: whatever follows it is
                  # the last individual, whatever precedes it is parsed further below.
                  if str =~ /(\s+and\s+|\&)/i
                    l, r = str.split(/\s+\,?\s*and\s+|\s+\&\s+/i, 2) # added \, \s+
                    last_individual = r
                    str = l
                    naked_and = true
                  end

                  # Look for an exception case, no initials, "and" or "&" previously present, like:
                  #   Foo, Bar and Smith
                  if naked_and && !(str =~ /\./) && str =~ /\s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
                    individuals.unshift str.split(/\s*\,\s*/) # nested array, flattened further down
                    str = nil
                  end

                  # Look for an exception case, no periods and multiple commas, like:
                  #   Foo A, Bar ZA, Smith-Blorf A
                  if str && !naked_and && (str.split(",").size > 2) && !(str =~ /\./)
                    individuals = str.split(",")
                    str = nil
                  end

                  prefix = ['van den ', 'Van ', "O'", "Mc", 'Campos ', 'Costa ']
                  pre_reg = prefix.collect{|p| "(#{Regexp.escape(p)})?"}.join

                  postfix = ['de la', 'von', 'da', 'van', ', Jr.']
                  post_reg = postfix.collect{|p| "(#{Regexp.escape(p)})?"}.join

                  # Initials second
                  m1 = /^\s*(#{pre_reg}             # legal prefix words, includes space if present
                                        [A-Z][a-z]+            # a captialized Name 
                                        (\-[A-Z][a-z]+)?       # optional dashed addition
                                        \s*,\s*                # required comma
                                        (\s*                   #  initials, optionally surrounded by whitescape
                                         (\-)?                 # optional preceeding dash, hits second initials 
                                         [A-Z]                 # required capital initial
                                         (\-)?                 # optional initial dash   
                                         (\-[A-Z])?            # optional dashed initial
                                        \s*\.                  # required period
                                        \s*)              
                                        {1,}                   # repeat initials as necessary
                                        #{post_reg})           # optional legal postfixes
                                    \s*/x

                  # Initials first
                  m2 = /^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/  #  (R. Watson | R.F. Watson),

                  # pick off remaining authors one at a time
                  if str
                    parsing = true
                    passes = 0
                    while parsing
                      individual = ''
                      check_for_more_individuals = false
                      [m2, m1].each do |regex|
                        if str =~ regex
                          individual = $1
                          str.slice!(individual)
                          str.strip!
                          str.slice!(",")
                          individuals.push(individual)
                          check_for_more_individuals = true # at least once match, keep going
                        end
                      end

                      # Neither pattern matched: whatever text is left is taken as one individual.
                      if !check_for_more_individuals
                        if str && str.size != 0
                          individuals.push(str)
                          parsing = false
                        end
                      end

                      passes += 1
                      raise "Authors: could not split author string after 100 passes" if passes > 100
                      parsing = false if str.size == 0
                    end
                  end

                  # Note to remember positive look behind (?<= ) for future hax
                  # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)

                  individuals.push(last_individual) if !last_individual.nil?
                  individuals.flatten!

                  # At this point we have isolated individuals.  Strategy is to slice out initials and remainder is last name.
                  # Initials regex matches any A-B. A. or " A ", "A-B" pattern (including repeats)
                  # TODO: Make a Token
                  match_initials = /(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/

                  # TODO: merge with pre/postfix list
                  suffixes = [
                    /\s(van)\s?/i,
                    /\s(jr\.)/i,
                    /\s(von)\s?/i,
                    /\s(de la)\s?/i,
                    /\s(da)\s?/i,
                  ]

                  individuals.each do |person|
                    a = {}  # new author

                    # Cut the initials out at the position where the pattern matched.
                    # Slicing by the matched *text* instead would remove its first
                    # occurrence, which can sit inside the surname (" Adams A" -> "dams A").
                    initials = person.slice!(match_initials)
                    person.strip! if initials
                    last_name = person

                    # Same reasoning for suffixes: remove capture group 1 where it matched,
                    # not the first occurrence of the same letters ("Evans van" -> "Es van").
                    suffix = []
                    suffixes.each do |s|
                      t = last_name.slice!(s, 1)
                      suffix.push(t) if t
                    end
                    a[:suffix] = suffix.join(" ") if suffix.size > 0

                    last_name.gsub!(/\.|\,/, '')

                    a[:last_name] = last_name.strip if last_name # "if" not fully tested for consequences
                    a[:initials] = initials.strip.split(/\s|\./).collect{|v| v.strip}.select{|x| x.size > 0} if initials && initials.size > 0

                    @names << a
                  end
                end
              end
         
     | 
| 
      
 216 
     | 
    
         
            +
             
     | 
| 
      
 217 
     | 
    
         
            +
              # A token to match volume-number combinations, with various possible formats.
              # The text before an optional ":" or "(" becomes #volume; the text after it
              # (up to an optional ")") becomes #number. Both are whitespace-stripped
              # Strings, or nil when that part is absent.
              #
              # Does not call super, so Token#value is not set.
              class VolumeNumber < Token
                attr_reader :volume, :number

                @regexp = /\A\s*(([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?)\s*/i
                # @regexp = Regexp.new(/\A\s*((\d+)\s*[:\(]?\s*(\d+)?\)?)\s*/i) <- only digits allowed in this version

                # str - String such as "12:3" or "12(3)"; it is not modified.
                def initialize(str)
                  # The original called str.strip and discarded the result; parse the
                  # stripped copy instead so the call actually has an effect.
                  text = str.strip
                  m = /\A\s*([^:(]+)\s*[:\(]?\s*([^:)]+)?\)?\s*/i.match(text)
                  @volume = m && m[1] && m[1].strip
                  @number = m && m[2] && m[2].strip
                end
              end
         
     | 
| 
      
 233 
     | 
    
         
            +
             
     | 
| 
      
 234 
     | 
    
         
            +
              # A token to match page ranges, with remainders noted.
              # #pg_start and #pg_end are the digit Strings on either side of an
              # optional "-" (#pg_end is nil for a single page); #remainder is whatever
              # text follows on the same line after an optional "." or ",".
              # All three are nil when the text does not start with digits.
              #
              # Does not call super, so Token#value is not set.
              class Pages < Token
                attr_reader :pg_start, :pg_end, :remainder
                @regexp = /\A\s*((\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?)/i

                # str - String such as "12-15, figs. 1-3"; it is not modified.
                def initialize(str)
                  # The original called str.strip and discarded the result. Parsing the
                  # stripped copy means #remainder no longer carries trailing whitespace.
                  text = str.strip
                  m = /\A\s*(\d+)\s*[-]?\s*(\d+)?\)?\s*[\.\,]?(.*)?/i.match(text)
                  @pg_start, @pg_end, @remainder = m ? m.captures : []
                end
              end
         
     | 
| 
      
 247 
     | 
    
         
            +
             
     | 
| 
      
 248 
     | 
    
         
            +
              # A token to match quadrinomials.
              # Matches:
              # Foo
              # Foo (Bar)
              # Foo (Bar) stuff
              # Foo (Bar) stuff things
              # Foo stuff
              # Foo stuff things
              # TODO: This will likely erroneously match on authors' names that are uncapitalized, e.g.:
              #   Foo stuff von Helsing, 1920
              #
              # Does not call super, so Token#value is not set.
              class Quadrinomial < Token
                attr_reader :genus, :subgenus, :species, :subspecies
                @regexp = /\A\s*(([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?)\s*/

                # str - String such as "Foo (Bar) stuff things"; it is not modified.
                #
                # Each part that is absent is left nil. #subgenus is stored without
                # its surrounding parentheses.
                def initialize(str)
                  # The original called str.strip and discarded the result; parse the
                  # stripped copy instead so the call actually has an effect.
                  text = str.strip

                  # NOTE(review): unlike the class-level @regexp this pattern carries /i,
                  # so the upper/lower-case distinctions are not enforced here. Left
                  # as-is to preserve behavior -- confirm whether that is intended.
                  m = /\A\s*([A-Z][^\s]+)\s*(\([A-Z][a-z]+\))?\s?([a-z][^\s]+)?\s?([a-z][^\s]+)?\s*/i.match(text)
                  @genus, @subgenus, @species, @subspecies = m ? m.captures : []

                  @subgenus = $1 if @subgenus =~ /\((.*)\)/
                end
              end
         
     | 
| 
      
 275 
     | 
    
         
            +
             
     | 
| 
      
 276 
     | 
    
         
            +
              # !! You must register token lists as symbols in
         
     | 
| 
      
 277 
     | 
    
         
            +
              # !! Taxonifi::Splitter
         
     | 
| 
      
 278 
     | 
    
         
            +
              # 
         
     | 
| 
      
 279 
     | 
    
         
            +
              # Include all tokens in the global_token_list.
         
     | 
| 
      
 280 
     | 
    
         
            +
              # Tokens are matched in order of the list. If you 
         
     | 
| 
      
 281 
     | 
    
         
            +
              # re-order a list, ensure that unit tests still pass.
         
     | 
| 
      
 282 
     | 
    
         
            +
              # Create an untested list at your own risk, any proposed
         
     | 
| 
      
 283 
     | 
    
         
            +
              # ordering will be accepted as long as tests pass.
         
     | 
| 
      
 284 
     | 
    
         
            +
              
         
     | 
| 
      
 285 
     | 
    
         
            +
              # All tokens.
         
     | 
| 
      
 286 
     | 
    
         
            +
              def self.global_token_list
                # Returns a new Array of every token class, in the order listed here.
                tokens = Taxonifi::Splitter::Tokens
                [
                  tokens::Quadrinomial,
                  tokens::LeftParen,
                  tokens::Year,
                  tokens::Comma,
                  tokens::RightParen,
                  tokens::AuthorYear,
                  tokens::Authors,
                  tokens::VolumeNumber,
                  tokens::Pages
                ]
              end
         
     | 
| 
      
 299 
     | 
    
         
            +
             
     | 
| 
      
 300 
     | 
    
         
            +
              # Tokens used in breaking down volume/number ranges.
         
     | 
| 
      
 301 
     | 
    
         
            +
              def self.volume_number
                # Single-element list: only the VolumeNumber token.
                [Taxonifi::Splitter::Tokens::VolumeNumber]
              end
         
     | 
| 
      
 306 
     | 
    
         
            +
             
     | 
| 
      
 307 
     | 
    
         
            +
              # Tokens used in breaking down page ranges.
         
     | 
| 
      
 308 
     | 
    
         
            +
              def self.pages
                # Single-element list: only the Pages token.
                [Taxonifi::Splitter::Tokens::Pages]
              end
         
     | 
| 
      
 313 
     | 
    
         
            +
             
     | 
| 
      
 314 
     | 
    
         
            +
              # Tokens used in breaking down species names.
         
     | 
| 
      
 315 
     | 
    
         
            +
              def self.species_name
                # Quadrinomial first, then AuthorYear.
                tokens = Taxonifi::Splitter::Tokens
                [tokens::Quadrinomial, tokens::AuthorYear]
              end
         
     | 
| 
      
 321 
     | 
    
         
            +
             
     | 
| 
      
 322 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/taxonifi.rb
    ADDED
    
    | 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'csv'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            # Everything in Taxonifi is in here.
         
     | 
| 
      
 4 
     | 
    
         
            +
            module Taxonifi

              # Taxonomic ranks, listed from kingdom down to subspecies.
              RANKS = %w{
                  kingdom
                  phylum
                  class
                  infraclass
                  order
                  suborder
                  infraorder
                  superfamily
                  family
                  subfamily
                  tribe
                  subtribe
                  genus
                  subgenus
                  species
                  subspecies
              }

              # Load the library components, in this order, relative to this file.
              %w{
                  lumper/lumper
                  splitter/splitter
                  assessor/assessor
                  export/export
              }.each do |component|
                require File.expand_path(File.join(File.dirname(__FILE__), component))
              end

              # Load every model. Dir.glob does not guarantee an ordering on every
              # Ruby version and platform, so sort the paths to make the load order
              # deterministic.
              Dir.glob(File.expand_path(File.join(File.dirname(__FILE__), "models/*.rb"))).sort.each do |file|
                require file
              end

            end
         
     |