RubyGems - biodiversity - Versions diffs - 1.0.10 → 3.0.0 - Mend

biodiversity 1.0.10 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

data/.rvmrc +1 -1
data/.travis.yml +7 -0
data/CHANGELOG +42 -0
data/Gemfile +8 -6
data/Gemfile.lock +33 -33
data/README.md +167 -0
data/Rakefile +16 -11
data/VERSION +1 -1
data/bin/parserver +33 -44
data/lib/biodiversity/parser.rb +160 -33
data/lib/biodiversity/parser/scientific_name_canonical.treetop +4 -2
data/lib/biodiversity/parser/scientific_name_clean.treetop +479 -277
data/lib/biodiversity/parser/scientific_name_dirty.treetop +11 -16
data/spec/parser/scientific_name.spec.rb +63 -7
data/spec/parser/scientific_name_clean.spec.rb +76 -24
data/spec/parser/scientific_name_dirty.spec.rb +4 -6
data/spec/parser/test_data.txt +132 -41
data/spec/parser/todo.txt +27 -0
metadata +153 -119
data/README.rdoc +0 -99

data/lib/biodiversity/parser.rb CHANGED

@@ -9,22 +9,44 @@ require 'json'
 module PreProcessor
   NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
   TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
-  TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
-  TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p.\s?p.)\s*$/i
+  TAXON_CONCEPTS2 = /\s+
+                     (\(?s\.\s?s\.|
+                     \(?s\.\s?l\.|
+                     \(?s\.\s?str\.|
+                     \(?s\.\s?lat\.|
+                    sec\.|sec|near)\b.*$/x
+  TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
   NOMEN_CONCEPTS  = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
-  LAST_WORD_JUNK  = /(,\s*|\s+)(von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort.|hort)\s*$/i
+  LAST_WORD_JUNK  = /(,\s*|\s+)
+                    (spp\.|spp|var\.|
+                     var|von|van|ined\.|
+                     ined|sensu|new|non|nec|
+                     nudum|cf\.|cf|sp\.|sp|
+                     ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
   def self.clean(a_string)
-    [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
+    [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
+     TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
       a_string = a_string.gsub(i, '')
     end
     a_string = a_string.tr('ſ','s') #old 's'
     a_string
-  end
+  end
 end
+# Public: Parser which runs in parallel.
+#
+# Examples
+#
+# parser = ParallelParser.new(4)
+# parser.parse(['Betula L.', 'Pardosa moesta'])
 class ParallelParser
+  # Public: Initialize ParallelParser.
+  #
+  # processes_num - an Integer to setup the number of processes (default: nil).
+  #                 If processes number is not set it will be determined
+  #                 automatically.
   def initialize(processes_num = nil)
     require 'parallel'
     cpu_num
@@ -35,11 +57,32 @@ class ParallelParser
     end
   end
+  # Public: Parses an array of scientific names using several processes
+  # in parallel.
+  #
+  # Scientific names are deduplicated in the process, so every string is
+  # parsed only once.
+  #
+  # names_list - takes an Array of scientific names,
+  #              each element should be a String.
+  #
+  # Examples
+  #
+  # parser = ParallelParser.new(4)
+  # parser.parse(['Homo sapiens L.', 'Quercus quercus'])
+  #
+  # Returns a Hash with scientific names as a key, and parsing results as
+  # a value.
   def parse(names_list)
-    parsed = Parallel.map(names_list.uniq, :in_processes => @processes_num) { |n| [n, parse_process(n)] }
+    parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
+      [n, parse_process(n)]
+    end
     parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
   end
+  # Public: Returns the number of cores/CPUs.
+  #
+  # Returns Integer of cores/CPUs.
   def cpu_num
     @cpu_num ||= Parallel.processor_count
   end
@@ -47,7 +90,7 @@ class ParallelParser
   private
   def parse_process(name)
     p = ScientificNameParser.new
-    p.parse(name) rescue {:scientificName => {:parsed => false, :verbatim => name,  :error => 'Parser error'}}
+    p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
   end
 end
@@ -58,15 +101,64 @@ end
 #     @family    = /^\s*[A-Z][a-z]\+viridae|viroidae/i
 #     @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
 #     @genus     = /^\s*[A-Z][a-z]\+virus|viroid/i
-#     @species   = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/i
+#     @species   = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
+#                   viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
 #     @parsed    = nil
 #   end
 # end
 class ScientificNameParser
-  VERSION = open(File.join(File.dirname(__FILE__), '..', '..', 'VERSION')).readline.strip
-  def initialize
+  VERSION = open(File.join(File.dirname(__FILE__),
+                           '..',
+                           '..',
+                           'VERSION')).readline.strip
+  FAILED_RESULT = ->(name) do
+    { scientificName:
+      { parsed: false, verbatim: name.to_s.strip,  error: 'Parser error' }
+    }
+  end
+  def self.version
+    VERSION
+  end
+  def self.fix_case(name_string)
+    name_ary = name_string.split(/\s+/)
+    words_num = name_ary.size
+    res = nil
+    if words_num == 1
+      res = name_ary[0].gsub(/[\(\)\{\}]/, '')
+      if res.size > 1
+        res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
+      else
+        res = nil
+      end
+    else
+      if name_ary[0].size > 1
+        word1 = UnicodeUtils.upcase(name_ary[0][0]) +
+          UnicodeUtils.downcase(name_ary[0][1..-1])
+      else
+        word1 = name_ary[0]
+      end
+      if name_ary[1].match(/^\(/)
+        word2 = name_ary[1].gsub(/\)$/, '') + ')'
+        word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
+          UnicodeUtils.downcase(word2[2..-1])
+      else
+        word2 = UnicodeUtils.downcase(name_ary[1])
+      end
+      res = word1 + ' ' +
+        word2 + ' ' +
+        name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
+      res.strip!
+    end
+    res
+  end
+  def initialize(opts = {})
+    @canonical_with_rank = !!opts[:canonical_with_rank]
     @verbatim = ''
     @clean = ScientificNameCleanParser.new
     @dirty = ScientificNameDirtyParser.new
@@ -75,8 +167,12 @@ class ScientificNameParser
   end
   def virus?(a_string)
-    !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i))
-  end
+    !!(a_string.match(/\sICTV\s*$/) ||
+       a_string.match(/\b(virus|viruses|
+                          phage|phages|viroid|viroids|
+                          satellite|satellites|prion|prions)\b/ix) ||
+       a_string.match(/[A-Z]?[a-z]+virus\b/))
+  end
   def unknown_placement?(a_string)
     !!(a_string.match(/incertae\s+sedis/i) || a_string.match(/inc\.\s*sed\./i))
@@ -85,54 +181,85 @@ class ScientificNameParser
   def parsed
     @parsed
   end
   def parse(a_string)
-    @verbatim = a_string
+    @verbatim = a_string.strip
     a_string = PreProcessor::clean(a_string)
     if virus?(a_string)
-      @parsed = { :verbatim => a_string, :virus => true }
+      @parsed = { verbatim: a_string, virus: true }
     elsif unknown_placement?(a_string)
-      @parsed = { :verbatim => a_string }
+      @parsed = { verbatim: a_string }
     else
-      @parsed = @clean.parse(a_string) || @dirty.parse(a_string) || @canonical.parse(a_string) || { :verbatim => a_string }
+      begin
+        @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
+        unless @parsed
+          index = @dirty.index || @clean.index
+          salvage_match = a_string[0..index].split(/\s+/)[0..-2]
+          salvage_string = salvage_match ? salvage_match.join(' ') : a_string
+          @parsed =  @dirty.parse(salvage_string) ||
+                     @canonical.parse(a_string) ||
+                     { verbatim: a_string }
+        end
+      rescue
+        @parsed = FAILED_RESULT.(@verbatim)
+      end
     end
     def @parsed.verbatim=(a_string)
       @verbatim = a_string
     end
-    def @parsed.all(verbatim = @verbatim)
+    def @parsed.all(opts = {})
+      canonical_with_rank = !!opts[:canonical_with_rank]
       parsed = self.class != Hash
-      res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
+      res = { parsed: parsed, parser_version: ScientificNameParser::VERSION}
       if parsed
         hybrid = self.hybrid rescue false
         res.merge!({
-          :verbatim => @verbatim,
-          :normalized => self.value,
-          :canonical => self.canonical,
-          :hybrid => hybrid,
-          :details => self.details,
-          :parser_run => self.parser_run,
-          :positions => self.pos
+          verbatim: @verbatim,
+          normalized: self.value,
+          canonical: self.canonical,
+          hybrid: hybrid,
+          details: self.details,
+          parser_run: self.parser_run,
+          positions: self.pos
           })
       else
         res.merge!(self)
       end
+      if (canonical_with_rank &&
+          canonical.count(' ') > 1 &&
+          res[:details][0][:infraspecies])
+        ScientificNameParser.add_rank_to_canonical(res)
+      end
       res = {:scientificName => res}
-      res
     end
     def @parsed.pos_json
       self.pos.to_json rescue ''
     end
     def @parsed.all_json
       self.all.to_json rescue ''
     end
     @parsed.verbatim = @verbatim
-    @parsed.all
+    @parsed.all(canonical_with_rank: @canonical_with_rank)
+  end
+  private
+  def self.add_rank_to_canonical(parsed)
+    parts = parsed[:canonical].split(' ')
+    name_ary = parts[0..1]
+    parsed[:details][0][:infraspecies].each do |data|
+      infrasp = data[:string]
+      rank = data[:rank]
+      name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
+    end
+    parsed[:canonical] = name_ary.join(' ')
   end
 end

data/lib/biodiversity/parser/scientific_name_canonical.treetop CHANGED

@@ -40,7 +40,7 @@ grammar ScientificNameCanonical
       end
       def canonical
-        a.canonical + " " + b.canonical + " " + c.canonical
+        a.canonical + " " + c.canonical
       end
       def pos
@@ -58,7 +58,7 @@ grammar ScientificNameCanonical
       end
       def canonical
-        a.canonical + " " + b.canonical
+        a.canonical
       end
       def pos
@@ -110,6 +110,8 @@ grammar ScientificNameCanonical
   end
   rule garbage
+    space "$$g@rbg3$$"
+    /
     space (["',.]) space [^щ]*
     /
     space_hard [^ш]+

data/lib/biodiversity/parser/scientific_name_clean.treetop CHANGED

@@ -1,4 +1,6 @@
 # encoding: UTF-8
+require 'unicode_utils'
 grammar ScientificNameClean
   rule root
@@ -6,19 +8,19 @@ grammar ScientificNameClean
       def value
         a.value.gsub(/\s{2,}/, ' ').strip
       end
       def canonical
         a.canonical.gsub(/\s{2,}/, ' ').strip
       end
       def pos
         a.pos
       end
       def hybrid
         a.hybrid
       end
       def details
         a.details.class == Array ? a.details : [a.details]
       end
@@ -28,25 +30,25 @@ grammar ScientificNameClean
       end
     }
   end
   rule scientific_name_5
     a:multinomial_name space_hard hybrid_character space_hard b:species {
       def value
         a.value + " × " + b.value
       end
       def canonical
         a.canonical + " × " + b.canonical
       end
       def pos
         a.pos.merge(b.pos)
       end
       def hybrid
         true
       end
       def details
         [a.details, b.details.merge({:genus => a.details[:genus]})]
       end
@@ -56,19 +58,19 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.apply(c)
       end
       def canonical
         a.canonical
       end
       def pos
         a.pos.merge(c.pos)
       end
       def hybrid
         a.hybrid
       end
       def details
         a.details.merge(b.details(c))
       end
@@ -76,25 +78,25 @@ grammar ScientificNameClean
     /
     scientific_name_4
   end
   rule scientific_name_4
     a:scientific_name_1 space hybrid_character space b:scientific_name_1 {
       def value
         a.value + " × " + b.value
       end
       def canonical
         a.canonical + " × " + b.canonical
       end
       def pos
         a.pos.merge(b.pos)
       end
       def hybrid
         true
       end
       def details
         [a.details, b.details]
       end
@@ -104,19 +106,19 @@ grammar ScientificNameClean
       def value
         a.value + " × ?"
       end
       def canonical
         a.canonical
       end
       def pos
         a.pos
       end
       def hybrid
         true
       end
       def details
         [a.details, "?"]
       end
@@ -124,25 +126,25 @@ grammar ScientificNameClean
     /
     scientific_name_3
   end
   rule scientific_name_3
     a:hybrid_character space b:scientific_name_2 {
       def  value
         a.value + " " + b.value
       end
       def canonical
         b.canonical
       end
       def pos
         b.pos
       end
       def hybrid
         true
       end
       def details
         b.details
       end
@@ -150,25 +152,25 @@ grammar ScientificNameClean
     /
     scientific_name_2
   end
   rule scientific_name_2
     a:scientific_name_1 space b:status_part {
       def value
         a.value + " " + b.value
       end
       def canonical
         a.canonical
       end
       def pos
         a.pos
       end
       def hybrid
         a.hybrid rescue false
       end
       def details
         a.details.merge(b.details)
       end
@@ -178,12 +180,14 @@ grammar ScientificNameClean
   end
   rule scientific_name_1
+    multiuninomial_name
+    /
     multinomial_name
     /
-    uninomial_name
+    uninomial_name
   end
   rule status_part
     a:status_word space b:status_part {
       def value
@@ -196,7 +200,7 @@ grammar ScientificNameClean
     /
     status_word
   end
   rule status_word
     latin_word [\.] {
       def value
@@ -209,114 +213,239 @@ grammar ScientificNameClean
     #/
     #latin_word
   end
+  rule unparsed
+    .+ space {
+      def value
+        ''
+      end
+      def hybrid
+        false
+      end
+      def canonical
+        ''
+      end
+      def pos
+        {interval.begin => ['unparsed', interval.end]}
+      end
+      def details
+        {:unparsed => text_value}
+      end
+    }
+  end
   rule multinomial_name
-    a:genus space b:infragenus space species_prefix? space c:species space_hard d:infraspecies_mult {
+    a:genus space b:infragenus space aid:annotation_identification? space c:species space_hard d:infraspecies_mult {
       def value
         a.value + " " + b.value + " " + c.value + " " + d.value
       end
       def canonical
         a.canonical + " " + c.canonical + " " + d.canonical
       end
       def pos
         a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
       end
       def hybrid
         c.hybrid rescue false
       end
       def details
         a.details.merge(b.details).merge(c.details).merge(d.details)
       end
     }
-    /
-    a:genus space b:infragenus space species_prefix? space c:species {
+    /
+    a:genus space b:infragenus space aid:annotation_identification? space c:species space aid:annotation_identification space d:infraspecies_mult {
       def value
-        a.value + " " + b.value + " " + c.value
+        a.value + " " + b.value + " " + c.value + " " + d.value
       end
       def canonical
-        a.canonical + " " + c.canonical
+        a.canonical + " " + c.canonical + " " + d.canonical
       end
       def pos
-        a.pos.merge(b.pos).merge(c.pos)
+        a.pos.merge(b.pos).merge(c.pos).merge(d.pos)
+      end
+      def hybrid
+        c.hybrid rescue false
+      end
+      def details
+        a.details.merge(b.details).merge(c.details).merge(d.details)
+      end
+    }
+    /
+    a:genus space b:infragenus space aid:annotation_identification? space c:species {
+      def value
+        if defined? aid.apply
+          a.value + " " + b.value + aid.apply(c)
+        else
+          a.value + " " + b.value + " " + c.value
+        end
       end
+      def canonical
+        if defined? aid.apply
+          a.canonical + aid.canonical(c)
+        else
+          a.canonical + " " + c.canonical
+        end
+      end
+      def pos
+        if defined? aid.apply
+          a.pos.merge(b.pos).merge(aid.pos(c))
+        else
+          a.pos.merge(b.pos).merge(c.pos)
+        end
+      end
       def hybrid
         c.hybrid rescue false
       end
       def details
-        a.details.merge(b.details).merge(c.details)
+        if defined? aid.apply
+          a.details.merge(b.details).merge(aid.apply(c))
+        else
+          a.details.merge(b.details).merge(c.details)
+        end
       end
     }
     /
-    a:genus space species_prefix? space b:species space_hard c:infraspecies_mult {
+    a:genus space aid:annotation_identification? space b:species space_hard c:infraspecies_mult {
       def value
-        a.value + " " + b.value + " " + c.value
+        a.value + " " + b.value + " " + c.value
       end
       def canonical
         a.canonical + " " + b.canonical + " " + c.canonical
       end
       def pos
         a.pos.merge(b.pos).merge(c.pos)
       end
       def hybrid
         b.hybrid rescue false
       end
       def details
         a.details.merge(b.details).merge(c.details)
       end
     }
     /
-    a:genus space species_prefix? space b:species {
+    a:genus space aid:annotation_identification? space b:species {
       def value
-        a.value + " " + b.value
+        if defined? aid.apply
+          a.value + aid.apply(b)
+        else
+          a.value + " " + b.value
+        end
       end
       def canonical
-        a.canonical + " " + b.canonical
+        if defined? aid.apply
+          a.canonical + aid.canonical(b)
+        else
+          a.canonical + " " + b.canonical
+        end
       end
       def pos
-        a.pos.merge(b.pos)
+        if defined? aid.apply
+          a.pos.merge(aid.pos(b))
+        else
+          a.pos.merge(b.pos)
+        end
       end
       def hybrid
         b.hybrid rescue false
       end
       def details
-        a.details.merge(b.details)
+        if defined? aid.apply
+          a.details.merge(aid.details(b))
+        else
+          a.details.merge(b.details)
+        end
+      end
+    }
+    /
+    a:genus space aid:annotation_identification space b:unparsed {
+      def value
+        a.value + aid.apply(b)
+      end
+      def canonical
+        a.canonical + aid.canonical(b)
+      end
+      def pos
+        a.pos.merge(aid.pos(b))
+      end
+      def hybrid
+        false
+      end
+      def details
+        a.details.merge(aid.details(b))
       end
     }
   end
+  rule multiuninomial_name
+    a:uninomial_name space b:rank_uninomial space c:uninomial_name {
+      def value
+        a.value + " " + b.value + " " + c.value
+      end
+      def canonical
+        a.canonical
+      end
+      def hybrid
+        false
+      end
+      def pos
+        a.pos.merge(b.pos(c))
+      end
+      def details
+        a.details.merge(b.details(c))
+      end
+    }
+  end
   rule infraspecies_mult
     a:infraspecies space b:infraspecies_mult {
       def value
         a.value + " " + b.value
       end
       def canonical
         a.canonical + " " + b.canonical
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
-        a_array =  a.details[:infraspecies].class == Array ? a.details[:infraspecies] : [a.details[:infraspecies]]
+        a_array =  a.details[:infraspecies].class == Array ? a.details[:infraspecies] : [a.details[:infraspecies]]
         b_array = b.details[:infraspecies].class == Array ? b.details[:infraspecies] : [b.details[:infraspecies]]
         a.details.merge({:infraspecies => a_array + b_array})
       end
@@ -324,70 +453,98 @@ grammar ScientificNameClean
     /
     infraspecies {
       def details
-        {:infraspecies => [super[:infraspecies]]}
+        if super[:annotation_identification]
+          {:infraspecies => [{:annotation_identification => super[:annotation_identification], :ignored => super[:ignored]}]}
+        else
+          {:infraspecies => [super[:infraspecies]]}
+        end
       end
     }
   end
   rule infraspecies
     a:infraspecies_string space b:authorship {
       def value
         a.value + " " + b.value
       end
       def canonical
         a.canonical
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         {:infraspecies => a.details[:infraspecies].merge(b.details)}
       end
     }
     /
-    infraspecies_string
+    infraspecies_string
   end
   rule infraspecies_string
-    sel:rank space_hard a:species_word {
-      def value
+    sel:rank space a:species_word {
+      def value
         sel.apply(a)
       end
       def canonical
         sel.canonical(a)
       end
       def pos
         sel.pos(a)
       end
       def details
         sel.details(a)
       end
     }
     /
-    species_word ![\.] {
+    aid:annotation_identification space a:species_word ![\.] {
       def value
-        text_value
+        aid.apply(a)
+      end
+      def canonical
+        aid.canonical(a)
+      end
+      def pos
+        def a.pos
+          {interval.begin => ['infraspecies', a.interval.end]}
+        end
+        aid.pos(a)
+      end
+      def details
+        def a.details
+          {:infraspecies => {:string => value, :rank => 'n/a'}}
+        end
+        aid.details(a)
       end
+    }
+    /
+    a:species_word ![\.] {
+      def value
+        a.value
+      end
       def canonical
         value
       end
       def pos
         {interval.begin => ['infraspecies', interval.end]}
       end
       def details
         {:infraspecies => {:string => value, :rank => 'n/a'}}
       end
     }
   end
   rule taxon_concept_rank
     ("sec."/"sensu.") {
       def value
@@ -398,77 +555,70 @@ grammar ScientificNameClean
       end
       def details(a = nil)
         {:taxon_concept => a.details}
-      end
+      end
     }
   end
   rule rank
-    ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
-    /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
+    ("morph."/"f.sp."/"B "/"ssp."/"ssp "/"mut."/"nat "/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var "/"subsp."/"subsp "/"subf."/"race "/"forma "/"fma."/"fma "/"form."/"form "/"fo."/"fo"/"f."/"α"/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
     {
       def value
         text_value.strip
       end
       def apply(a)
-        " " + text_value + " " + a.value
+        " " + text_value.strip + " " + a.value
       end
       def canonical(a)
         " " + a.value
       end
       def pos(a)
-        {interval.begin => ['infraspecific_type', interval.end], a.interval.begin => ['infraspecies', a.interval.end]}
+        interval_end =  text_value[-1] == ' ' ? interval.end - 1 : interval.end
+        {interval.begin => ['infraspecific_type', interval_end], a.interval.begin => ['infraspecies', a.interval.end]}
       end
       def details(a = nil)
-        {:infraspecies => {:string => (a.value rescue nil), :rank => text_value}}
+        {:infraspecies => {:string => (a.value rescue nil), :rank => text_value.strip}}
       end
     }
-    /
-    rank_forma
   end
-  rule rank_forma
-    ("forma"/"form."/"form"/"fo."/"f.")
-    {
+  rule rank_uninomial
+    ("sect."/"sect "/"subsect."/"subsect "/"trib."/"trib "/"subtrib."/"subtrib "/"ser."/"ser "/"subgen."/"subgen "/"fam."/"fam "/"subfam."/"subfam "/"supertrib."/"supertrib ") {
       def value
-        "f."
-      end
-      def apply(a)
-        " " + value + " " + a.value
-      end
-      def canonical(a)
-        " " + a.value
+        text_value.strip
       end
-      def pos(a)
-        {interval.begin => ['infraspecific_type', interval.end], a.interval.begin => ['infraspecies', a.interval.end]}
+      def pos(uni)
+        {interval.begin => ['rank_uninomial', interval.end], uni.interval.begin => ['uninomial', uni.interval.end]}
       end
-      def details(a = nil)
-        {:infraspecies => {:string => (a.value rescue nil), :rank => value}}
+      def details(uni)
+        {:rank_uninomials => value, :uninomial2 => uni.details[:uninomial]}
       end
     }
   end
   rule species
     a:species_string space b:authorship {
       def value
         a.value + " " + b.value
       end
       def canonical
         a.canonical
       end
       def hybrid
         a.hybrid rescue false
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         {:species => a.details[:species].merge(b.details)}
       end
@@ -476,43 +626,21 @@ grammar ScientificNameClean
     /
     species_string
   end
   rule species_string
-    # a:species_word &(space_hard author_prefix_word space_hard) {
-    #   def value
-    #     a.value
-    #   end
-    #
-    #   def canonical
-    #     a.value
-    #   end
-    #
-    #   def hybrid
-    #     a.hybrid rescue false
-    #   end
-    #
-    #   def pos
-    #     {a.interval.begin => ['species', a.interval.end]}
-    #   end
-    #
-    #   def details
-    #     {:species => {:string => a.value}}
-    #   end
-    # }
-    # /
     species_word {
       def canonical
         value
       end
       def pos
         {interval.begin => ['species', interval.end]}
       end
       def hybrid
         false
       end
       def details
         {:species => {:string => value}}
       end
@@ -520,65 +648,85 @@ grammar ScientificNameClean
     /
     species_word_hybrid
   end
   rule infragenus
     left_paren space a:(cap_latin_word/capped_dotted_char) space right_paren {
       def value
         "(" + a.value + ")"
       end
       def canonical
         a.value
       end
       def pos
         {a.interval.begin => ['infragenus', a.interval.end]}
       end
       def details
         {:infragenus => {:string => a.value}}
       end
     }
   end
   rule genus
-    a:uninomial_string !(space_hard author_prefix_word space_hard author_word) {
+    a:(abbreviated_genus/uninomial_string) !(space_hard author_prefix_word space_hard author_word) {
       def value
         a.value
       end
       def pos
         {a.interval.begin => ['genus', a.interval.end]}
       end
       def canonical
         a.value
       end
       def details
         {:genus => {:string => a.value}}
       end
     }
   end
+  rule abbreviated_genus
+    [A-Z] [a-z]? [a-z]? [\\.] space {
+      def value
+        text_value.strip
+      end
+      def canonical
+        value
+      end
+      def pos
+        {interval.begin => ["abbreviated_genus", interval.end]}
+      end
+      def details
+        {:abbreviated_genus => {:string => value}}
+      end
+    }
+  end
   rule uninomial_name
     a:uninomial_string space b:infragenus space c:simple_authorship {
       def value
         a.value + " " + b.value + " " + c.value
       end
       def canonical
         a.canonical
       end
       def pos
         a.pos.merge(b.pos).merge(c.pos)
       end
       def hybrid
         false
       end
       def details
         {:uninomial => a.details[:uninomial].merge(b.details).merge(c.details)}
       end
@@ -588,19 +736,19 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def canonical
         a.canonical
       end
       def pos
         a.pos.merge(b.pos)
       end
       def hybrid
         false
       end
       def details
         {:uninomial => a.details[:uninomial].merge(b.details)}
       end
@@ -610,19 +758,19 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def canonical
         a.canonical
       end
       def pos
         a.pos.merge(b.pos)
       end
       def hybrid
         false
       end
       def details
         {:uninomial => a.details[:uninomial].merge(b.details)}
       end
@@ -636,31 +784,31 @@ grammar ScientificNameClean
       def canonical
         value
       end
       def pos
         {interval.begin => ['uninomial', interval.end]}
       end
       def hybrid
         false
       end
-      def details
+      def details
         {:uninomial => {:string => value}}
       end
     }
   end
   rule authorship
     a:basionym_authorship_with_parenthesis space b:simple_authorship ","? space c:ex_authorship {
       def value
         a.value + " " + b.value + " " + c.value
       end
       def pos
         a.pos.merge(b.pos).merge(c.pos)
       end
       def details
         val = {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
         val[:combinationAuthorTeam].merge!(c.details)
@@ -672,11 +820,11 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         {:authorship => text_value.strip, :combinationAuthorTeam => b.details[:basionymAuthorTeam], :basionymAuthorTeam => a.details[:basionymAuthorTeam]}
       end
@@ -688,11 +836,11 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         val = a.details
         val[:authorship] = text_value.strip
@@ -703,21 +851,21 @@ grammar ScientificNameClean
     /
     simple_authorship
   end
   rule basionym_authorship_with_parenthesis
     left_paren space a:authors_names space right_paren space [,]? space b:year {
       def value
         "(" + a.value + " " + b.value + ")"
       end
       def pos
        a.pos.merge(b.pos)
-      end
+      end
       def details
-        { :authorship => text_value,
-          :basionymAuthorTeam => {:author_team => text_value}.merge(a.details).merge(b.details)
+        { :authorship => text_value,
+          :basionymAuthorTeam => {:author_team => text_value}.merge(a.details).merge(b.details)
           }
       end
     }
@@ -726,11 +874,11 @@ grammar ScientificNameClean
       def value
         "(" + a.value + " " + b.value + ")"
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         val = a.details
         val[:basionymAuthorTeam].merge!(b.details)
@@ -743,15 +891,15 @@ grammar ScientificNameClean
       def value
         "(" + a.value + ")"
       end
       def pos
         a.pos
       end
       def details
         val = a.details
         val[:authorship] = text_value
-        val
+        val
       end
     }
     /
@@ -759,32 +907,32 @@ grammar ScientificNameClean
       def value
         "(?)"
       end
       def pos
         {a.interval.begin => ['unknown_author', a.interval.end]}
       end
       def details
         {:authorship => text_value, :basionymAuthorTeam => {:authorTeam => text_value, :author => ['?']}}
       end
     }
   end
   rule ex_authorship
     ex_sep space b:simple_authorship {
       def value
         " ex " + b.value
       end
       def pos
         b.pos
       end
       def details
         val = {:exAuthorTeam => {:authorTeam => b.text_value.strip}.merge(b.details[:basionymAuthorTeam])}
         val
       end
-    }
+    }
   end
   rule simple_authorship
@@ -792,17 +940,17 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         details_with_arg(:basionymAuthorTeam)
       end
       def details_with_arg(authorTeamType = 'basionymAuthorTeam')
-        { :authorship => text_value,
+        { :authorship => text_value,
           authorTeamType.to_sym => {
             :authorTeam => a.text_value.strip
           }.merge(a.details).merge(b.details)
@@ -814,17 +962,17 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         details_with_arg(:basionymAuthorTeam)
       end
       def details_with_arg(authorTeamType = 'basionymAuthorTeam')
-        { :authorship => text_value,
+        { :authorship => text_value,
           authorTeamType.to_sym => {
             :authorTeam => a.text_value.strip
           }.merge(a.details).merge(b.details)
@@ -838,27 +986,27 @@ grammar ScientificNameClean
         details[:basionymAuthorTeam].merge!(super)
         details
       end
       def details_with_arg(authorTeamType = 'basionymAuthorTeam')
-        { :authorship => text_value,
+        { :authorship => text_value,
           authorTeamType.to_sym => {
             :authorTeam => text_value,
           }
-        }
+        }
       end
     }
   end
   rule authors_names
     a:author_name space sep:author_separator space b:authors_names {
       def value
         sep.apply(a,b)
       end
       def pos
         sep.pos(a,b)
       end
       def details
         sep.details(a,b)
       end
@@ -868,40 +1016,40 @@ grammar ScientificNameClean
     /
     unknown_auth
   end
   rule unknown_auth
-    ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") {
+    ("auct."/"auct"/"hort."/"hort"/"anon."/"anon"/"ht."/"ht") !latin_word {
       def value
         text_value
       end
       def pos
        {interval.begin => ['unknown_author', interval.end]}
       end
       def details
         {:author => ["unknown"]}
       end
     }
   end
   rule ex_sep
     ("ex"/"in") &[\s]
   end
   rule author_separator
-    ("&"/","/"and"/"et") {
+    ("&amp;"/"&"/","/"and"/"et") {
       def apply(a,b)
         sep = text_value.strip
-        sep = " et" if ["&","and","et"].include? sep
+        sep = " &" if ["&amp;", "&","and","et"].include? sep
         a.value + sep + " " + b.value
       end
       def pos(a,b)
         a.pos.merge(b.pos)
       end
       def details(a,b)
         {:author => a.details[:author] + b.details[:author]}
       end
@@ -913,8 +1061,8 @@ grammar ScientificNameClean
       def value
         a.value + ' ' + b.value
       end
-      def pos
+      def pos
         a.pos.merge(b.pos)
       end
@@ -925,17 +1073,17 @@ grammar ScientificNameClean
     /
     author_name_without_postfix
   end
   rule author_name_without_postfix
     space a:author_prefix_word space b:author_name {
       def value
         a.value + " " + b.value
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         {:author => [value]}
       end
@@ -945,11 +1093,11 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def pos
         a.pos.merge(b.pos)
       end
       def details
         {:author => [value]}
       end
@@ -957,17 +1105,17 @@ grammar ScientificNameClean
     /
     author_word
   end
   rule author_word
     "A S. Xu" {
       def value
         text_value.strip
       end
       def pos
         {interval.begin => ['author_word', 1], (interval.begin + 2) => ['author_word', 2], (interval.begin + 5) => ['author_word', 2]}
       end
       def details
         {:author => [value]}
       end
@@ -977,26 +1125,28 @@ grammar ScientificNameClean
       def value
         text_value.strip
       end
       def pos
         #cheating because there are several words in some of them
         {interval.begin => ['author_word', interval.end]}
       end
       def details
         {:author => [value]}
       end
     }
-    /
+    /
     ("Å"/"Ö"/"Á"/"Ø"/"Ô"/"Š"/"Ś"/"Č"/"Ķ"/"Ł"/"É"/"Ž"/[A-W]/[Y-Z]) [^0-9\[\]\(\)\s&,]* {
       def value
-        text_value
+        text_value.gsub(/([\p{Lu}]{3,})/) do |match|
+          UnicodeUtils.titlecase(match)
+        end
       end
       def pos
         {interval.begin => ['author_word', interval.end]}
       end
       def details
         {:author => [value]}
       end
@@ -1006,11 +1156,11 @@ grammar ScientificNameClean
       def value
         text_value
       end
       def pos
         {interval.begin => ['author_word', interval.end]}
       end
       def details
         {:author => [value]}
       end
@@ -1018,13 +1168,13 @@ grammar ScientificNameClean
     /
     author_prefix_word
   end
   rule author_prefix_word
-    space ("ab"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
+    space ("ab"/"af"/"bis"/"da"/"der"/"des"/"den"/"della"/"dela"/"de"/"di"/"du"/"la"/"ter"/"van"/"von") &space_hard {
       def value
         text_value
       end
       def pos
         #cheating because there are several words in some of them
         {interval.begin => ['author_word', interval.end]}
@@ -1034,7 +1184,7 @@ grammar ScientificNameClean
   rule author_postfix_word
     ("f."/"filius") {
-      def value
+      def value
         text_value.strip
       end
@@ -1043,7 +1193,7 @@ grammar ScientificNameClean
       end
     }
   end
   rule cap_latin_word_pair
     a:cap_latin_word "-" b:cap_latin_word {
       def value
@@ -1051,7 +1201,7 @@ grammar ScientificNameClean
       end
     }
   end
   rule cap_latin_word
     a:([A-Z]/cap_digraph) b:latin_word "?" {
       def value
@@ -1091,19 +1241,19 @@ grammar ScientificNameClean
       def value
         a.value + " " + b.value
       end
       def canonical
         b.value
       end
       def hybrid
         true
       end
       def pos
         {b.interval.begin => ['species', b.interval.end]}
       end
       def details
         {:species => {:string => b.value}}
       end
@@ -1113,19 +1263,19 @@ grammar ScientificNameClean
       def value
         "× " + b.value
       end
       def canonical
         b.value
       end
       def hybrid
         true
       end
       def pos
         {b.interval.begin => ['species', b.interval.end]}
       end
       def details
         {:species => {:string => b.value}}
       end
@@ -1135,29 +1285,74 @@ grammar ScientificNameClean
       def value
         "× " + b.value
       end
       def canonical
         b.value
       end
       def hybrid
         true
       end
       def pos
         {b.interval.begin => ['species', b.interval.end]}
       end
       def details
         {:species => {:string => b.value}}
       end
     }
   end
-  rule species_prefix
-    ("aff."/"corrig."/"?") &space_hard
+  rule annotation_identification
+    ("sp.nr."/"sp. nr."/"nr."/"nr "/"sp.aff."/"sp. aff."/"sp."/"sp "/"species"/"spp."/"spp "/"aff."/"aff "/"monst."/"? ") {
+      def value
+        text_value.strip
+      end
+      def apply(sp)
+        ''
+      end
+      def canonical(sp)
+        ''
+      end
+      # Returns a one-entry hash mapping the start offset of this annotation
+      # to its token kind and end offset. Several alternatives of this rule
+      # ("nr ", "sp ", "aff ", "? ", ...) match a trailing space; that space
+      # is excluded from the reported end offset. The `sp` argument is not
+      # used by this alternative.
+      def pos(sp)
+        interval_end = text_value.end_with?(' ') ? interval.end - 1 : interval.end
+        {interval.begin => ['annotation_identification', interval_end]}
+      end
+      def details(sp)
+        {:annotation_identification => value, :ignored => sp.details}
+      end
+    }
+    /
+    ("cf."/"cf ") {
+      def value
+        text_value.strip
+      end
+      def apply(sp)
+        ' ' + value + ' ' + sp.value
+      end
+      def canonical(sp)
+        ' ' + sp.canonical
+      end
+      # Returns the position entry for this annotation merged with the
+      # positions reported by `sp` (via sp.pos). The "cf " alternative
+      # matches a trailing space; that space is excluded from the reported
+      # end offset of the annotation.
+      def pos(sp)
+        interval_end = text_value.end_with?(' ') ? interval.end - 1 : interval.end
+        {interval.begin => ['annotation_identification', interval_end]}.merge(sp.pos)
+      end
+      def details(sp)
+        {:annotation_identification => value, :species => sp.details}
+      end
+    }
   end
   rule species_word
     a:[0-9]+ "-"? b:latin_word {
       def value
@@ -1177,6 +1372,12 @@ grammar ScientificNameClean
       end
     }
     /
+    "o\'donelli" {
+      def value
+        "odonelli"
+      end
+    }
+    /
     a:valid_name_letter b:valid_name_letters {
       def value
         a.value + b.value
@@ -1191,9 +1392,9 @@ grammar ScientificNameClean
         text_value.split('').each do |l|
           l = 'ae' if l == 'æ'
           l = 'oe' if l == 'œ'
-          # not sure if we should normalize ë as well. It is legal in botanical code, but it
-          # might be beneficial to normalize it for the reconsiliation purposes
-          # l = 'e' if l == 'ë'
+          # We normalize ë as well. It is legal in botanical code, but it
+          # is beneficial to normalize it for reconciliation purposes
+          l = 'e' if l == 'ë'
           res << l
         end
         res
@@ -1207,6 +1408,7 @@ grammar ScientificNameClean
         res = text_value
         res = 'ae' if res == 'æ'
         res = 'oe' if res == 'œ'
+        res = 'e'  if res == 'ë'
         res
       end
     }
@@ -1224,7 +1426,7 @@ grammar ScientificNameClean
       def value
       'Oe'
       end
-    }
+    }
   end
   rule year
@@ -1232,14 +1434,14 @@ grammar ScientificNameClean
       def value
         a.value
       end
       def pos
         a.pos
       end
       def details
         a.details
-      end
+      end
     }
     /
     year_number_with_character
@@ -1262,31 +1464,31 @@ grammar ScientificNameClean
       end
     }
   end
   rule year_number
-    [12] [7890] [0-9] [0-9]? [\?]? {
+    [12] [7890] [0-9] ([0-9] [\?]?/"?") {
       def value
         text_value
       end
       def pos
         {interval.begin => ['year', interval.end]}
       end
       def details
         {:year => value}
       end
     }
   end
   rule left_paren
     "("
   end
   rule right_paren
     ")"
   end
   rule hybrid_character
     ("x"/"X") {
       def value
@@ -1296,7 +1498,7 @@ grammar ScientificNameClean
     /
     multiplication_sign
   end
   rule multiplication_sign
     ("×"/"*") {
       def value
@@ -1304,7 +1506,7 @@ grammar ScientificNameClean
       end
     }
   end
   rule space
     [\s]*
   end
@@ -1312,5 +1514,5 @@ grammar ScientificNameClean
   rule space_hard
     [\s]+
   end
 end