RubyGems - biodiversity - Versions diffs - 3.5.1 → 4.0.0 - Mend

biodiversity 3.5.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

checksums.yaml +5 -5
data/.gitignore +1 -0
data/.rubocop.yml +9 -6
data/.ruby-version +1 -1
data/.travis.yml +1 -6
data/CHANGELOG +3 -0
data/Gemfile +2 -0
data/README.md +37 -178
data/Rakefile +15 -48
data/biodiversity.gemspec +18 -21
data/clib/linux/libgnparser.h +93 -0
data/clib/linux/libgnparser.so +0 -0
data/clib/mac/libgnparser.h +93 -0
data/clib/mac/libgnparser.so +0 -0
data/lib/biodiversity.rb +4 -9
data/lib/biodiversity/parser.rb +65 -281
data/lib/biodiversity/version.rb +8 -1
data/spec/lib/biodiversity_spec.rb +9 -0
data/spec/lib/parser_spec.rb +38 -0
data/spec/spec_helper.rb +4 -81
metadata +27 -102
data/.byebug_history +0 -18
data/.document +0 -5
data/examples/socket_client.rb +0 -25
data/lib/biodiversity/guid.rb +0 -1
data/lib/biodiversity/guid/lsid.rb +0 -16
data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
data/spec/biodiversity_spec.rb +0 -11
data/spec/files/test_data.txt +0 -490
data/spec/files/todo.txt +0 -55
data/spec/guid/lsid.spec.rb +0 -15
data/spec/parser/scientific_name_canonical_spec.rb +0 -36
data/spec/parser/scientific_name_clean_spec.rb +0 -1137
data/spec/parser/scientific_name_dirty_spec.rb +0 -165
data/spec/parser/scientific_name_spec.rb +0 -193

data/clib/linux/libgnparser.h ADDED Viewed

@@ -0,0 +1,93 @@
+/* Code generated by cmd/cgo; DO NOT EDIT. */
+/* package gitlab.com/gogna/gnparser/binding */
+#line 1 "cgo-builtin-export-prolog"
+#include <stddef.h> /* for ptrdiff_t below */
+#ifndef GO_CGO_EXPORT_PROLOGUE_H
+#define GO_CGO_EXPORT_PROLOGUE_H
+#ifndef GO_CGO_GOSTRING_TYPEDEF
+typedef struct { const char *p; ptrdiff_t n; } _GoString_;
+#endif
+#endif
+/* Start of preamble from import "C" comments.  */
+#line 3 "main.go"
+	#include "stdlib.h"
+#line 1 "cgo-generated-wrapper"
+/* End of preamble from import "C" comments.  */
+/* Start of boilerplate cgo prologue.  */
+#line 1 "cgo-gcc-export-header-prolog"
+#ifndef GO_CGO_PROLOGUE_H
+#define GO_CGO_PROLOGUE_H
+typedef signed char GoInt8;
+typedef unsigned char GoUint8;
+typedef short GoInt16;
+typedef unsigned short GoUint16;
+typedef int GoInt32;
+typedef unsigned int GoUint32;
+typedef long long GoInt64;
+typedef unsigned long long GoUint64;
+typedef GoInt64 GoInt;
+typedef GoUint64 GoUint;
+typedef __SIZE_TYPE__ GoUintptr;
+typedef float GoFloat32;
+typedef double GoFloat64;
+typedef float _Complex GoComplex64;
+typedef double _Complex GoComplex128;
+/*
+  static assertion to make sure the file is being used on architecture
+  at least with matching size of GoInt.
+*/
+typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
+#ifndef GO_CGO_GOSTRING_TYPEDEF
+typedef _GoString_ GoString;
+#endif
+typedef void *GoMap;
+typedef void *GoChan;
+typedef struct { void *t; void *v; } GoInterface;
+typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
+#endif
+/* End of boilerplate cgo prologue.  */
+#ifdef __cplusplus
+extern "C" {
+#endif
+// ParseToString function takes a name-string, desired format, and parses
+// the name-string to either JSON, or pipe-separated values, depending on
+// the desired format. Format can take values of 'simple', 'compact', 'pretty'.
+extern char* ParseToString(char* p0, char* p1);
+// ParseAryToStrings function takes an array of names, parsing format and a
+// reference to an output: an empty array of strings to return the data
+// back. It populates the output array with raw strings of either JSON or
+// pipe-separated parsed values (depending on a given format). Format can take
+// values of 'simple', 'compact', or 'pretty'.
+extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
+#ifdef __cplusplus
+}
+#endif

data/clib/linux/libgnparser.so ADDED Viewed

Binary file

data/clib/mac/libgnparser.h ADDED Viewed

@@ -0,0 +1,93 @@
+/* Code generated by cmd/cgo; DO NOT EDIT. */
+/* package gitlab.com/gogna/gnparser/binding */
+#line 1 "cgo-builtin-export-prolog"
+#include <stddef.h> /* for ptrdiff_t below */
+#ifndef GO_CGO_EXPORT_PROLOGUE_H
+#define GO_CGO_EXPORT_PROLOGUE_H
+#ifndef GO_CGO_GOSTRING_TYPEDEF
+typedef struct { const char *p; ptrdiff_t n; } _GoString_;
+#endif
+#endif
+/* Start of preamble from import "C" comments.  */
+#line 3 "main.go"
+	#include "stdlib.h"
+#line 1 "cgo-generated-wrapper"
+/* End of preamble from import "C" comments.  */
+/* Start of boilerplate cgo prologue.  */
+#line 1 "cgo-gcc-export-header-prolog"
+#ifndef GO_CGO_PROLOGUE_H
+#define GO_CGO_PROLOGUE_H
+typedef signed char GoInt8;
+typedef unsigned char GoUint8;
+typedef short GoInt16;
+typedef unsigned short GoUint16;
+typedef int GoInt32;
+typedef unsigned int GoUint32;
+typedef long long GoInt64;
+typedef unsigned long long GoUint64;
+typedef GoInt64 GoInt;
+typedef GoUint64 GoUint;
+typedef __SIZE_TYPE__ GoUintptr;
+typedef float GoFloat32;
+typedef double GoFloat64;
+typedef float _Complex GoComplex64;
+typedef double _Complex GoComplex128;
+/*
+  static assertion to make sure the file is being used on architecture
+  at least with matching size of GoInt.
+*/
+typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
+#ifndef GO_CGO_GOSTRING_TYPEDEF
+typedef _GoString_ GoString;
+#endif
+typedef void *GoMap;
+typedef void *GoChan;
+typedef struct { void *t; void *v; } GoInterface;
+typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
+#endif
+/* End of boilerplate cgo prologue.  */
+#ifdef __cplusplus
+extern "C" {
+#endif
+// ParseToString function takes a name-string, desired format, and parses
+// the name-string to either JSON, or pipe-separated values, depending on
+// the desired format. Format can take values of 'simple', 'compact', 'pretty'.
+extern char* ParseToString(char* p0, char* p1);
+// ParseAryToStrings function takes an array of names, parsing format and a
+// reference to an output: an empty array of strings to return the data
+// back. It populates the output array with raw strings of either JSON or
+// pipe-separated parsed values (depending on a given format). Format can take
+// values of 'simple', 'compact', or 'pretty'.
+extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
+#ifdef __cplusplus
+}
+#endif

data/clib/mac/libgnparser.so ADDED Viewed

Binary file

data/lib/biodiversity.rb CHANGED Viewed

@@ -1,15 +1,10 @@
-require 'treetop'
+# frozen_string_literal: true
+require 'ffi'
 require 'json'
-require 'open-uri'
 require_relative 'biodiversity/version'
 require_relative 'biodiversity/parser'
-require_relative 'biodiversity/guid'
+# Biodiversity module provides a namespace for scientific name parser.
 module Biodiversity
-  LSID_RESOLVER_URL = 'http://lsid.tdwg.org/'
-  def self.version
-    VERSION
-  end
 end

data/lib/biodiversity/parser.rb CHANGED Viewed

@@ -1,294 +1,78 @@
-# encoding: UTF-8
-require "gn_uuid"
-require_relative "parser/scientific_name_clean"
-require_relative "parser/scientific_name_dirty"
-require_relative "parser/scientific_name_canonical"
+# frozen_string_literal: true
-module PreProcessor
-  NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
-  TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
-  TAXON_CONCEPTS2 = /\s+
-                     (\(?s\.\s?s\.|
-                     \(?s\.\s?l\.|
-                     \(?s\.\s?str\.|
-                     \(?s\.\s?lat\.|
-                    sec\.|sec|near)\b.*$/x
-  TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
-  NOMEN_CONCEPTS  = /(,\s*|\s+)(\(?nomen\b|\(?nom\.|\(?comb\.).*$/i
-  LAST_WORD_JUNK  = /(,\s*|\s+)
-                    (spp\.|spp|var\.|
-                     var|von|van|ined\.|
-                     ined|sensu|new|non|nec|
-                     nudum|cf\.|cf|sp\.|sp|
-                     ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
-  def self.clean(a_string)
-    orig = a_string
-    [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
-     TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
-      a_string = a_string.gsub(i, "")
-    end
-    tail = orig[a_string.size..-1]
-    a_string = a_string.tr("ſ","s") #old "s"
-    a_string = a_string.tr("_", " ") if a_string.strip.match(/\s/).nil?
-    [a_string, tail.strip]
-  end
-end
-# Public: Parser which runs in parallel.
-#
-# Examples
-#
-# parser = ParallelParser.new(4)
-# parser.parse(["Betula L.", "Pardosa moesta"])
-class ParallelParser
-  # Public: Initialize ParallelParser.
-  #
-  # processes_num - an Integer to setup the number of processes (default: nil).
-  #                 If processes number is not set it will be determined
-  #                 automatically.
-  def initialize(processes_num = nil)
-    require "parallel"
-    cpu_num
-    if processes_num.to_i > 0
-      @processes_num = [processes_num, cpu_num - 1].min
-    else
-      @processes_num = cpu_num > 3 ? cpu_num - 2 : 1
-    end
-  end
-  # Public: Parses an array of scientific names using several processes
-  # in parallel.
-  #
-  # Scientific names are deduplicated in the process, so every string is
-  # parsed only once.
-  #
-  # names_list - takes an Array of scientific names,
-  #              each element should be a String.
-  #
-  # Examples
-  #
-  # parser = ParallelParser.new(4)
-  # parser.parse(["Homo sapiens L.", "Quercus quercus"])
-  #
-  # Returns a Hash with scientific names as a key, and parsing results as
-  # a value.
-  def parse(names_list)
-    parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
-      [n, parse_process(n)]
-    end
-    parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
-  end
-  # Public: Returns the number of cores/CPUs.
-  #
-  # Returns Integer of cores/CPUs.
-  def cpu_num
-    @cpu_num ||= Parallel.processor_count
-  end
-  private
-  def parse_process(name)
-    p = ScientificNameParser.new
-    p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
-  end
+# CLib is required to free memory after it is used by C
+module CLib
+  extend FFI::Library
+  ffi_lib FFI::Library::LIBC
+  attach_function :free, [:pointer], :void
 end
-# we can use these expressions when we are ready to parse virus names
-# class VirusParser
-#   def initialize
-#     @order     = /^\s*[A-Z][a-z]\+virales/i
-#     @family    = /^\s*[A-Z][a-z]\+viridae|viroidae/i
-#     @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
-#     @genus     = /^\s*[A-Z][a-z]\+virus|viroid/i
-#     @species   = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
-#                   viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
-#     @parsed    = nil
-#   end
-# end
-class ScientificNameParser
-  FAILED_RESULT = ->(name) do
-    { scientificName:
-      { id: GnUUID.uuid(name), parsed: false, verbatim: name,
-        error: "Parser internal error" }
-    }
-  end
-  def self.add_rank_to_canonical(parsed)
-    return parsed if parsed[:scientificName][:hybrid]
-    name = parsed[:scientificName]
-    parts = name[:canonical].split(" ")
-    name_ary = parts[0..1]
-    name[:details][0][:infraspecies].each do |data|
-      infrasp = data[:string]
-      rank = data[:rank]
-      name_ary << (rank && rank != "n/a" ? "#{rank} #{infrasp}" : infrasp)
+module Biodiversity
+  # Parser provides a namespace for functions to parse scientific names.
+  module Parser
+    extend FFI::Library
+    platform = case Gem.platforms[1].os
+               when 'linux'
+                 'linux'
+               when 'darwin'
+                 'mac'
+               when 'mswin64'
+                 'win'
+               else
+                 raise "Unsupported platform: #{Gem.platforms[1].os}"
+               end
+    ffi_lib File.join(__dir__, '..', '..', 'clib', platform, 'libgnparser.so')
+    POINTER_SIZE = FFI.type_size(:pointer)
+    attach_function(:parse_go, :ParseToString, %i[string string], :string)
+    attach_function(:parse_ary_go, :ParseAryToStrings,
+                    %i[pointer int string pointer], :void)
+    def self.parse(name, simple = false)
+      format = simple ? 'simple' : 'compact'
+      parsed = parse_go(name, format)
+      output(parsed, simple)
     end
-    parsed[:scientificName][:canonical] = name_ary.join(" ")
-    parsed
-  end
-  def self.version
-    Biodiversity::VERSION
-  end
-  def self.fix_case(name_string)
-    name_ary = name_string.split(/\s+/)
-    words_num = name_ary.size
-    res = nil
-    if words_num == 1
-      res = name_ary[0].gsub(/[\(\)\{\}]/, "")
-      if res.size > 1
-        res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
-      else
-        res = nil
+    def self.parse_ary(ary, simple = false)
+      format = simple ? 'simple' : 'compact'
+      in_ptr = FFI::MemoryPointer.new(:pointer, ary.length)
+      in_ptr.write_array_of_pointer(
+        ary.map { |s| FFI::MemoryPointer.from_string(s) }
+      )
+      out_var = FFI::MemoryPointer.new(:pointer)
+      parse_ary_go(in_ptr, ary.length, format, out_var)
+      out_var.read_pointer
+             .get_array_of_string(0, ary.length)
+             .each_with_object([]) do |prsd, a|
+        a << output(prsd, simple)
       end
-    else
-      if name_ary[0].size > 1
-        word1 = UnicodeUtils.upcase(name_ary[0][0]) +
-          UnicodeUtils.downcase(name_ary[0][1..-1])
-      else
-        word1 = name_ary[0]
+    ensure
+      out_var.read_pointer.get_array_of_pointer(0, ary.length).each do |p|
+        CLib.free(p)
       end
-      if name_ary[1].match(/^\(/)
-        word2 = name_ary[1].gsub(/\)$/, "") + ")"
-        word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
-          UnicodeUtils.downcase(word2[2..-1])
-      else
-        word2 = UnicodeUtils.downcase(name_ary[1])
-      end
-      res = word1 + " " +
-        word2 + " " +
-        name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
-      res.strip!
+      CLib.free(out_var.read_pointer)
     end
-    res
-  end
-  def initialize(opts = {})
-    @canonical_with_rank = !!opts[:canonical_with_rank]
-    @verbatim = ""
-    @clean = ScientificNameCleanParser.new
-    @dirty = ScientificNameDirtyParser.new
-    @canonical = ScientificNameCanonicalParser.new
-    @parsed = nil
-    @tail = nil
-  end
-  def virus?(a_string)
-    !!(a_string.match(/\sICTV\s*$/) ||
-       a_string.match(/\b(virus|viruses|particle|particles|
-                          phage|phages|viroid|viroids|virophage|
-                          prion|prions|NPV)\b/ix) ||
-       a_string.match(/[A-Z]?[a-z]+virus\b/) ||
-       a_string.match(/\b[A-Za-z]*(satellite[s]?|NPV)\b/))
-  end
-  def noparse?(a_string)
-    incertae_sedis = a_string.match(/incertae\s+sedis/i) ||
-      a_string.match(/inc\.\s*sed\./i)
-    rna = a_string.match(/[^A-Z]RNA[^A-Z]*/)
-    incertae_sedis || rna
-  end
-  def parsed
-    @parsed
-  end
-  def parse(a_string)
-    @verbatim = a_string
-    a_string, @tail = PreProcessor::clean(a_string)
-    if virus?(a_string)
-      @parsed = { verbatim: @verbatim, virus: true }
-    elsif noparse?(a_string)
-      @parsed = { verbatim: @verbatim }
-    else
-      begin
-        @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
-        unless @parsed
-          index = @dirty.index || @clean.index
-          salvage_match = a_string[0..index].split(/\s+/)[0..-2]
-          salvage_string = salvage_match ? salvage_match.join(" ") : a_string
-          @parsed =  @dirty.parse(salvage_string) ||
-                     @canonical.parse(a_string) ||
-                     { verbatim: @verbatim }
-        end
-      rescue
-        @parsed = FAILED_RESULT.(@verbatim)
-      end
-    end
-    def @parsed.verbatim=(a_string)
-      @verbatim = a_string
-      @id = GnUUID.uuid(@verbatim)
-    end
-    def @parsed.all(opts = {})
-      canonical_with_rank = !!opts[:canonical_with_rank]
-      parsed = self.class != Hash
-      res = { id: @id, parsed: parsed,
-              parser_version: ScientificNameParser::version}
-      if parsed
-        hybrid = self.hybrid rescue false
-        res.merge!({
-          verbatim: @verbatim,
-          normalized: self.value,
-          canonical: self.canonical,
-          hybrid: hybrid,
-          details: self.details,
-          parser_run: self.parser_run,
-          positions: self.pos
-          })
+    def self.output(parsed, simple)
+      if simple
+        parsed = parsed.split('|')
+        {
+          id: parsed[0],
+          verbatim: parsed[1],
+          canonicalName: {
+            full: parsed[2],
+            simple: parsed[3],
+            stem: parsed[4]
+          },
+          authorship: parsed[5],
+          quality: parsed[6]
+        }
       else
-        res.merge!(self)
-      end
-      res[:surrogate] = true if ScientificNameParser.surrogate?(res)
-      res = {:scientificName => res}
-      if (canonical_with_rank &&
-          canonical.count(" ") > 1 &&
-          res[:scientificName][:details][0][:infraspecies])
-        ScientificNameParser.add_rank_to_canonical(res)
+        JSON.parse(parsed, symbolize_names: true)
       end
-      res
-    end
-    def @parsed.pos_json
-      self.pos.to_json rescue ""
     end
-    def @parsed.all_json
-      self.all.to_json rescue ""
-    end
-    @parsed.verbatim = @verbatim
-    res = @parsed.all(canonical_with_rank: @canonical_with_rank)
-    res[:scientificName].merge!(tail: @tail) if @tail && @tail != ""
-    res
   end
-  private
-  def self.surrogate?(parsed_data)
-    return false unless parsed_data[:parsed]
-    name = parsed_data[:verbatim]
-    pos = parsed_data[:positions].to_a.flatten
-    surrogate1 = /BOLD:|[\d]{5,}/i
-    surrogate2 = /\b(spp|sp|nr|cf)[\.]?[\s]*$/i
-    is_surrogate = false
-    ai_index = pos.index("annotation_identification")
-    if ai_index
-      ai = name[pos[ai_index - 1]..pos[ai_index + 1]]
-      is_surrogate = true if ai.match(/^(spp|cf|sp|nr)/)
-    end
-    is_surrogate = true if !is_surrogate && (name.match(surrogate1) ||
-                     name.match(surrogate2))
-    is_surrogate
-  end
-end
+end