biodiversity 1.0.10 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rvmrc +1 -1
- data/.travis.yml +7 -0
- data/CHANGELOG +42 -0
- data/Gemfile +8 -6
- data/Gemfile.lock +33 -33
- data/README.md +167 -0
- data/Rakefile +16 -11
- data/VERSION +1 -1
- data/bin/parserver +33 -44
- data/lib/biodiversity/parser.rb +160 -33
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +4 -2
- data/lib/biodiversity/parser/scientific_name_clean.treetop +479 -277
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +11 -16
- data/spec/parser/scientific_name.spec.rb +63 -7
- data/spec/parser/scientific_name_clean.spec.rb +76 -24
- data/spec/parser/scientific_name_dirty.spec.rb +4 -6
- data/spec/parser/test_data.txt +132 -41
- data/spec/parser/todo.txt +27 -0
- metadata +153 -119
- data/README.rdoc +0 -99
    
        data/.rvmrc
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            rvm use ruby-1.9. | 
| 1 | 
            +
            rvm use ruby-1.9.3-p392@biodiversity --create
         | 
    
        data/.travis.yml
    ADDED
    
    
    
        data/CHANGELOG
    ADDED
    
    | @@ -0,0 +1,42 @@ | |
| 1 | 
            +
            3.0.0 -- removing support for ruby 1.8.7, making biodiversity gem be the same
         | 
| 2 | 
            +
            as biodiversity19, deprecating biodiversity19. A few newly discovered bugs
         | 
| 3 | 
            +
            are fixed.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            2.1.0 -- added ScientificNameParser.version method
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            2.0.0 -- backward incompatible change in parserver, therefore new major number.
         | 
| 8 | 
            +
            In parserver removed option --output=canonical_with_rank, instead added -r
         | 
| 9 | 
            +
            option which allows to have canonical with rank with either json or canonical
         | 
| 10 | 
            +
            outputs
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            1.2.0 -- changed method invocation signature ScientificNameParser.new
         | 
| 13 | 
            +
            Now it can take options
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            1.1.3 -- added 'fo' as rank
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            1.1.2  -- static method for fixing all-caps canonical names, fixing caps
         | 
| 18 | 
            +
            for authors words, ampersand instead of 'et' in normalization
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            1.1.1  -- more multi-uninomials cases, expanded viruses detection, added
         | 
| 21 | 
            +
            abbreviated genera and a few small fixes
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            1.1.0  -- added multi-uninomials, fixes in identification annotations (aff.,
         | 
| 24 | 
            +
            cf., sp. etc), bug fixes, more robust salvage mode
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            1.0.17 -- fixed a bug which prevented all diacritics from being converted correctly
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            1.0.16 -- dirty mode now converts ë to e
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            1.0.15 -- additional rules added for names ending with ssp. sp sp. and cf.
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            1.0.14 -- canonical forms had allowed ë as a character until now. After this
         | 
| 33 | 
            +
            version the only utf-8 character allowed in canonical forms should be the
         | 
| 34 | 
            +
            multiplication sign for hybrids.
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            1.0.13 -- canonical forms for cf. aff. qualifiers are modified: canonical for
         | 
| 37 | 
            +
            'Aus cf. bus' is now 'Aus bus'; canonical for 'Aus aff. bus' is now 'Aus'.
         | 
| 38 | 
            +
            Ranks at the end of the name like 'var', 'ssp', 'spp' are considered junk and
         | 
| 39 | 
            +
            are ignored
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            1.0.12 -- bug is fixed which prevented 'Cucurbita pepo' from being parsed correctly,
         | 
| 42 | 
            +
            f., forma, fr. are now treated as any other ranks.
         | 
    
        data/Gemfile
    CHANGED
    
    | @@ -1,13 +1,15 @@ | |
| 1 | 
            -
            source  | 
| 1 | 
            +
            source 'https://rubygems.org'
         | 
| 2 2 |  | 
| 3 | 
            -
            gem  | 
| 4 | 
            -
            gem  | 
| 3 | 
            +
            gem 'rake',           '~> 10.0'
         | 
| 4 | 
            +
            gem 'treetop',        '~> 1.4'
         | 
| 5 | 
            +
            gem 'parallel',       '~> 0.6'
         | 
| 6 | 
            +
            gem 'unicode_utils',  '~> 1.4'
         | 
| 5 7 |  | 
| 6 8 | 
             
            group :development do
         | 
| 7 | 
            -
              gem  | 
| 9 | 
            +
              gem 'debugger',     '~> 1.5'
         | 
| 10 | 
            +
              gem 'jeweler',      '~> 1.8'
         | 
| 8 11 | 
             
            end
         | 
| 9 12 |  | 
| 10 13 | 
             
            group :test do
         | 
| 11 | 
            -
              gem  | 
| 12 | 
            -
              gem "rspec"
         | 
| 14 | 
            +
              gem 'rspec',        '~> 2.13'
         | 
| 13 15 | 
             
            end
         | 
    
        data/Gemfile.lock
    CHANGED
    
    | @@ -1,47 +1,47 @@ | |
| 1 1 | 
             
            GEM
         | 
| 2 | 
            -
              remote:  | 
| 2 | 
            +
              remote: https://rubygems.org/
         | 
| 3 3 | 
             
              specs:
         | 
| 4 | 
            -
                 | 
| 5 | 
            -
                 | 
| 6 | 
            -
             | 
| 4 | 
            +
                columnize (0.3.6)
         | 
| 5 | 
            +
                debugger (1.5.0)
         | 
| 6 | 
            +
                  columnize (>= 0.3.1)
         | 
| 7 | 
            +
                  debugger-linecache (~> 1.2.0)
         | 
| 8 | 
            +
                  debugger-ruby_core_source (~> 1.2.0)
         | 
| 9 | 
            +
                debugger-linecache (1.2.0)
         | 
| 10 | 
            +
                debugger-ruby_core_source (1.2.0)
         | 
| 11 | 
            +
                diff-lcs (1.2.1)
         | 
| 7 12 | 
             
                git (1.2.5)
         | 
| 8 | 
            -
                jeweler (1. | 
| 13 | 
            +
                jeweler (1.8.4)
         | 
| 9 14 | 
             
                  bundler (~> 1.0)
         | 
| 10 15 | 
             
                  git (>= 1.2.5)
         | 
| 11 16 | 
             
                  rake
         | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
                parallel (0. | 
| 17 | 
            +
                  rdoc
         | 
| 18 | 
            +
                json (1.7.7)
         | 
| 19 | 
            +
                parallel (0.6.2)
         | 
| 15 20 | 
             
                polyglot (0.3.3)
         | 
| 16 | 
            -
                rake (0. | 
| 17 | 
            -
                 | 
| 18 | 
            -
                   | 
| 19 | 
            -
             | 
| 20 | 
            -
                  rspec- | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
                rspec- | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
                  ruby_core_source (>= 0.1.4)
         | 
| 29 | 
            -
                ruby-debug19 (0.11.6)
         | 
| 30 | 
            -
                  columnize (>= 0.3.1)
         | 
| 31 | 
            -
                  linecache19 (>= 0.5.11)
         | 
| 32 | 
            -
                  ruby-debug-base19 (>= 0.11.19)
         | 
| 33 | 
            -
                ruby_core_source (0.1.5)
         | 
| 34 | 
            -
                  archive-tar-minitar (>= 0.5.2)
         | 
| 35 | 
            -
                treetop (1.4.10)
         | 
| 21 | 
            +
                rake (10.0.3)
         | 
| 22 | 
            +
                rdoc (4.0.0)
         | 
| 23 | 
            +
                  json (~> 1.4)
         | 
| 24 | 
            +
                rspec (2.13.0)
         | 
| 25 | 
            +
                  rspec-core (~> 2.13.0)
         | 
| 26 | 
            +
                  rspec-expectations (~> 2.13.0)
         | 
| 27 | 
            +
                  rspec-mocks (~> 2.13.0)
         | 
| 28 | 
            +
                rspec-core (2.13.1)
         | 
| 29 | 
            +
                rspec-expectations (2.13.0)
         | 
| 30 | 
            +
                  diff-lcs (>= 1.1.3, < 2.0)
         | 
| 31 | 
            +
                rspec-mocks (2.13.0)
         | 
| 32 | 
            +
                treetop (1.4.12)
         | 
| 36 33 | 
             
                  polyglot
         | 
| 37 34 | 
             
                  polyglot (>= 0.3.1)
         | 
| 35 | 
            +
                unicode_utils (1.4.0)
         | 
| 38 36 |  | 
| 39 37 | 
             
            PLATFORMS
         | 
| 40 38 | 
             
              ruby
         | 
| 41 39 |  | 
| 42 40 | 
             
            DEPENDENCIES
         | 
| 43 | 
            -
               | 
| 44 | 
            -
               | 
| 45 | 
            -
               | 
| 46 | 
            -
               | 
| 47 | 
            -
               | 
| 41 | 
            +
              debugger (~> 1.5)
         | 
| 42 | 
            +
              jeweler (~> 1.8)
         | 
| 43 | 
            +
              parallel (~> 0.6)
         | 
| 44 | 
            +
              rake (~> 10.0)
         | 
| 45 | 
            +
              rspec (~> 2.13)
         | 
| 46 | 
            +
              treetop (~> 1.4)
         | 
| 47 | 
            +
              unicode_utils (~> 1.4)
         | 
    
        data/README.md
    ADDED
    
    | @@ -0,0 +1,167 @@ | |
| 1 | 
            +
            Biodiversity
         | 
| 2 | 
            +
            ============
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            [![Gem Version][1]][2]
         | 
| 5 | 
            +
            [![Continuous Integration Status][3]][4]
         | 
| 6 | 
            +
            [![CodePolice][5]][6]
         | 
| 7 | 
            +
            [![Dependency Status][7]][8]
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            Parses taxonomic scientific name and breaks it into semantic elements.
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            *WARNING, IMPORTANT!:*
         | 
| 12 | 
            +
            Support for Ruby 1.8.7 IS DROPPED. Both biodiversity and
         | 
| 13 | 
            +
            biodiversity19 will be for Ruby > 1.9.1 and will be identical gems.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            biodiversity19 is now deprecated and will be phased out in a couple of years.
         | 
| 16 | 
            +
            You are strongly encouraged to change your dependencies from
         | 
| 17 | 
            +
            biodiversity19 to biodiversity
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            Installation
         | 
| 20 | 
            +
            ------------
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                sudo gem install biodiversity
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            Example usage
         | 
| 25 | 
            +
            -------------
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            ### As a command line script
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            You can parse a file with taxonomic names from the command line.
         | 
| 30 | 
            +
            File should contain one scientific name per line
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                nnparse file_with_names
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            The results will be put into the parsed.json file in the current directory.
         | 
| 35 | 
            +
            To save results into a different file:
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                nnparse file_with_names output_file
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            ### As a socket server
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            If you do not use Ruby and need fast access to the parser functionality
         | 
| 42 | 
            +
            you can use a socket server
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                parserver
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                parserver -h
         | 
| 47 | 
            +
                Usage: parserver [options]
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                    -r, --canonical_with_rank        Adds infraspecies rank to canonical forms
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                    -o, --output=output              Specifies the type of the output:
         | 
| 52 | 
            +
                    json - parsed results in json
         | 
| 53 | 
            +
                    canonical - canonical form only
         | 
| 54 | 
            +
                                                     Default: json
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    -p, --port=port                  Specifies the port number
         | 
| 57 | 
            +
                                                     Default: 4334
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                    -h, --help                       Show this help message.
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                parserver --output=canonical
         | 
| 62 | 
            +
             | 
| 63 | 
            +
             | 
| 64 | 
            +
             | 
| 65 | 
            +
            With default settings you can access parserver via port 4334 using a
         | 
| 66 | 
            +
            socket client library of your programming language.  You can find
         | 
| 67 | 
            +
            [socket client script example][9] in the examples directory of the gem.
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            If you want to check if socket server works for you:
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                #run server in one terminal
         | 
| 72 | 
            +
                parserver
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                #in another terminal window type
         | 
| 75 | 
            +
                telnet localhost 4334
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            If you enter a line with a scientific name -- server will send you back
         | 
| 78 | 
            +
            parsed information in json format.
         | 
| 79 | 
            +
             | 
| 80 | 
            +
            To stop telnet client type any of `end`,`exit`,`q`, `.` instead
         | 
| 81 | 
            +
            of scientific name
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                $ telnet localhost 4334
         | 
| 84 | 
            +
                Trying ::1...
         | 
| 85 | 
            +
                Connected to localhost.
         | 
| 86 | 
            +
                Escape character is '^]'.
         | 
| 87 | 
            +
                Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
         | 
| 88 | 
            +
                {"scientificName":{"canonical":"Acacia abyssinica calophylla"...}}
         | 
| 89 | 
            +
                end
         | 
| 90 | 
            +
             | 
| 91 | 
            +
            ### As a library
         | 
| 92 | 
            +
             | 
| 93 | 
            +
            You can use it as a library in Ruby, JRuby etc.
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                require 'biodiversity'
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                parser = ScientificNameParser.new
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                #to find version number
         | 
| 100 | 
            +
                ScientificNameParser.version
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                # to fix capitalization in canonicals
         | 
| 103 | 
            +
                ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA")
         | 
| 104 | 
            +
                # Output: Quercus (Quercus) alba
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                # to parse a scientific name into a ruby hash
         | 
| 107 | 
            +
                parser.parse("Plantago major")
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                #to get json representation
         | 
| 110 | 
            +
                parser.parse("Plantago").to_json
         | 
| 111 | 
            +
                #or
         | 
| 112 | 
            +
                parser.parse("Plantago")
         | 
| 113 | 
            +
                parser.all_json
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                # to clean name up
         | 
| 116 | 
            +
                parser.parse("      Plantago       major    ")[:scientificName][:normalized]
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                # to get only cleaned up latin part of the name
         | 
| 119 | 
            +
                parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:canonical]
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                # to get detailed information about elements of the name
         | 
| 122 | 
            +
                parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
         | 
| 123 | 
            +
             | 
| 124 | 
            +
            Returned result is not always linear, if name is complex. To get simple linear
         | 
| 125 | 
            +
            representation of the name you can use:
         | 
| 126 | 
            +
             | 
| 127 | 
            +
                parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:position]
         | 
| 128 | 
            +
                # returns {0=>["genus", 16], 17=>["species", 26],
         | 
| 129 | 
            +
                # 28=>["author_word", 32], 33=>["author_word", 40],
         | 
| 130 | 
            +
                # 42=>["author_word", 44], 45=>["author_word", 50],
         | 
| 131 | 
            +
                # 53=>["author_word", 58], 59=>["year", 63]}
         | 
| 132 | 
            +
                # where the key is the char index of the start of
         | 
| 133 | 
            +
                # a word, first element of the value is a semantic meaning
         | 
| 134 | 
            +
                # of the word, second element of the value is the character index
         | 
| 135 | 
            +
                # of end of the word
         | 
| 136 | 
            +
             | 
| 137 | 
            +
            To parse using several CPUs (4 seem to be optimal)
         | 
| 138 | 
            +
             | 
| 139 | 
            +
                parser = ParallelParser.new
         | 
| 140 | 
            +
                # ParallelParser.new(4) will try to run 4 processes if hardware allows
         | 
| 141 | 
            +
                array_of_names = ["Betula alba", "Homo sapiens"....]
         | 
| 142 | 
            +
                parser.parse(array_of_names)
         | 
| 143 | 
            +
                # Output: {"Betula alba" => {:scientificName...}, "Homo sapiens" => {:scientificName...}, ...}
         | 
| 144 | 
            +
             | 
| 145 | 
            +
            The parallel parser takes a list of names and returns a hash with names as keys and parsed data as values.
         | 
| 146 | 
            +
             | 
| 147 | 
            +
            To get canonicals with ranks for infraspecific epithets:
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                parser = ScientificNameParser.new(canonical_with_rank: true)
         | 
| 150 | 
            +
                parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
         | 
| 151 | 
            +
                # Output: Cola cordifolia var. puberula
         | 
| 152 | 
            +
             | 
| 153 | 
            +
            To resolve lsid and get back RDF file
         | 
| 154 | 
            +
             | 
| 155 | 
            +
                LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
         | 
| 156 | 
            +
             | 
| 157 | 
            +
             | 
| 158 | 
            +
             | 
| 159 | 
            +
            [1]: https://badge.fury.io/rb/biodiversity19.png
         | 
| 160 | 
            +
            [2]: http://badge.fury.io/rb/biodiversity19
         | 
| 161 | 
            +
            [3]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.png
         | 
| 162 | 
            +
            [4]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
         | 
| 163 | 
            +
            [5]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.png
         | 
| 164 | 
            +
            [6]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
         | 
| 165 | 
            +
            [7]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.png
         | 
| 166 | 
            +
            [8]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
         | 
| 167 | 
            +
            [9]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/examples/socket_client.rb
         | 
    
        data/Rakefile
    CHANGED
    
    | @@ -20,35 +20,41 @@ ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i | |
| 20 20 | 
             
            begin
         | 
| 21 21 | 
             
              require 'jeweler'
         | 
| 22 22 | 
             
              Jeweler::Tasks.new do |gem|
         | 
| 23 | 
            -
                gem.name =  | 
| 23 | 
            +
                gem.name = 'biodiversity'
         | 
| 24 | 
            +
                #To delete ruby_version < 19 ? 'biodiversity' : 'biodiversity19'
         | 
| 24 25 | 
             
                gem.summary = 'Parser of scientific names'
         | 
| 25 26 | 
             
                gem.description = 'Tools for biodiversity informatics'
         | 
| 26 | 
            -
                gem.email =  | 
| 27 | 
            -
                gem.homepage =  | 
| 28 | 
            -
                gem.authors = [ | 
| 27 | 
            +
                gem.email = 'dmozzherin@gmail.com'
         | 
| 28 | 
            +
                gem.homepage = 'http://github.com/GlobalNamesArchitecture/biodiversity'
         | 
| 29 | 
            +
                gem.authors = ['Dmitry Mozzherin']
         | 
| 29 30 | 
             
                gem.has_rdoc = false
         | 
| 30 31 | 
             
                gem.bindir = 'bin'
         | 
| 31 32 | 
             
                gem.executables = ['nnparse', 'parserver']
         | 
| 32 33 | 
             
                gem.add_dependency('treetop')
         | 
| 33 34 | 
             
                gem.add_dependency('parallel')
         | 
| 34 | 
            -
                gem.add_dependency('json') if ruby_version < 19
         | 
| 35 | 
            +
                # gem.add_dependency('json') if ruby_version < 19
         | 
| 35 36 | 
             
                gem.add_development_dependency "rspec"
         | 
| 36 | 
            -
                # gem is a Gem::Specification... | 
| 37 | 
            +
                # gem is a Gem::Specification...
         | 
| 38 | 
            +
                # see http://www.rubygems.org/read/chapter/20 for additional settings
         | 
| 37 39 | 
             
              end
         | 
| 38 40 | 
             
            rescue LoadError
         | 
| 39 | 
            -
              puts  | 
| 41 | 
            +
              puts 'Jeweler (or a dependency) not available. ' +
         | 
| 42 | 
            +
                'Install it with: sudo gem install jeweler'
         | 
| 40 43 | 
             
            end
         | 
| 41 44 |  | 
| 42 45 | 
             
            task :tt do
         | 
| 43 | 
            -
              ['scientific_name_clean', | 
| 46 | 
            +
              ['scientific_name_clean',
         | 
| 47 | 
            +
               'scientific_name_dirty',
         | 
| 48 | 
            +
               'scientific_name_canonical'].each do |f|
         | 
| 44 49 | 
             
                file = "#{dir}/lib/biodiversity/parser/#{f}"
         | 
| 45 50 | 
             
                FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
         | 
| 46 51 | 
             
                system("tt #{file}.treetop")
         | 
| 47 52 | 
             
                rf = "#{file}.rb"
         | 
| 48 | 
            -
                rfn = open(rf +  | 
| 53 | 
            +
                rfn = open(rf + '.tmp', 'w')
         | 
| 49 54 | 
             
                skip_head = false
         | 
| 50 55 | 
             
                f = open(rf)
         | 
| 51 | 
            -
                #getting around a bug in treetop which prevents setting | 
| 56 | 
            +
                # getting around a bug in treetop which prevents setting
         | 
| 57 | 
            +
                # UTF-8 encoding in ruby19
         | 
| 52 58 | 
             
                f.each_with_index do |l, i|
         | 
| 53 59 | 
             
                  skip_head = l.match(/^# Autogenerated/) if i == 0
         | 
| 54 60 | 
             
                  if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
         | 
| @@ -63,4 +69,3 @@ task :tt do | |
| 63 69 | 
             
                `mv #{rf}.tmp #{rf}`
         | 
| 64 70 | 
             
              end
         | 
| 65 71 | 
             
            end
         | 
| 66 | 
            -
             | 
    
        data/VERSION
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
             | 
| 1 | 
            +
            3.0.0
         | 
    
        data/bin/parserver
    CHANGED
    
    | @@ -5,78 +5,67 @@ require 'socket' | |
| 5 5 | 
             
            require 'biodiversity'          # Get sockets from stdlib
         | 
| 6 6 |  | 
| 7 7 | 
             
            DEFAULT_PORT = 4334
         | 
| 8 | 
            -
            RUBY_VERSION_INT = RUBY_VERSION.split( | 
| 8 | 
            +
            RUBY_VERSION_INT = RUBY_VERSION.split('.')[0..1].join('').to_i
         | 
| 9 9 | 
             
            OPTIONS = {
         | 
| 10 | 
            -
              : | 
| 11 | 
            -
              : | 
| 10 | 
            +
              output: 'json',
         | 
| 11 | 
            +
              canonical_with_rank: false, 
         | 
| 12 | 
            +
              port: DEFAULT_PORT
         | 
| 12 13 | 
             
            }
         | 
| 13 14 |  | 
| 14 15 | 
             
            options = {}
         | 
| 15 16 | 
             
            ARGV.options do |opts|
         | 
| 16 17 | 
             
              script_name = File.basename($0)
         | 
| 17 | 
            -
              opts.banner = "Usage:  | 
| 18 | 
            +
              opts.banner = "Usage: #{script_name} [options]"
         | 
| 18 19 |  | 
| 19 | 
            -
              opts.separator  | 
| 20 | 
            +
              opts.separator ''
         | 
| 20 21 |  | 
| 21 | 
            -
              opts.on( | 
| 22 | 
            -
                       | 
| 22 | 
            +
              opts.on('-r', 
         | 
| 23 | 
            +
                      '--canonical_with_rank', 
         | 
| 24 | 
            +
                      'Adds infraspecies rank to canonical forms'
         | 
| 25 | 
            +
                      ) { |rank| options[:canonical_with_rank] = rank }
         | 
| 26 | 
            +
             | 
| 27 | 
            +
              opts.separator ''
         | 
| 28 | 
            +
             | 
| 29 | 
            +
              opts.on('-o', '--output=output', String,
         | 
| 30 | 
            +
                      'Specifies the type of the output:
         | 
| 23 31 | 
             
                json - parsed results in json
         | 
| 24 | 
            -
                canonical - canonical  | 
| 25 | 
            -
             | 
| 26 | 
            -
                      "Default: json") { |output| options[:output] = output }
         | 
| 32 | 
            +
                canonical - canonical form only',
         | 
| 33 | 
            +
                      'Default: json') { |output| options[:output] = output }
         | 
| 27 34 |  | 
| 28 | 
            -
              opts.separator  | 
| 35 | 
            +
              opts.separator ''
         | 
| 29 36 |  | 
| 30 | 
            -
              opts.on( | 
| 31 | 
            -
                       | 
| 37 | 
            +
              opts.on('-p', '--port=port', String,
         | 
| 38 | 
            +
                      'Specifies the port number',
         | 
| 32 39 | 
             
                      "Default: #{DEFAULT_PORT}") { |port| options[:port] = port }
         | 
| 33 40 |  | 
| 34 | 
            -
              opts.separator  | 
| 41 | 
            +
              opts.separator ''
         | 
| 35 42 |  | 
| 36 | 
            -
              opts.on( | 
| 37 | 
            -
                       | 
| 43 | 
            +
              opts.on('-h', '--help',
         | 
| 44 | 
            +
                      'Show this help message.') { puts opts; exit }
         | 
| 38 45 |  | 
| 39 46 | 
             
              opts.parse!
         | 
| 40 47 | 
             
            end
         | 
| 41 48 |  | 
| 42 | 
            -
            OPTIONS[:output] = options[:output] if ['canonical' | 
| 49 | 
            +
            OPTIONS[:output] = options[:output] if ['canonical'].include?(options[:output])
         | 
| 43 50 | 
             
            OPTIONS[:port] = options[:port].to_i if options[:port].to_i > 0
         | 
| 44 | 
            -
             | 
| 45 | 
            -
            def parser_error(name_string)
         | 
| 46 | 
            -
              {:scientificName => {:parsed => false, :verbatim => name_string,  :error => 'Parser error'}}
         | 
| 47 | 
            -
            end
         | 
| 51 | 
            +
            OPTIONS[:canonical_with_rank] = !!options[:canonical_with_rank]
         | 
| 48 52 |  | 
| 49 53 | 
             
            def get_output(name_string, parser)
         | 
| 50 54 | 
             
              begin
         | 
| 51 | 
            -
                if RUBY_VERSION_INT < 19
         | 
| 52 | 
            -
                  old_kcode = $KCODE
         | 
| 53 | 
            -
                  $KCODE = 'NONE'
         | 
| 54 | 
            -
                end
         | 
| 55 55 | 
             
                parsed = parser.parse(name_string)
         | 
| 56 | 
            -
                if RUBY_VERSION_INT < 19
         | 
| 57 | 
            -
                  $KCODE = old_kcode
         | 
| 58 | 
            -
                end
         | 
| 59 56 | 
             
              rescue
         | 
| 60 | 
            -
                parsed =  | 
| 57 | 
            +
                parsed = ScientificNameParser::FAILED_RESULT.(name_string)
         | 
| 61 58 | 
             
              end
         | 
| 62 59 | 
             
              output =  OPTIONS[:output]
         | 
| 63 60 | 
             
              return parsed.to_json if output == 'json'
         | 
| 64 | 
            -
               | 
| 65 | 
            -
              return canonical.to_s if output == 'canonical' || canonical == nil || parsed[:scientificName][:hybrid] || !parsed[:scientificName][:parsed]
         | 
| 66 | 
            -
              parts = parsed[:scientificName][:canonical].split(" ")
         | 
| 67 | 
            -
             | 
| 68 | 
            -
              if parts.size > 2 && parsed[:scientificName][:details][0][:infraspecies]
         | 
| 69 | 
            -
                name_ary = parts[0..1]
         | 
| 70 | 
            -
                parsed[:scientificName][:details][0][:infraspecies].each do |data|
         | 
| 71 | 
            -
                  name_ary << (data[:rank] && data[:rank] != 'n/a'? "#{data[:rank]} #{data[:string]}" : data[:string])
         | 
| 72 | 
            -
                end
         | 
| 73 | 
            -
                canonical = name_ary.join(" ")
         | 
| 74 | 
            -
              end
         | 
| 75 | 
            -
              canonical
         | 
| 61 | 
            +
              parsed[:scientificName][:canonical].to_s
         | 
| 76 62 | 
             
            end
         | 
| 77 63 |  | 
| 78 | 
            -
            puts "Running parser service on port  | 
| 79 | 
            -
             | 
| 64 | 
            +
            puts "Running parser service on port %s, output type is '%s'" % 
         | 
| 65 | 
            +
                 [OPTIONS[:port], OPTIONS[:output]]
         | 
| 66 | 
            +
            opts = {}
         | 
| 67 | 
            +
            opts = {canonical_with_rank: true} if OPTIONS[:canonical_with_rank]
         | 
| 68 | 
            +
            parser = ScientificNameParser.new(opts)
         | 
| 80 69 | 
             
            server = TCPServer.open(OPTIONS[:port])  # Socket to listen on a port
         | 
| 81 70 | 
             
            loop do                         # Servers run forever
         | 
| 82 71 | 
             
              Thread.start(server.accept) do |client|
         | 
| @@ -85,7 +74,7 @@ loop do                         # Servers run forever | |
| 85 74 | 
             
                while a = client.readline rescue nil
         | 
| 86 75 | 
             
                  count += 1
         | 
| 87 76 | 
             
                  puts "parsed %s'th name" % count if count % 1000 == 0
         | 
| 88 | 
            -
                  a.force_encoding( | 
| 77 | 
            +
                  a.force_encoding('utf-8') if a && RUBY_VERSION_INT >= 19
         | 
| 89 78 | 
             
                  if ['end','exit','q', '.'].include? a.strip
         | 
| 90 79 | 
             
                    client.close
         | 
| 91 80 | 
             
                    break
         |