biodiversity 1.0.10 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rvmrc CHANGED
@@ -1 +1 @@
1
- rvm use ruby-1.9.2-p290@biodiversity --create
1
+ rvm use ruby-1.9.3-p392@biodiversity --create
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ rvm:
2
+ - 1.9.3
3
+ - 2.0.0
4
+ bundler_args: --without development
5
+ branches:
6
+ only:
7
+ - master
data/CHANGELOG ADDED
@@ -0,0 +1,42 @@
1
+ 3.0.0 -- removing support for ruby 1.8.7, making biodiversity gem be the same
2
+ as biodiversity19, deprecating biodiversity19. A few newly discovered bugs
3
+ are fixed.
4
+
5
+ 2.1.0 -- added ScientificNameParser.version method
6
+
7
+ 2.0.0 -- backward incompatible change in parserver, therefore new major number.
8
+ In parserver removed option --output=canonical_with_rank, instead added -r
9
+ option which allows to have canonical with rank with either json or canonical
10
+ outputs
11
+
12
+ 1.2.0 -- changed method invocation signature ScientificNameParser.new
13
+ Now it can take options
14
+
15
+ 1.1.3 -- added 'fo' as rank
16
+
17
+ 1.1.2 -- static method for fixing all-caps canonical names, fixing caps
18
+ for authors words, ampersand instead of 'et' in normalization
19
+
20
+ 1.1.1 -- more multi-uninomials cases, expanded viruses detection, added
21
+ abbreviated genera and a few small fixes
22
+
23
+ 1.1.0 -- added multi-uninomials, fixes in identification annotations (aff.,
24
+ cf., sp. etc), bug fixes, more robust salvage mode
25
+
26
+ 1.0.17 -- fixed a bug which prevented all diacritics from being converted correctly
27
+
28
+ 1.0.16 -- dirty mode now converts ë to e
29
+
30
+ 1.0.15 -- additional rules added for names ending with ssp. sp sp. and cf.
31
+
32
+ 1.0.14 -- canonical forms had allowed ë as a character until now. After this
33
+ version the only utf-8 character allowed in canonical forms should be the
34
+ multiplication sign for hybrids.
35
+
36
+ 1.0.13 -- canonical forms for cf. aff. qualifiers are modified: canonical for
37
+ 'Aus cf. bus' is now 'Aus bus'; canonical for 'Aus aff. bus' is now 'Aus'.
38
+ Ranks at the end of the name like 'var', 'ssp', 'spp' are considered junk and
39
+ are ignored
40
+
41
+ 1.0.12 -- fixed a bug which prevented 'Cucurbita pepo' from being parsed correctly,
42
+ f., forma, fr. are now treated as any other ranks.
data/Gemfile CHANGED
@@ -1,13 +1,15 @@
1
- source "http://rubygems.org"
1
+ source 'https://rubygems.org'
2
2
 
3
- gem "treetop"
4
- gem "parallel"
3
+ gem 'rake', '~> 10.0'
4
+ gem 'treetop', '~> 1.4'
5
+ gem 'parallel', '~> 0.6'
6
+ gem 'unicode_utils', '~> 1.4'
5
7
 
6
8
  group :development do
7
- gem "jeweler"
9
+ gem 'debugger', '~> 1.5'
10
+ gem 'jeweler', '~> 1.8'
8
11
  end
9
12
 
10
13
  group :test do
11
- gem "ruby-debug19", :require => "ruby-debug"
12
- gem "rspec"
14
+ gem 'rspec', '~> 2.13'
13
15
  end
data/Gemfile.lock CHANGED
@@ -1,47 +1,47 @@
1
1
  GEM
2
- remote: http://rubygems.org/
2
+ remote: https://rubygems.org/
3
3
  specs:
4
- archive-tar-minitar (0.5.2)
5
- columnize (0.3.4)
6
- diff-lcs (1.1.3)
4
+ columnize (0.3.6)
5
+ debugger (1.5.0)
6
+ columnize (>= 0.3.1)
7
+ debugger-linecache (~> 1.2.0)
8
+ debugger-ruby_core_source (~> 1.2.0)
9
+ debugger-linecache (1.2.0)
10
+ debugger-ruby_core_source (1.2.0)
11
+ diff-lcs (1.2.1)
7
12
  git (1.2.5)
8
- jeweler (1.6.4)
13
+ jeweler (1.8.4)
9
14
  bundler (~> 1.0)
10
15
  git (>= 1.2.5)
11
16
  rake
12
- linecache19 (0.5.12)
13
- ruby_core_source (>= 0.1.4)
14
- parallel (0.5.9)
17
+ rdoc
18
+ json (1.7.7)
19
+ parallel (0.6.2)
15
20
  polyglot (0.3.3)
16
- rake (0.9.2.2)
17
- rspec (2.7.0)
18
- rspec-core (~> 2.7.0)
19
- rspec-expectations (~> 2.7.0)
20
- rspec-mocks (~> 2.7.0)
21
- rspec-core (2.7.1)
22
- rspec-expectations (2.7.0)
23
- diff-lcs (~> 1.1.2)
24
- rspec-mocks (2.7.0)
25
- ruby-debug-base19 (0.11.25)
26
- columnize (>= 0.3.1)
27
- linecache19 (>= 0.5.11)
28
- ruby_core_source (>= 0.1.4)
29
- ruby-debug19 (0.11.6)
30
- columnize (>= 0.3.1)
31
- linecache19 (>= 0.5.11)
32
- ruby-debug-base19 (>= 0.11.19)
33
- ruby_core_source (0.1.5)
34
- archive-tar-minitar (>= 0.5.2)
35
- treetop (1.4.10)
21
+ rake (10.0.3)
22
+ rdoc (4.0.0)
23
+ json (~> 1.4)
24
+ rspec (2.13.0)
25
+ rspec-core (~> 2.13.0)
26
+ rspec-expectations (~> 2.13.0)
27
+ rspec-mocks (~> 2.13.0)
28
+ rspec-core (2.13.1)
29
+ rspec-expectations (2.13.0)
30
+ diff-lcs (>= 1.1.3, < 2.0)
31
+ rspec-mocks (2.13.0)
32
+ treetop (1.4.12)
36
33
  polyglot
37
34
  polyglot (>= 0.3.1)
35
+ unicode_utils (1.4.0)
38
36
 
39
37
  PLATFORMS
40
38
  ruby
41
39
 
42
40
  DEPENDENCIES
43
- jeweler
44
- parallel
45
- rspec
46
- ruby-debug19
47
- treetop
41
+ debugger (~> 1.5)
42
+ jeweler (~> 1.8)
43
+ parallel (~> 0.6)
44
+ rake (~> 10.0)
45
+ rspec (~> 2.13)
46
+ treetop (~> 1.4)
47
+ unicode_utils (~> 1.4)
data/README.md ADDED
@@ -0,0 +1,167 @@
1
+ Biodiversity
2
+ ============
3
+
4
+ [![Gem Version][1]][2]
5
+ [![Continuous Integration Status][3]][4]
6
+ [![CodePolice][5]][6]
7
+ [![Dependency Status][7]][8]
8
+
9
+ Parses taxonomic scientific name and breaks it into semantic elements.
10
+
11
+ *WARNING, IMPORTANT!:*
12
+ Support for Ruby 1.8.7 IS DROPPED. Both biodiversity and
13
+ biodiversity19 will be for Ruby > 1.9.1 and will be identical gems.
14
+
15
+ biodiversity19 is now deprecated and will be phased out in a couple of years.
16
+ You are strongly encouraged to change your dependencies from
17
+ biodiversity19 to biodiversity
18
+
19
+ Installation
20
+ ------------
21
+
22
+ sudo gem install biodiversity
23
+
24
+ Example usage
25
+ -------------
26
+
27
+ ### As a command line script
28
+
29
+ You can parse a file with taxonomic names from the command line.
30
+ File should contain one scientific name per line
31
+
32
+ nnparse file_with_names
33
+
34
+ The results will be put into the parsed.json file in the current directory.
35
+ To save results into a different file:
36
+
37
+ nnparse file_with_names output_file
38
+
39
+ ### As a socket server
40
+
41
+ If you do not use Ruby and need fast access to the parser functionality
42
+ you can use a socket server
43
+
44
+ parserver
45
+
46
+ parserver -h
47
+ Usage: parserver [options]
48
+
49
+ -r, --canonical_with_rank Adds infraspecies rank to canonical forms
50
+
51
+ -o, --output=output Specifies the type of the output:
52
+ json - parsed results in json
53
+ canonical - canonical form only
54
+ Default: json
55
+
56
+ -p, --port=port Specifies the port number
57
+ Default: 4334
58
+
59
+ -h, --help Show this help message.
60
+
61
+ parserver --output=canonical
62
+
63
+
64
+
65
+ With default settings you can access parserver via port 4334 using a
66
+ socket client library of your programming language. You can find
67
+ [socket client script example][9] in the examples directory of the gem.
68
+
69
+ If you want to check if socket server works for you:
70
+
71
+ #run server in one terminal
72
+ parserver
73
+
74
+ #in another terminal window type
75
+ telnet localhost 4334
76
+
77
+ If you enter a line with a scientific name -- server will send you back
78
+ parsed information in json format.
79
+
80
+ To stop telnet client type any of `end`,`exit`,`q`, `.` instead
81
+ of scientific name
82
+
83
+ $ telnet localhost 4334
84
+ Trying ::1...
85
+ Connected to localhost.
86
+ Escape character is '^]'.
87
+ Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
88
+ {"scientificName":{"canonical":"Acacia abyssinica calophylla"...}}
89
+ end
90
+
91
+ ### As a library
92
+
93
+ You can use it as a library in Ruby, JRuby etc.
94
+
95
+ require 'biodiversity'
96
+
97
+ parser = ScientificNameParser.new
98
+
99
+ #to find version number
100
+ ScientificNameParser.version
101
+
102
+ # to fix capitalization in canonicals
103
+ ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA")
104
+ # Output: Quercus (Quercus) alba
105
+
106
+ # to parse a scientific name into a ruby hash
107
+ parser.parse("Plantago major")
108
+
109
+ #to get json representation
110
+ parser.parse("Plantago").to_json
111
+ #or
112
+ parser.parse("Plantago")
113
+ parser.all_json
114
+
115
+ # to clean name up
116
+ parser.parse(" Plantago major ")[:scientificName][:normalized]
117
+
118
+ # to get only cleaned up latin part of the name
119
+ parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:canonical]
120
+
121
+ # to get detailed information about elements of the name
122
+ parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
123
+
124
+ Returned result is not always linear, if name is complex. To get simple linear
125
+ representation of the name you can use:
126
+
127
+ parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:position]
128
+ # returns {0=>["genus", 16], 17=>["species", 26],
129
+ # 28=>["author_word", 32], 33=>["author_word", 40],
130
+ # 42=>["author_word", 44], 45=>["author_word", 50],
131
+ # 53=>["author_word", 58], 59=>["year", 63]}
132
+ # where the key is the char index of the start of
133
+ # a word, first element of the value is a semantic meaning
134
+ # of the word, second element of the value is the character index
135
+ # of end of the word
136
+
137
+ To parse using several CPUs (4 seem to be optimal)
138
+
139
+ parser = ParallelParser.new
140
+ # ParallelParser.new(4) will try to run 4 processes if hardware allows
141
+ array_of_names = ["Betula alba", "Homo sapiens"....]
142
+ parser.parse(array_of_names)
143
+ # Output: {"Betula alba" => {:scientificName...}, "Homo sapiens" => {:scientificName...}, ...}
144
+
145
+ parallel parser takes list of names and returns back a hash with names as keys and parsed data as values
146
+
147
+ To get canonicals with ranks for infraspecific epithets:
148
+
149
+ parser = ScientificNameParser.new(canonical_with_rank: true)
150
+ parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
151
+ # Output: Cola cordifolia var. puberula
152
+
153
+ To resolve lsid and get back RDF file
154
+
155
+ LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
156
+
157
+
158
+
159
+ [1]: https://badge.fury.io/rb/biodiversity19.png
160
+ [2]: http://badge.fury.io/rb/biodiversity19
161
+ [3]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.png
162
+ [4]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
163
+ [5]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.png
164
+ [6]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
165
+ [7]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.png
166
+ [8]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
167
+ [9]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/examples/socket_client.rb
data/Rakefile CHANGED
@@ -20,35 +20,41 @@ ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
20
20
  begin
21
21
  require 'jeweler'
22
22
  Jeweler::Tasks.new do |gem|
23
- gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
23
+ gem.name = 'biodiversity'
24
+ #To delete ruby_version < 19 ? 'biodiversity' : 'biodiversity19'
24
25
  gem.summary = 'Parser of scientific names'
25
26
  gem.description = 'Tools for biodiversity informatics'
26
- gem.email = "dmozzherin@gmail.com"
27
- gem.homepage = "http://github.com/GlobalNamesArchitecture/biodiversity"
28
- gem.authors = ["Dmitry Mozzherin"]
27
+ gem.email = 'dmozzherin@gmail.com'
28
+ gem.homepage = 'http://github.com/GlobalNamesArchitecture/biodiversity'
29
+ gem.authors = ['Dmitry Mozzherin']
29
30
  gem.has_rdoc = false
30
31
  gem.bindir = 'bin'
31
32
  gem.executables = ['nnparse', 'parserver']
32
33
  gem.add_dependency('treetop')
33
34
  gem.add_dependency('parallel')
34
- gem.add_dependency('json') if ruby_version < 19
35
+ # gem.add_dependency('json') if ruby_version < 19
35
36
  gem.add_development_dependency "rspec"
36
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
37
+ # gem is a Gem::Specification...
38
+ # see http://www.rubygems.org/read/chapter/20 for additional settings
37
39
  end
38
40
  rescue LoadError
39
- puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
41
+ puts 'Jeweler (or a dependency) not available. ' +
42
+ 'Install it with: sudo gem install jeweler'
40
43
  end
41
44
 
42
45
  task :tt do
43
- ['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
46
+ ['scientific_name_clean',
47
+ 'scientific_name_dirty',
48
+ 'scientific_name_canonical'].each do |f|
44
49
  file = "#{dir}/lib/biodiversity/parser/#{f}"
45
50
  FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
46
51
  system("tt #{file}.treetop")
47
52
  rf = "#{file}.rb"
48
- rfn = open(rf + ".tmp", 'w')
53
+ rfn = open(rf + '.tmp', 'w')
49
54
  skip_head = false
50
55
  f = open(rf)
51
- #getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
56
+ # getting around a bug in treetop which prevents setting
57
+ # UTF-8 encoding in ruby19
52
58
  f.each_with_index do |l, i|
53
59
  skip_head = l.match(/^# Autogenerated/) if i == 0
54
60
  if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
@@ -63,4 +69,3 @@ task :tt do
63
69
  `mv #{rf}.tmp #{rf}`
64
70
  end
65
71
  end
66
-
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.10
1
+ 3.0.0
data/bin/parserver CHANGED
@@ -5,78 +5,67 @@ require 'socket'
5
5
  require 'biodiversity' # Get sockets from stdlib
6
6
 
7
7
  DEFAULT_PORT = 4334
8
- RUBY_VERSION_INT = RUBY_VERSION.split(".")[0..1].join('').to_i
8
+ RUBY_VERSION_INT = RUBY_VERSION.split('.')[0..1].join('').to_i
9
9
  OPTIONS = {
10
- :output => "json",
11
- :port => DEFAULT_PORT
10
+ output: 'json',
11
+ canonical_with_rank: false,
12
+ port: DEFAULT_PORT
12
13
  }
13
14
 
14
15
  options = {}
15
16
  ARGV.options do |opts|
16
17
  script_name = File.basename($0)
17
- opts.banner = "Usage: ruby #{script_name} [options]"
18
+ opts.banner = "Usage: #{script_name} [options]"
18
19
 
19
- opts.separator ""
20
+ opts.separator ''
20
21
 
21
- opts.on("-o", "--output=output", String,
22
- "Specifies the type of the output:
22
+ opts.on('-r',
23
+ '--canonical_with_rank',
24
+ 'Adds infraspecies rank to canonical forms'
25
+ ) { |rank| options[:canonical_with_rank] = rank }
26
+
27
+ opts.separator ''
28
+
29
+ opts.on('-o', '--output=output', String,
30
+ 'Specifies the type of the output:
23
31
  json - parsed results in json
24
- canonical - canonical version
25
- canonical_with_rank - canonical with rank",
26
- "Default: json") { |output| options[:output] = output }
32
+ canonical - canonical form only',
33
+ 'Default: json') { |output| options[:output] = output }
27
34
 
28
- opts.separator ""
35
+ opts.separator ''
29
36
 
30
- opts.on("-p", "--port=port", String,
31
- "Specifies the port number",
37
+ opts.on('-p', '--port=port', String,
38
+ 'Specifies the port number',
32
39
  "Default: #{DEFAULT_PORT}") { |port| options[:port] = port }
33
40
 
34
- opts.separator ""
41
+ opts.separator ''
35
42
 
36
- opts.on("-h", "--help",
37
- "Show this help message.") { puts opts; exit }
43
+ opts.on('-h', '--help',
44
+ 'Show this help message.') { puts opts; exit }
38
45
 
39
46
  opts.parse!
40
47
  end
41
48
 
42
- OPTIONS[:output] = options[:output] if ['canonical', 'canonical_with_rank'].include?(options[:output])
49
+ OPTIONS[:output] = options[:output] if ['canonical'].include?(options[:output])
43
50
  OPTIONS[:port] = options[:port].to_i if options[:port].to_i > 0
44
-
45
- def parser_error(name_string)
46
- {:scientificName => {:parsed => false, :verbatim => name_string, :error => 'Parser error'}}
47
- end
51
+ OPTIONS[:canonical_with_rank] = !!options[:canonical_with_rank]
48
52
 
49
53
  def get_output(name_string, parser)
50
54
  begin
51
- if RUBY_VERSION_INT < 19
52
- old_kcode = $KCODE
53
- $KCODE = 'NONE'
54
- end
55
55
  parsed = parser.parse(name_string)
56
- if RUBY_VERSION_INT < 19
57
- $KCODE = old_kcode
58
- end
59
56
  rescue
60
- parsed = parser_error(name_string)
57
+ parsed = ScientificNameParser::FAILED_RESULT.(name_string)
61
58
  end
62
59
  output = OPTIONS[:output]
63
60
  return parsed.to_json if output == 'json'
64
- canonical = parsed[:scientificName][:canonical]
65
- return canonical.to_s if output == 'canonical' || canonical == nil || parsed[:scientificName][:hybrid] || !parsed[:scientificName][:parsed]
66
- parts = parsed[:scientificName][:canonical].split(" ")
67
-
68
- if parts.size > 2 && parsed[:scientificName][:details][0][:infraspecies]
69
- name_ary = parts[0..1]
70
- parsed[:scientificName][:details][0][:infraspecies].each do |data|
71
- name_ary << (data[:rank] && data[:rank] != 'n/a'? "#{data[:rank]} #{data[:string]}" : data[:string])
72
- end
73
- canonical = name_ary.join(" ")
74
- end
75
- canonical
61
+ parsed[:scientificName][:canonical].to_s
76
62
  end
77
63
 
78
- puts "Running parser service on port #{OPTIONS[:port]}, output type is '#{OPTIONS[:output]}'"
79
- parser = ScientificNameParser.new
64
+ puts "Running parser service on port %s, output type is '%s'" %
65
+ [OPTIONS[:port], OPTIONS[:output]]
66
+ opts = {}
67
+ opts = {canonical_with_rank: true} if OPTIONS[:canonical_with_rank]
68
+ parser = ScientificNameParser.new(opts)
80
69
  server = TCPServer.open(OPTIONS[:port]) # Socket to listen on a port
81
70
  loop do # Servers run forever
82
71
  Thread.start(server.accept) do |client|
@@ -85,7 +74,7 @@ loop do # Servers run forever
85
74
  while a = client.readline rescue nil
86
75
  count += 1
87
76
  puts "parsed %s'th name" % count if count % 1000 == 0
88
- a.force_encoding("utf-8") if a && RUBY_VERSION_INT >= 19
77
+ a.force_encoding('utf-8') if a && RUBY_VERSION_INT >= 19
89
78
  if ['end','exit','q', '.'].include? a.strip
90
79
  client.close
91
80
  break