biodiversity 1.0.10 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.rvmrc CHANGED
@@ -1 +1 @@
1
- rvm use ruby-1.9.2-p290@biodiversity --create
1
+ rvm use ruby-1.9.3-p392@biodiversity --create
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ rvm:
2
+ - 1.9.3
3
+ - 2.0.0
4
+ bundler_args: --without development
5
+ branches:
6
+ only:
7
+ - master
data/CHANGELOG ADDED
@@ -0,0 +1,42 @@
1
+ 3.0.0 -- removing support for ruby 1.8.7, making biodiversity gem be the same
2
+ as biodiversity19, deprecating biodiversity19. A few newly discovered bugs
3
+ are fixed.
4
+
5
+ 2.1.0 -- added ScientificNameParser.version method
6
+
7
+ 2.0.0 -- backward incompatible change in parserver, therefore new major number.
8
+ In parserver removed option --output=canonical_with_rank, instead added -r
9
+ option which allows to have canonical with rank with either json or canonical
10
+ outputs
11
+
12
+ 1.2.0 -- changed method invocation signature ScientificNameParser.new
13
+ Now it can take options
14
+
15
+ 1.1.3 -- added 'fo' as rank
16
+
17
+ 1.1.2 -- static method for fixing all-caps canonical names, fixing caps
18
+ for authors words, ampersand instead of 'et' in normalization
19
+
20
+ 1.1.1 -- more multi-uninomials cases, expanded viruses detection, added
21
+ abbreviated genera and a few small fixes
22
+
23
+ 1.1.0 -- added multi-uninomials, fixes in identification annotations (aff.,
24
+ cf., sp. etc), bug fixes, more robust salvage mode
25
+
26
+ 1.0.17 -- fixed a bug which prevented all diacritics from being converted correctly
27
+
28
+ 1.0.16 -- dirty mode now converts ë to e
29
+
30
+ 1.0.15 -- additional rules added for names ending with ssp. sp sp. and cf.
31
+
32
+ 1.0.14 -- canonical forms had allowed ë as a character until now. After this
33
+ version the only utf-8 character allowed in canonical forms should be the
34
+ multiplication sign for hybrids.
35
+
36
+ 1.0.13 -- canonical forms for cf. aff. qualifiers are modified: canonical for
37
+ 'Aus cf. bus' is now 'Aus bus'; canonical for 'Aus aff. bus' is now 'Aus'.
38
+ Ranks at the end of the name like 'var', 'ssp', 'spp' are considered junk and
39
+ are ignored
40
+
41
+ 1.0.12 -- fixed a bug which prevented 'Cucurbita pepo' from being parsed correctly,
42
+ f., forma, fr. are now treated as any other ranks.
data/Gemfile CHANGED
@@ -1,13 +1,15 @@
1
- source "http://rubygems.org"
1
+ source 'https://rubygems.org'
2
2
 
3
- gem "treetop"
4
- gem "parallel"
3
+ gem 'rake', '~> 10.0'
4
+ gem 'treetop', '~> 1.4'
5
+ gem 'parallel', '~> 0.6'
6
+ gem 'unicode_utils', '~> 1.4'
5
7
 
6
8
  group :development do
7
- gem "jeweler"
9
+ gem 'debugger', '~> 1.5'
10
+ gem 'jeweler', '~> 1.8'
8
11
  end
9
12
 
10
13
  group :test do
11
- gem "ruby-debug19", :require => "ruby-debug"
12
- gem "rspec"
14
+ gem 'rspec', '~> 2.13'
13
15
  end
data/Gemfile.lock CHANGED
@@ -1,47 +1,47 @@
1
1
  GEM
2
- remote: http://rubygems.org/
2
+ remote: https://rubygems.org/
3
3
  specs:
4
- archive-tar-minitar (0.5.2)
5
- columnize (0.3.4)
6
- diff-lcs (1.1.3)
4
+ columnize (0.3.6)
5
+ debugger (1.5.0)
6
+ columnize (>= 0.3.1)
7
+ debugger-linecache (~> 1.2.0)
8
+ debugger-ruby_core_source (~> 1.2.0)
9
+ debugger-linecache (1.2.0)
10
+ debugger-ruby_core_source (1.2.0)
11
+ diff-lcs (1.2.1)
7
12
  git (1.2.5)
8
- jeweler (1.6.4)
13
+ jeweler (1.8.4)
9
14
  bundler (~> 1.0)
10
15
  git (>= 1.2.5)
11
16
  rake
12
- linecache19 (0.5.12)
13
- ruby_core_source (>= 0.1.4)
14
- parallel (0.5.9)
17
+ rdoc
18
+ json (1.7.7)
19
+ parallel (0.6.2)
15
20
  polyglot (0.3.3)
16
- rake (0.9.2.2)
17
- rspec (2.7.0)
18
- rspec-core (~> 2.7.0)
19
- rspec-expectations (~> 2.7.0)
20
- rspec-mocks (~> 2.7.0)
21
- rspec-core (2.7.1)
22
- rspec-expectations (2.7.0)
23
- diff-lcs (~> 1.1.2)
24
- rspec-mocks (2.7.0)
25
- ruby-debug-base19 (0.11.25)
26
- columnize (>= 0.3.1)
27
- linecache19 (>= 0.5.11)
28
- ruby_core_source (>= 0.1.4)
29
- ruby-debug19 (0.11.6)
30
- columnize (>= 0.3.1)
31
- linecache19 (>= 0.5.11)
32
- ruby-debug-base19 (>= 0.11.19)
33
- ruby_core_source (0.1.5)
34
- archive-tar-minitar (>= 0.5.2)
35
- treetop (1.4.10)
21
+ rake (10.0.3)
22
+ rdoc (4.0.0)
23
+ json (~> 1.4)
24
+ rspec (2.13.0)
25
+ rspec-core (~> 2.13.0)
26
+ rspec-expectations (~> 2.13.0)
27
+ rspec-mocks (~> 2.13.0)
28
+ rspec-core (2.13.1)
29
+ rspec-expectations (2.13.0)
30
+ diff-lcs (>= 1.1.3, < 2.0)
31
+ rspec-mocks (2.13.0)
32
+ treetop (1.4.12)
36
33
  polyglot
37
34
  polyglot (>= 0.3.1)
35
+ unicode_utils (1.4.0)
38
36
 
39
37
  PLATFORMS
40
38
  ruby
41
39
 
42
40
  DEPENDENCIES
43
- jeweler
44
- parallel
45
- rspec
46
- ruby-debug19
47
- treetop
41
+ debugger (~> 1.5)
42
+ jeweler (~> 1.8)
43
+ parallel (~> 0.6)
44
+ rake (~> 10.0)
45
+ rspec (~> 2.13)
46
+ treetop (~> 1.4)
47
+ unicode_utils (~> 1.4)
data/README.md ADDED
@@ -0,0 +1,167 @@
1
+ Biodiversity
2
+ ============
3
+
4
+ [![Gem Version][1]][2]
5
+ [![Continuous Integration Status][3]][4]
6
+ [![CodePolice][5]][6]
7
+ [![Dependency Status][7]][8]
8
+
9
+ Parses taxonomic scientific name and breaks it into semantic elements.
10
+
11
+ *WARNING, IMPORTANT!:*
12
+ Support for Ruby 1.8.7 IS DROPPED. Both biodiversity and
13
+ biodiversity19 will be for Ruby > 1.9.1 and will be identical gems.
14
+
15
+ biodiversity19 is now deprecated and will be phased out in a couple of years.
16
+ You are strongly encouraged to change your dependencies from
17
+ biodiversity19 to biodiversity
18
+
19
+ Installation
20
+ ------------
21
+
22
+ sudo gem install biodiversity
23
+
24
+ Example usage
25
+ -------------
26
+
27
+ ### As a command line script
28
+
29
+ You can parse file with taxonomic names from command line.
30
+ File should contain one scientific name per line
31
+
32
+ nnparse file_with_names
33
+
34
+ The results will be put into the parsed.json file in the current directory.
35
+ To save results into a different file:
36
+
37
+ nnparse file_with_names output_file
38
+
39
+ ### As a socket server
40
+
41
+ If you do not use Ruby and need fast access to the parser functionality
42
+ you can use a socket server
43
+
44
+ parserver
45
+
46
+ parserver -h
47
+ Usage: parserver [options]
48
+
49
+ -r, --canonical_with_rank Adds infraspecies rank to canonical forms
50
+
51
+ -o, --output=output Specifies the type of the output:
52
+ json - parsed results in json
53
+ canonical - canonical form only
54
+ Default: json
55
+
56
+ -p, --port=port Specifies the port number
57
+ Default: 4334
58
+
59
+ -h, --help Show this help message.
60
+
61
+ parserver --output=canonical
62
+
63
+
64
+
65
+ With default settings you can access parserver via port 4334 using a
66
+ socket client library of your programming language. You can find
67
+ [socket client script example][9] in the examples directory of the gem.
68
+
69
+ If you want to check if socket server works for you:
70
+
71
+ #run server in one terminal
72
+ parserver
73
+
74
+ #in another terminal window type
75
+ telnet localhost 4334
76
+
77
+ If you enter a line with a scientific name -- server will send you back
78
+ parsed information in json format.
79
+
80
+ To stop telnet client type any of `end`,`exit`,`q`, `.` instead
81
+ of scientific name
82
+
83
+ $ telnet localhost 4334
84
+ Trying ::1...
85
+ Connected to localhost.
86
+ Escape character is '^]'.
87
+ Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
88
+ {"scientificName":{"canonical":"Acacia abyssinica calophylla"...}}
89
+ end
90
+
91
+ ### As a library
92
+
93
+ You can use it as a library in Ruby, JRuby etc.
94
+
95
+ require 'biodiversity'
96
+
97
+ parser = ScientificNameParser.new
98
+
99
+ #to find version number
100
+ ScientificNameParser.version
101
+
102
+ # to fix capitalization in canonicals
103
+ ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA")
104
+ # Output: Quercus (Quercus) alba
105
+
106
+ # to parse a scientific name into a ruby hash
107
+ parser.parse("Plantago major")
108
+
109
+ #to get json representation
110
+ parser.parse("Plantago").to_json
111
+ #or
112
+ parser.parse("Plantago")
113
+ parser.all_json
114
+
115
+ # to clean name up
116
+ parser.parse(" Plantago major ")[:scientificName][:normalized]
117
+
118
+ # to get only cleaned up latin part of the name
119
+ parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:canonical]
120
+
121
+ # to get detailed information about elements of the name
122
+ parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
123
+
124
+ Returned result is not always linear, if name is complex. To get simple linear
125
+ representation of the name you can use:
126
+
127
+ parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:position]
128
+ # returns {0=>["genus", 16], 17=>["species", 26],
129
+ # 28=>["author_word", 32], 33=>["author_word", 40],
130
+ # 42=>["author_word", 44], 45=>["author_word", 50],
131
+ # 53=>["author_word", 58], 59=>["year", 63]}
132
+ # where the key is the char index of the start of
133
+ # a word, first element of the value is a semantic meaning
134
+ # of the word, second element of the value is the character index
135
+ # of end of the word
136
+
137
+ To parse using several CPUs (4 seem to be optimal)
138
+
139
+ parser = ParallelParser.new
140
+ # ParallelParser.new(4) will try to run 4 processes if hardware allows
141
+ array_of_names = ["Betula alba", "Homo sapiens"....]
142
+ parser.parse(array_of_names)
143
+ # Output: {"Betula alba" => {:scientificName...}, "Homo sapiens" => {:scientificName...}, ...}
144
+
145
+ The parallel parser takes a list of names and returns a hash with names as keys and parsed data as values.
146
+
147
+ To get canonicals with ranks for infraspecific epithets:
148
+
149
+ parser = ScientificNameParser.new(canonical_with_rank: true)
150
+ parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
151
+ # Output: Cola cordifolia var. puberula
152
+
153
+ To resolve lsid and get back RDF file
154
+
155
+ LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
156
+
157
+
158
+
159
+ [1]: https://badge.fury.io/rb/biodiversity19.png
160
+ [2]: http://badge.fury.io/rb/biodiversity19
161
+ [3]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.png
162
+ [4]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
163
+ [5]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.png
164
+ [6]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
165
+ [7]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.png
166
+ [8]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
167
+ [9]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/examples/socket_client.rb
data/Rakefile CHANGED
@@ -20,35 +20,41 @@ ruby_version = RUBY_VERSION.split('.')[0..1].join('').to_i
20
20
  begin
21
21
  require 'jeweler'
22
22
  Jeweler::Tasks.new do |gem|
23
- gem.name = ruby_version < 19 ? "biodiversity" : "biodiversity19"
23
+ gem.name = 'biodiversity'
24
+ #To delete ruby_version < 19 ? 'biodiversity' : 'biodiversity19'
24
25
  gem.summary = 'Parser of scientific names'
25
26
  gem.description = 'Tools for biodiversity informatics'
26
- gem.email = "dmozzherin@gmail.com"
27
- gem.homepage = "http://github.com/GlobalNamesArchitecture/biodiversity"
28
- gem.authors = ["Dmitry Mozzherin"]
27
+ gem.email = 'dmozzherin@gmail.com'
28
+ gem.homepage = 'http://github.com/GlobalNamesArchitecture/biodiversity'
29
+ gem.authors = ['Dmitry Mozzherin']
29
30
  gem.has_rdoc = false
30
31
  gem.bindir = 'bin'
31
32
  gem.executables = ['nnparse', 'parserver']
32
33
  gem.add_dependency('treetop')
33
34
  gem.add_dependency('parallel')
34
- gem.add_dependency('json') if ruby_version < 19
35
+ # gem.add_dependency('json') if ruby_version < 19
35
36
  gem.add_development_dependency "rspec"
36
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
37
+ # gem is a Gem::Specification...
38
+ # see http://www.rubygems.org/read/chapter/20 for additional settings
37
39
  end
38
40
  rescue LoadError
39
- puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
41
+ puts 'Jeweler (or a dependency) not available. ' +
42
+ 'Install it with: sudo gem install jeweler'
40
43
  end
41
44
 
42
45
  task :tt do
43
- ['scientific_name_clean', 'scientific_name_dirty', 'scientific_name_canonical'].each do |f|
46
+ ['scientific_name_clean',
47
+ 'scientific_name_dirty',
48
+ 'scientific_name_canonical'].each do |f|
44
49
  file = "#{dir}/lib/biodiversity/parser/#{f}"
45
50
  FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
46
51
  system("tt #{file}.treetop")
47
52
  rf = "#{file}.rb"
48
- rfn = open(rf + ".tmp", 'w')
53
+ rfn = open(rf + '.tmp', 'w')
49
54
  skip_head = false
50
55
  f = open(rf)
51
- #getting around a bug in treetop which prevents setting UTF-8 encoding in ruby19
56
+ # getting around a bug in treetop which prevents setting
57
+ # UTF-8 encoding in ruby19
52
58
  f.each_with_index do |l, i|
53
59
  skip_head = l.match(/^# Autogenerated/) if i == 0
54
60
  if skip_head && (l.strip == '' || l.match(/^# Autogenerated/))
@@ -63,4 +69,3 @@ task :tt do
63
69
  `mv #{rf}.tmp #{rf}`
64
70
  end
65
71
  end
66
-
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.10
1
+ 3.0.0
data/bin/parserver CHANGED
@@ -5,78 +5,67 @@ require 'socket'
5
5
  require 'biodiversity' # Get sockets from stdlib
6
6
 
7
7
  DEFAULT_PORT = 4334
8
- RUBY_VERSION_INT = RUBY_VERSION.split(".")[0..1].join('').to_i
8
+ RUBY_VERSION_INT = RUBY_VERSION.split('.')[0..1].join('').to_i
9
9
  OPTIONS = {
10
- :output => "json",
11
- :port => DEFAULT_PORT
10
+ output: 'json',
11
+ canonical_with_rank: false,
12
+ port: DEFAULT_PORT
12
13
  }
13
14
 
14
15
  options = {}
15
16
  ARGV.options do |opts|
16
17
  script_name = File.basename($0)
17
- opts.banner = "Usage: ruby #{script_name} [options]"
18
+ opts.banner = "Usage: #{script_name} [options]"
18
19
 
19
- opts.separator ""
20
+ opts.separator ''
20
21
 
21
- opts.on("-o", "--output=output", String,
22
- "Specifies the type of the output:
22
+ opts.on('-r',
23
+ '--canonical_with_rank',
24
+ 'Adds infraspecies rank to canonical forms'
25
+ ) { |rank| options[:canonical_with_rank] = rank }
26
+
27
+ opts.separator ''
28
+
29
+ opts.on('-o', '--output=output', String,
30
+ 'Specifies the type of the output:
23
31
  json - parsed results in json
24
- canonical - canonical version
25
- canonical_with_rank - canonical with rank",
26
- "Default: json") { |output| options[:output] = output }
32
+ canonical - canonical form only',
33
+ 'Default: json') { |output| options[:output] = output }
27
34
 
28
- opts.separator ""
35
+ opts.separator ''
29
36
 
30
- opts.on("-p", "--port=port", String,
31
- "Specifies the port number",
37
+ opts.on('-p', '--port=port', String,
38
+ 'Specifies the port number',
32
39
  "Default: #{DEFAULT_PORT}") { |port| options[:port] = port }
33
40
 
34
- opts.separator ""
41
+ opts.separator ''
35
42
 
36
- opts.on("-h", "--help",
37
- "Show this help message.") { puts opts; exit }
43
+ opts.on('-h', '--help',
44
+ 'Show this help message.') { puts opts; exit }
38
45
 
39
46
  opts.parse!
40
47
  end
41
48
 
42
- OPTIONS[:output] = options[:output] if ['canonical', 'canonical_with_rank'].include?(options[:output])
49
+ OPTIONS[:output] = options[:output] if ['canonical'].include?(options[:output])
43
50
  OPTIONS[:port] = options[:port].to_i if options[:port].to_i > 0
44
-
45
- def parser_error(name_string)
46
- {:scientificName => {:parsed => false, :verbatim => name_string, :error => 'Parser error'}}
47
- end
51
+ OPTIONS[:canonical_with_rank] = !!options[:canonical_with_rank]
48
52
 
49
53
  def get_output(name_string, parser)
50
54
  begin
51
- if RUBY_VERSION_INT < 19
52
- old_kcode = $KCODE
53
- $KCODE = 'NONE'
54
- end
55
55
  parsed = parser.parse(name_string)
56
- if RUBY_VERSION_INT < 19
57
- $KCODE = old_kcode
58
- end
59
56
  rescue
60
- parsed = parser_error(name_string)
57
+ parsed = ScientificNameParser::FAILED_RESULT.(name_string)
61
58
  end
62
59
  output = OPTIONS[:output]
63
60
  return parsed.to_json if output == 'json'
64
- canonical = parsed[:scientificName][:canonical]
65
- return canonical.to_s if output == 'canonical' || canonical == nil || parsed[:scientificName][:hybrid] || !parsed[:scientificName][:parsed]
66
- parts = parsed[:scientificName][:canonical].split(" ")
67
-
68
- if parts.size > 2 && parsed[:scientificName][:details][0][:infraspecies]
69
- name_ary = parts[0..1]
70
- parsed[:scientificName][:details][0][:infraspecies].each do |data|
71
- name_ary << (data[:rank] && data[:rank] != 'n/a'? "#{data[:rank]} #{data[:string]}" : data[:string])
72
- end
73
- canonical = name_ary.join(" ")
74
- end
75
- canonical
61
+ parsed[:scientificName][:canonical].to_s
76
62
  end
77
63
 
78
- puts "Running parser service on port #{OPTIONS[:port]}, output type is '#{OPTIONS[:output]}'"
79
- parser = ScientificNameParser.new
64
+ puts "Running parser service on port %s, output type is '%s'" %
65
+ [OPTIONS[:port], OPTIONS[:output]]
66
+ opts = {}
67
+ opts = {canonical_with_rank: true} if OPTIONS[:canonical_with_rank]
68
+ parser = ScientificNameParser.new(opts)
80
69
  server = TCPServer.open(OPTIONS[:port]) # Socket to listen on a port
81
70
  loop do # Servers run forever
82
71
  Thread.start(server.accept) do |client|
@@ -85,7 +74,7 @@ loop do # Servers run forever
85
74
  while a = client.readline rescue nil
86
75
  count += 1
87
76
  puts "parsed %s'th name" % count if count % 1000 == 0
88
- a.force_encoding("utf-8") if a && RUBY_VERSION_INT >= 19
77
+ a.force_encoding('utf-8') if a && RUBY_VERSION_INT >= 19
89
78
  if ['end','exit','q', '.'].include? a.strip
90
79
  client.close
91
80
  break