biodiversity19 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.rvmrc CHANGED
@@ -1 +1 @@
1
- rvm use ruby-1.9.3-p194@biodiversity --create
1
+ rvm use ruby-1.9.3-p392@biodiversity --create
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ 2.0.0 -- backward incompatible change in parserver, therefore new major number.
2
+ In parserver removed option --output=canonical_with_rank, instead added -r
3
+ option, which makes it possible to get canonical forms with rank in either json or canonical
4
+ outputs
5
+
6
+ 1.2.0 -- changed method invocation signature ScientificNameParser.new
7
+ Now it can take options
8
+
1
9
  1.1.3 -- added 'fo' as rank
2
10
 
3
11
  1.1.2 -- static method for fixing all-caps canonical names, fixing caps
data/README.rdoc CHANGED
@@ -31,14 +31,14 @@ options:
31
31
 
32
32
  to return a canonical form of the name string
33
33
 
34
- parserver --output=canonical_with_rank
35
-
36
- the same as above, but infraspecies' rank is shown if available
37
-
38
34
  parserver --port 5555
39
35
 
40
36
  run socket server on a different port
41
37
 
38
+ parserver --canonical_with_rank
39
+
40
+ to add the rank to canonical forms that have an infraspecific epithet, if the rank is given
41
+
42
42
  Then you can access it via port 4334 using a socket client library of your programming language. You can find a socket client script example in the examples directory of the gem.
43
43
 
44
44
  If you want to check if socket server works for you:
data/Rakefile CHANGED
@@ -63,4 +63,3 @@ task :tt do
63
63
  `mv #{rf}.tmp #{rf}`
64
64
  end
65
65
  end
66
-
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.2.0
1
+ 2.0.0
data/bin/parserver CHANGED
@@ -5,10 +5,11 @@ require 'socket'
5
5
  require 'biodiversity' # Get sockets from stdlib
6
6
 
7
7
  DEFAULT_PORT = 4334
8
- RUBY_VERSION_INT = RUBY_VERSION.split(".")[0..1].join('').to_i
8
+ RUBY_VERSION_INT = RUBY_VERSION.split('.')[0..1].join('').to_i
9
9
  OPTIONS = {
10
- :output => "json",
11
- :port => DEFAULT_PORT
10
+ output: 'json',
11
+ canonical_with_rank: false,
12
+ port: DEFAULT_PORT
12
13
  }
13
14
 
14
15
  options = {}
@@ -16,67 +17,55 @@ ARGV.options do |opts|
16
17
  script_name = File.basename($0)
17
18
  opts.banner = "Usage: ruby #{script_name} [options]"
18
19
 
19
- opts.separator ""
20
+ opts.separator ''
20
21
 
21
- opts.on("-o", "--output=output", String,
22
- "Specifies the type of the output:
22
+ opts.on('-r',
23
+ '--canonical_with_rank',
24
+ 'Adds infraspecies rank to canonical forms'
25
+ ) { |rank| options[:canonical_with_rank] = rank }
26
+
27
+ opts.separator ''
28
+
29
+ opts.on('-o', '--output=output', String,
30
+ 'Specifies the type of the output:
23
31
  json - parsed results in json
24
- canonical - canonical version
25
- canonical_with_rank - canonical with rank",
26
- "Default: json") { |output| options[:output] = output }
32
+ canonical - canonical form only',
33
+ 'Default: json') { |output| options[:output] = output }
27
34
 
28
- opts.separator ""
35
+ opts.separator ''
29
36
 
30
- opts.on("-p", "--port=port", String,
31
- "Specifies the port number",
37
+ opts.on('-p', '--port=port', String,
38
+ 'Specifies the port number',
32
39
  "Default: #{DEFAULT_PORT}") { |port| options[:port] = port }
33
40
 
34
- opts.separator ""
41
+ opts.separator ''
35
42
 
36
- opts.on("-h", "--help",
37
- "Show this help message.") { puts opts; exit }
43
+ opts.on('-h', '--help',
44
+ 'Show this help message.') { puts opts; exit }
38
45
 
39
46
  opts.parse!
40
47
  end
41
48
 
42
- OPTIONS[:output] = options[:output] if ['canonical', 'canonical_with_rank'].include?(options[:output])
49
+ OPTIONS[:output] = options[:output] if ['canonical'].include?(options[:output])
43
50
  OPTIONS[:port] = options[:port].to_i if options[:port].to_i > 0
44
-
45
- def parser_error(name_string)
46
- {:scientificName => {:parsed => false, :verbatim => name_string, :error => 'Parser error'}}
47
- end
51
+ OPTIONS[:canonical_with_rank] = !!options[:canonical_with_rank]
48
52
 
49
53
  def get_output(name_string, parser)
50
54
  begin
51
- if RUBY_VERSION_INT < 19
52
- old_kcode = $KCODE
53
- $KCODE = 'NONE'
54
- end
55
55
  parsed = parser.parse(name_string)
56
- if RUBY_VERSION_INT < 19
57
- $KCODE = old_kcode
58
- end
59
56
  rescue
60
- parsed = parser_error(name_string)
57
+ parsed = ScientificNameParser::FAILED_RESULT.(name_string)
61
58
  end
62
59
  output = OPTIONS[:output]
63
60
  return parsed.to_json if output == 'json'
64
- canonical = parsed[:scientificName][:canonical]
65
- return canonical.to_s if output == 'canonical' || canonical == nil || parsed[:scientificName][:hybrid] || !parsed[:scientificName][:parsed]
66
- parts = parsed[:scientificName][:canonical].split(" ")
67
-
68
- if parts.size > 2 && parsed[:scientificName][:details][0][:infraspecies]
69
- name_ary = parts[0..1]
70
- parsed[:scientificName][:details][0][:infraspecies].each do |data|
71
- name_ary << (data[:rank] && data[:rank] != 'n/a'? "#{data[:rank]} #{data[:string]}" : data[:string])
72
- end
73
- canonical = name_ary.join(" ")
74
- end
75
- canonical
61
+ parsed[:scientificName][:canonical].to_s
76
62
  end
77
63
 
78
- puts "Running parser service on port #{OPTIONS[:port]}, output type is '#{OPTIONS[:output]}'"
79
- parser = ScientificNameParser.new
64
+ puts "Running parser service on port %s, output type is '%s'" %
65
+ [OPTIONS[:port], OPTIONS[:output]]
66
+ opts = {}
67
+ opts = {canonical_with_rank: true} if OPTIONS[:canonical_with_rank]
68
+ parser = ScientificNameParser.new(opts)
80
69
  server = TCPServer.open(OPTIONS[:port]) # Socket to listen on a port
81
70
  loop do # Servers run forever
82
71
  Thread.start(server.accept) do |client|
@@ -85,7 +74,7 @@ loop do # Servers run forever
85
74
  while a = client.readline rescue nil
86
75
  count += 1
87
76
  puts "parsed %s'th name" % count if count % 1000 == 0
88
- a.force_encoding("utf-8") if a && RUBY_VERSION_INT >= 19
77
+ a.force_encoding('utf-8') if a && RUBY_VERSION_INT >= 19
89
78
  if ['end','exit','q', '.'].include? a.strip
90
79
  client.close
91
80
  break
@@ -15,7 +15,8 @@ module PreProcessor
15
15
  LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|nudum|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
16
16
 
17
17
  def self.clean(a_string)
18
- [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
18
+ [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
19
+ TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
19
20
  a_string = a_string.gsub(i, '')
20
21
  end
21
22
  a_string = a_string.tr('ſ','s') #old 's'
@@ -49,10 +50,7 @@ class ParallelParser
49
50
  private
50
51
  def parse_process(name)
51
52
  p = ScientificNameParser.new
52
- failed_res = { scientificName: { parsed: false,
53
- verbatim: name,
54
- error: 'Parser error' } }
55
- p.parse(name) rescue failed_res
53
+ p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
56
54
  end
57
55
  end
58
56
 
@@ -73,6 +71,12 @@ class ScientificNameParser
73
71
  '..',
74
72
  '..',
75
73
  'VERSION')).readline.strip
74
+
75
+ FAILED_RESULT = ->(name) do
76
+ { scientificName:
77
+ { parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
78
+ }
79
+ end
76
80
 
77
81
  def self.fix_case(name_string)
78
82
  name_ary = name_string.split(/\s+/)
@@ -87,17 +91,21 @@ class ScientificNameParser
87
91
  end
88
92
  else
89
93
  if name_ary[0].size > 1
90
- word1 = UnicodeUtils.upcase(name_ary[0][0]) + UnicodeUtils.downcase(name_ary[0][1..-1])
94
+ word1 = UnicodeUtils.upcase(name_ary[0][0]) +
95
+ UnicodeUtils.downcase(name_ary[0][1..-1])
91
96
  else
92
97
  word1 = name_ary[0]
93
98
  end
94
99
  if name_ary[1].match(/^\(/)
95
- word2 = name_ary[1].gsub(/\)$/, '') + ")"
96
- word2 = word2[0] + UnicodeUtils.upcase(word2[1]) + UnicodeUtils.downcase(word2[2..-1])
100
+ word2 = name_ary[1].gsub(/\)$/, '') + ')'
101
+ word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
102
+ UnicodeUtils.downcase(word2[2..-1])
97
103
  else
98
104
  word2 = UnicodeUtils.downcase(name_ary[1])
99
105
  end
100
- res = word1 + " " + word2 + " " + name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
106
+ res = word1 + ' ' +
107
+ word2 + ' ' +
108
+ name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
101
109
  res.strip!
102
110
  end
103
111
  res
@@ -114,7 +122,9 @@ class ScientificNameParser
114
122
  end
115
123
 
116
124
  def virus?(a_string)
117
- !!(a_string.match(/\sICTV\s*$/) || a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i) || a_string.match(/[A-Z]?[a-z]+virus\b/))
125
+ !!(a_string.match(/\sICTV\s*$/) ||
126
+ a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i) ||
127
+ a_string.match(/[A-Z]?[a-z]+virus\b/))
118
128
  end
119
129
 
120
130
  def unknown_placement?(a_string)
@@ -126,13 +136,13 @@ class ScientificNameParser
126
136
  end
127
137
 
128
138
  def parse(a_string)
129
- @verbatim = a_string
139
+ @verbatim = a_string.strip
130
140
  a_string = PreProcessor::clean(a_string)
131
141
 
132
142
  if virus?(a_string)
133
- @parsed = { :verbatim => a_string, :virus => true }
143
+ @parsed = { verbatim: a_string, virus: true }
134
144
  elsif unknown_placement?(a_string)
135
- @parsed = { :verbatim => a_string }
145
+ @parsed = { verbatim: a_string }
136
146
  else
137
147
  begin
138
148
  @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
@@ -140,12 +150,12 @@ class ScientificNameParser
140
150
  index = @dirty.index || @clean.index
141
151
  salvage_match = a_string[0..index].split(/\s+/)[0..-2]
142
152
  salvage_string = salvage_match ? salvage_match.join(' ') : a_string
143
- @parsed = @dirty.parse(salvage_string) || @canonical.parse(a_string) || { :verbatim => a_string }
153
+ @parsed = @dirty.parse(salvage_string) ||
154
+ @canonical.parse(a_string) ||
155
+ { verbatim: a_string }
144
156
  end
145
157
  rescue
146
- @parsed = { scientificName: { parsed: false,
147
- verbatim: name,
148
- error: 'Parser error' } }
158
+ @parsed = FAILED_RESULT.(@verbatim)
149
159
  end
150
160
  end
151
161
 
@@ -156,22 +166,24 @@ class ScientificNameParser
156
166
  def @parsed.all(opts = {})
157
167
  canonical_with_rank = !!opts[:canonical_with_rank]
158
168
  parsed = self.class != Hash
159
- res = { :parsed => parsed, :parser_version => ScientificNameParser::VERSION}
169
+ res = { parsed: parsed, parser_version: ScientificNameParser::VERSION}
160
170
  if parsed
161
171
  hybrid = self.hybrid rescue false
162
172
  res.merge!({
163
- :verbatim => @verbatim,
164
- :normalized => self.value,
165
- :canonical => self.canonical,
166
- :hybrid => hybrid,
167
- :details => self.details,
168
- :parser_run => self.parser_run,
169
- :positions => self.pos
173
+ verbatim: @verbatim,
174
+ normalized: self.value,
175
+ canonical: self.canonical,
176
+ hybrid: hybrid,
177
+ details: self.details,
178
+ parser_run: self.parser_run,
179
+ positions: self.pos
170
180
  })
171
181
  else
172
182
  res.merge!(self)
173
183
  end
174
- if canonical_with_rank && canonical.count(" ") > 1 && res[:details][0][:infraspecies]
184
+ if (canonical_with_rank &&
185
+ canonical.count(' ') > 1 &&
186
+ res[:details][0][:infraspecies])
175
187
  ScientificNameParser.add_rank_to_canonical(res)
176
188
  end
177
189
  res = {:scientificName => res}
@@ -192,14 +204,14 @@ class ScientificNameParser
192
204
  private
193
205
 
194
206
  def self.add_rank_to_canonical(parsed)
195
- parts = parsed[:canonical].split(" ")
207
+ parts = parsed[:canonical].split(' ')
196
208
  name_ary = parts[0..1]
197
209
  parsed[:details][0][:infraspecies].each do |data|
198
210
  infrasp = data[:string]
199
211
  rank = data[:rank]
200
212
  name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
201
213
  end
202
- parsed[:canonical] = name_ary.join(" ")
214
+ parsed[:canonical] = name_ary.join(' ')
203
215
  end
204
216
 
205
217
  end
@@ -123,7 +123,7 @@ describe ParallelParser do
123
123
  res = pparser.parse(names)
124
124
  names.each_with_index do |name, i|
125
125
  res[name].is_a?(Hash).should be_true
126
- res[name][:scientificName][:verbatim].should == name
126
+ res[name][:scientificName][:verbatim].should == name.strip
127
127
  end
128
128
  end
129
129
  end
@@ -276,8 +276,8 @@ Coeloglossum viride (L.) Hartman x Dactylorhiza majalis (Rchb. f.) P.F. Hunt & S
276
276
  Polypodium x vulgare nothosubsp. mantoniae (Rothm.) Schidlay|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Polypodium x vulgare nothosubsp. mantoniae (Rothm.) Schidlay", "normalized":"Polypodium × vulgare nothosubsp. mantoniae (Rothm.) Schidlay", "canonical":"Polypodium vulgare mantoniae", "hybrid":true, "details":[{"genus":{"string":"Polypodium"}, "species":{"string":"vulgare"}, "infraspecies":[{"string":"mantoniae", "rank":"nothosubsp.", "authorship":"(Rothm.) Schidlay", "combinationAuthorTeam":{"authorTeam":"Schidlay", "author":["Schidlay"]}, "basionymAuthorTeam":{"authorTeam":"Rothm.", "author":["Rothm."]}}]}], "parser_run":1, "positions":{"0":["genus", 10], "14":["species", 21], "22":["infraspecific_type", 33], "34":["infraspecies", 43], "45":["author_word", 51], "53":["author_word", 61]}}}
277
277
 
278
278
  #empty spaces
279
- Asplenium Xinexpectatum(E. L. Braun ex Friesner )Morton |{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":" Asplenium Xinexpectatum(E. L. Braun ex Friesner )Morton ", "normalized":"Asplenium × inexpectatum (E. L. Braun ex Friesner) Morton", "canonical":"Asplenium inexpectatum", "hybrid":true, "details":[{"genus":{"string":"Asplenium"}, "species":{"string":"inexpectatum", "authorship":"(E. L. Braun ex Friesner )Morton", "combinationAuthorTeam":{"authorTeam":"Morton", "author":["Morton"]}, "basionymAuthorTeam":{"authorTeam":"E. L. Braun", "author":["E. L. Braun"], "exAuthorTeam":{"authorTeam":"Friesner", "author":["Friesner"]}}}}], "parser_run":1, "positions":{"4":["genus", 13], "21":["species", 33], "34":["author_word", 36], "37":["author_word", 39], "40":["author_word", 45], "49":["author_word", 57], "64":["author_word", 70]}}}
280
-
279
+ Asplenium Xinexpectatum(E. L. Braun ex Friesner )Morton |{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Asplenium Xinexpectatum(E. L. Braun ex Friesner )Morton", "normalized":"Asplenium × inexpectatum (E. L. Braun ex Friesner) Morton", "canonical":"Asplenium inexpectatum", "hybrid":true, "details":[{"genus":{"string":"Asplenium"}, "species":{"string":"inexpectatum", "authorship":"(E. L. Braun ex Friesner )Morton", "combinationAuthorTeam":{"authorTeam":"Morton", "author":["Morton"]}, "basionymAuthorTeam":{"authorTeam":"E. L. Braun", "author":["E. L. Braun"], "exAuthorTeam":{"authorTeam":"Friesner", "author":["Friesner"]}}}}], "parser_run":1, "positions":{"4":["genus", 13], "21":["species", 33], "34":["author_word", 36], "37":["author_word", 39], "40":["author_word", 45], "49":["author_word", 57], "64":["author_word", 70]}}}
280
+
281
281
  ####
282
282
  #
283
283
  # Names with problems
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biodiversity19
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 2.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-11 00:00:00.000000000 Z
12
+ date: 2013-03-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: treetop
@@ -178,7 +178,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
178
178
  version: '0'
179
179
  segments:
180
180
  - 0
181
- hash: 3489764594482913391
181
+ hash: -2248676907984019850
182
182
  required_rubygems_version: !ruby/object:Gem::Requirement
183
183
  none: false
184
184
  requirements: