biodiversity 1.0.5 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rvmrc +1 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +47 -0
- data/README.rdoc +5 -0
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/lib/biodiversity/parser.rb +28 -0
- data/lib/biodiversity/parser/scientific_name_clean.treetop +1 -1
- data/spec/parser/scientific_name.spec.rb +28 -0
- data/spec/parser/scientific_name_clean.spec.rb +2 -0
- data/spec/parser/test_data.txt +1 -0
- metadata +72 -13
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use ruby-1.9.2-p290@biodiversity --create
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
archive-tar-minitar (0.5.2)
|
5
|
+
columnize (0.3.4)
|
6
|
+
diff-lcs (1.1.3)
|
7
|
+
git (1.2.5)
|
8
|
+
jeweler (1.6.4)
|
9
|
+
bundler (~> 1.0)
|
10
|
+
git (>= 1.2.5)
|
11
|
+
rake
|
12
|
+
linecache19 (0.5.12)
|
13
|
+
ruby_core_source (>= 0.1.4)
|
14
|
+
parallel (0.5.9)
|
15
|
+
polyglot (0.3.3)
|
16
|
+
rake (0.9.2.2)
|
17
|
+
rspec (2.7.0)
|
18
|
+
rspec-core (~> 2.7.0)
|
19
|
+
rspec-expectations (~> 2.7.0)
|
20
|
+
rspec-mocks (~> 2.7.0)
|
21
|
+
rspec-core (2.7.1)
|
22
|
+
rspec-expectations (2.7.0)
|
23
|
+
diff-lcs (~> 1.1.2)
|
24
|
+
rspec-mocks (2.7.0)
|
25
|
+
ruby-debug-base19 (0.11.25)
|
26
|
+
columnize (>= 0.3.1)
|
27
|
+
linecache19 (>= 0.5.11)
|
28
|
+
ruby_core_source (>= 0.1.4)
|
29
|
+
ruby-debug19 (0.11.6)
|
30
|
+
columnize (>= 0.3.1)
|
31
|
+
linecache19 (>= 0.5.11)
|
32
|
+
ruby-debug-base19 (>= 0.11.19)
|
33
|
+
ruby_core_source (0.1.5)
|
34
|
+
archive-tar-minitar (>= 0.5.2)
|
35
|
+
treetop (1.4.10)
|
36
|
+
polyglot
|
37
|
+
polyglot (>= 0.3.1)
|
38
|
+
|
39
|
+
PLATFORMS
|
40
|
+
ruby
|
41
|
+
|
42
|
+
DEPENDENCIES
|
43
|
+
jeweler
|
44
|
+
parallel
|
45
|
+
rspec
|
46
|
+
ruby-debug19
|
47
|
+
treetop
|
data/README.rdoc
CHANGED
@@ -85,6 +85,11 @@ You can use it as a library
|
|
85
85
|
# to get detailed information about elements of the name
|
86
86
|
parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
|
87
87
|
|
88
|
+
# to parse using several CPUs (4 seem to be optimal)
|
89
|
+
parser = ParallelParser.new # ParallelParser.new(4) will try to run 4 processes if hardware allows
|
90
|
+
array_of_names = ["Betula alba", "Homo sapiens"....]
|
91
|
+
parser.parse(array_of_names) # -> {"Betula alba" => "{scientificName...}", "Homo sapiens" => "{scientificName...}", ...}
|
92
|
+
|
88
93
|
# to resolve lsid and get back RDF file
|
89
94
|
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
90
95
|
|
data/Rakefile
CHANGED
@@ -30,6 +30,7 @@ begin
|
|
30
30
|
gem.bindir = 'bin'
|
31
31
|
gem.executables = ['nnparse', 'parserver']
|
32
32
|
gem.add_dependency('treetop')
|
33
|
+
gem.add_dependency('parallel')
|
33
34
|
gem.add_dependency('json') if ruby_version < 19
|
34
35
|
gem.add_development_dependency "rspec"
|
35
36
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.5
|
1
|
+
1.0.8
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -23,6 +23,34 @@ module PreProcessor
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
+
class ParallelParser
|
27
|
+
|
28
|
+
def initialize(processes_num = nil)
|
29
|
+
require 'parallel'
|
30
|
+
cpu_num
|
31
|
+
if processes_num.to_i > 0
|
32
|
+
@processes_num = [processes_num, cpu_num - 1].min
|
33
|
+
else
|
34
|
+
@processes_num = cpu_num > 3 ? cpu_num - 2 : 1
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def parse(names_list)
|
39
|
+
parsed = Parallel.map(names_list.uniq, :in_processes => @processes_num) { |n| [n, parse_process(n)] }
|
40
|
+
parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
|
41
|
+
end
|
42
|
+
|
43
|
+
def cpu_num
|
44
|
+
@cpu_num ||= Parallel.processor_count
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
def parse_process(name)
|
49
|
+
p = ScientificNameParser.new
|
50
|
+
p.parse(name).to_json rescue {'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
26
54
|
# we can use these expressions when we are ready to parse virus names
|
27
55
|
# class VirusParser
|
28
56
|
# def initialize
|
@@ -403,7 +403,7 @@ grammar ScientificNameClean
|
|
403
403
|
end
|
404
404
|
|
405
405
|
rule rank
|
406
|
-
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
|
406
|
+
("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
|
407
407
|
/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
|
408
408
|
{
|
409
409
|
def value
|
@@ -43,3 +43,31 @@ describe ScientificNameParser do
|
|
43
43
|
parse('Nile virus')[:scientificName][:parser_version].should_not be_nil
|
44
44
|
end
|
45
45
|
end
|
46
|
+
|
47
|
+
|
48
|
+
describe ParallelParser do
|
49
|
+
it "should find number of cpus" do
|
50
|
+
pparser = ParallelParser.new
|
51
|
+
pparser.cpu_num.should > 0
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should parse several names in parallel" do
|
55
|
+
names = []
|
56
|
+
read_test_file { |n| names << (n[:name]) if n[:name] }
|
57
|
+
names.uniq!
|
58
|
+
pparser = ParallelParser.new
|
59
|
+
res = pparser.parse(names)
|
60
|
+
names.size.should > 100
|
61
|
+
res.keys.size.should == names.size
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should parse several names in parallel with given num of processes" do
|
65
|
+
names = []
|
66
|
+
read_test_file { |n| names << (n[:name]) if n[:name] }
|
67
|
+
names.uniq!
|
68
|
+
pparser = ParallelParser.new(4)
|
69
|
+
res = pparser.parse(names)
|
70
|
+
names.size.should > 100
|
71
|
+
res.keys.size.should == names.size
|
72
|
+
end
|
73
|
+
end
|
@@ -254,6 +254,8 @@ describe ScientificNameClean do
|
|
254
254
|
pos(sn).should == {0=>["genus", 12], 13=>["species", 19], 25=>["infraspecies", 34], 35=>["author_word", 42], 44=>["author_word", 48], 49=>["author_word", 52], 53=>["year", 57]}
|
255
255
|
sn = "Cassytha peninsularis J. Z. Weber var. flindersii"
|
256
256
|
canonical(sn).should == "Cassytha peninsularis flindersii"
|
257
|
+
sn = "Prunus armeniaca convar. budae (Pénzes) Soó"
|
258
|
+
canonical(sn).should == "Prunus armeniaca budae"
|
257
259
|
end
|
258
260
|
|
259
261
|
it 'should parse unknown original authors (auct.)/(hort.)/(?)' do
|
data/spec/parser/test_data.txt
CHANGED
@@ -109,6 +109,7 @@ Chlorocyperus glaber form. fasciculariforme (Lojac.) Soó|{"scientificName":{"pa
|
|
109
109
|
Sphaerotheca fuliginea f. dahliae Movss. 1967|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Sphaerotheca fuliginea f. dahliae Movss. 1967","normalized":"Sphaerotheca fuliginea f. dahliae Movss. 1967","canonical":"Sphaerotheca fuliginea dahliae","hybrid":false,"details":[{"genus":{"string":"Sphaerotheca"},"species":{"string":"fuliginea"},"infraspecies":[{"string":"dahliae","rank":"f.","authorship":"Movss. 1967","basionymAuthorTeam":{"authorTeam":"Movss.","author":["Movss."],"year":"1967"}}]}],"positions":{"0":["genus",12],"16":["species",25],"36":["infraspecies",43],"47":["author_word",53],"58":["year",62]}}}
|
110
110
|
Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay","normalized":"Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay","canonical":"Polypodium vulgare mantoniae","hybrid":false,"details":[{"genus":{"string":"Polypodium"},"species":{"string":"vulgare"},"infraspecies":[{"string":"mantoniae","rank":"nothosubsp.","authorship":"(Rothm.) Schidlay","combinationAuthorTeam":{"authorTeam":"Schidlay","author":["Schidlay"]},"basionymAuthorTeam":{"authorTeam":"Rothm.","author":["Rothm."]}}]}],"positions":{"0":["genus",10],"11":["species",18],"31":["infraspecies",40],"42":["author_word",48],"50":["author_word",58]}}}
|
111
111
|
Allophylus amazonicus var amazonicus|{"scientificName":{"parsed":true, "verbatim":"Allophylus amazonicus var amazonicus", "normalized":"Allophylus amazonicus var amazonicus", "canonical":"Allophylus amazonicus amazonicus", "hybrid":false, "details":[{"genus":{"string":"Allophylus"}, "species":{"string":"amazonicus"}, "infraspecies":[{"string":"amazonicus", "rank":"var"}]}], "parser_version":"test_version", "parser_run":1, "positions":{"0":["genus", 10], "11":["species", 21], "26":["infraspecies", 36]}}}
|
112
|
+
Prunus armeniaca convar. budae (Pénzes) Soó|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Prunus armeniaca convar. budae (Pénzes) Soó", "normalized":"Prunus armeniaca convar. budae (Pénzes) Soó", "canonical":"Prunus armeniaca budae", "hybrid":false, "details":[{"genus":{"string":"Prunus"}, "species":{"string":"armeniaca"}, "infraspecies":[{"string":"budae", "rank":"convar.", "authorship":"(Pénzes) Soó", "combinationAuthorTeam":{"authorTeam":"Soó", "author":["Soó"]}, "basionymAuthorTeam":{"authorTeam":"Pénzes", "author":["Pénzes"]}}]}], "parser_run":1, "positions":{"0":["genus", 6], "7":["species", 16], "25":["infraspecies", 30], "32":["author_word", 38], "40":["author_word", 43]}}}
|
112
113
|
|
113
114
|
#infraspecies_multiple
|
114
115
|
Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall & D.E. Stuntz 1972|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall & D.E. Stuntz 1972","normalized":"Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall et D.E. Stuntz 1972","canonical":"Hydnellum scrobiculatum zonatum parvum","hybrid":false,"details":[{"genus":{"string":"Hydnellum"},"species":{"string":"scrobiculatum"},"infraspecies":[{"string":"zonatum","rank":"var."},{"string":"parvum","rank":"f.","authorship":"(Banker) D. Hall & D.E. Stuntz 1972","combinationAuthorTeam":{"authorTeam":"D. Hall & D.E. Stuntz","author":["D. Hall","D.E. Stuntz"],"year":"1972"},"basionymAuthorTeam":{"authorTeam":"Banker","author":["Banker"]}}]}],"positions":{"0":["genus",9],"10":["species",23],"29":["infraspecies",36],"40":["infraspecies",46],"48":["author_word",54],"56":["author_word",58],"59":["author_word",63],"66":["author_word",70],"71":["author_word",77],"78":["year",82]}}}
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biodiversity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 7
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 0
|
9
|
-
- 5
|
10
|
-
version: 1.0.5
|
9
|
+
- 8
|
10
|
+
version: 1.0.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dmitry Mozzherin
|
@@ -15,12 +15,24 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-11-18 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
22
|
+
none: false
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
hash: 3
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
version_requirements: *id001
|
21
31
|
name: treetop
|
22
32
|
prerelease: false
|
23
|
-
|
33
|
+
type: :runtime
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
24
36
|
none: false
|
25
37
|
requirements:
|
26
38
|
- - ">="
|
@@ -29,12 +41,26 @@ dependencies:
|
|
29
41
|
segments:
|
30
42
|
- 0
|
31
43
|
version: "0"
|
44
|
+
version_requirements: *id002
|
45
|
+
name: parallel
|
46
|
+
prerelease: false
|
32
47
|
type: :runtime
|
33
|
-
version_requirements: *id001
|
34
48
|
- !ruby/object:Gem::Dependency
|
35
|
-
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
hash: 3
|
55
|
+
segments:
|
56
|
+
- 0
|
57
|
+
version: "0"
|
58
|
+
version_requirements: *id003
|
59
|
+
name: jeweler
|
36
60
|
prerelease: false
|
37
|
-
|
61
|
+
type: :development
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
38
64
|
none: false
|
39
65
|
requirements:
|
40
66
|
- - ">="
|
@@ -43,12 +69,26 @@ dependencies:
|
|
43
69
|
segments:
|
44
70
|
- 0
|
45
71
|
version: "0"
|
72
|
+
version_requirements: *id004
|
73
|
+
name: treetop
|
74
|
+
prerelease: false
|
46
75
|
type: :runtime
|
47
|
-
version_requirements: *id002
|
48
76
|
- !ruby/object:Gem::Dependency
|
49
|
-
|
77
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
86
|
+
version_requirements: *id005
|
87
|
+
name: parallel
|
50
88
|
prerelease: false
|
51
|
-
|
89
|
+
type: :runtime
|
90
|
+
- !ruby/object:Gem::Dependency
|
91
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
52
92
|
none: false
|
53
93
|
requirements:
|
54
94
|
- - ">="
|
@@ -57,8 +97,24 @@ dependencies:
|
|
57
97
|
segments:
|
58
98
|
- 0
|
59
99
|
version: "0"
|
100
|
+
version_requirements: *id006
|
101
|
+
name: json
|
102
|
+
prerelease: false
|
103
|
+
type: :runtime
|
104
|
+
- !ruby/object:Gem::Dependency
|
105
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
hash: 3
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
version_requirements: *id007
|
115
|
+
name: rspec
|
116
|
+
prerelease: false
|
60
117
|
type: :development
|
61
|
-
version_requirements: *id003
|
62
118
|
description: Tools for biodiversity informatics
|
63
119
|
email: dmozzherin@gmail.com
|
64
120
|
executables:
|
@@ -71,6 +127,9 @@ extra_rdoc_files:
|
|
71
127
|
- README.rdoc
|
72
128
|
files:
|
73
129
|
- .document
|
130
|
+
- .rvmrc
|
131
|
+
- Gemfile
|
132
|
+
- Gemfile.lock
|
74
133
|
- LICENSE
|
75
134
|
- README.rdoc
|
76
135
|
- Rakefile
|
@@ -125,7 +184,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
125
184
|
requirements: []
|
126
185
|
|
127
186
|
rubyforge_project:
|
128
|
-
rubygems_version: 1.8.
|
187
|
+
rubygems_version: 1.8.10
|
129
188
|
signing_key:
|
130
189
|
specification_version: 3
|
131
190
|
summary: Parser of scientific names
|