biodiversity 1.0.5 → 1.0.8

Sign up to get free protection for your applications and to get access to all the features.
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use ruby-1.9.2-p290@biodiversity --create
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "treetop"
4
+ gem "parallel"
5
+
6
+ group :development do
7
+ gem "jeweler"
8
+ end
9
+
10
+ group :test do
11
+ gem "ruby-debug19", :require => "ruby-debug"
12
+ gem "rspec"
13
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,47 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ archive-tar-minitar (0.5.2)
5
+ columnize (0.3.4)
6
+ diff-lcs (1.1.3)
7
+ git (1.2.5)
8
+ jeweler (1.6.4)
9
+ bundler (~> 1.0)
10
+ git (>= 1.2.5)
11
+ rake
12
+ linecache19 (0.5.12)
13
+ ruby_core_source (>= 0.1.4)
14
+ parallel (0.5.9)
15
+ polyglot (0.3.3)
16
+ rake (0.9.2.2)
17
+ rspec (2.7.0)
18
+ rspec-core (~> 2.7.0)
19
+ rspec-expectations (~> 2.7.0)
20
+ rspec-mocks (~> 2.7.0)
21
+ rspec-core (2.7.1)
22
+ rspec-expectations (2.7.0)
23
+ diff-lcs (~> 1.1.2)
24
+ rspec-mocks (2.7.0)
25
+ ruby-debug-base19 (0.11.25)
26
+ columnize (>= 0.3.1)
27
+ linecache19 (>= 0.5.11)
28
+ ruby_core_source (>= 0.1.4)
29
+ ruby-debug19 (0.11.6)
30
+ columnize (>= 0.3.1)
31
+ linecache19 (>= 0.5.11)
32
+ ruby-debug-base19 (>= 0.11.19)
33
+ ruby_core_source (0.1.5)
34
+ archive-tar-minitar (>= 0.5.2)
35
+ treetop (1.4.10)
36
+ polyglot
37
+ polyglot (>= 0.3.1)
38
+
39
+ PLATFORMS
40
+ ruby
41
+
42
+ DEPENDENCIES
43
+ jeweler
44
+ parallel
45
+ rspec
46
+ ruby-debug19
47
+ treetop
data/README.rdoc CHANGED
@@ -85,6 +85,11 @@ You can use it as a library
85
85
  # to get detailed information about elements of the name
86
86
  parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
87
87
 
88
+ # to parse using several CPUs (4 seem to be optimal)
89
+ parser = ParallelParser.new # ParallelParser.new(4) will try to run 4 processes if hardware allows
90
+ array_of_names = ["Betula alba", "Homo sapiens", ...]
91
+ parser.parse(array_of_names) # -> {"Betula alba" => "{scientificName...}", "Homo sapiens" => "{scientificName...}", ...}
92
+
88
93
  # to resolve lsid and get back RDF file
89
94
  LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
90
95
 
data/Rakefile CHANGED
@@ -30,6 +30,7 @@ begin
30
30
  gem.bindir = 'bin'
31
31
  gem.executables = ['nnparse', 'parserver']
32
32
  gem.add_dependency('treetop')
33
+ gem.add_dependency('parallel')
33
34
  gem.add_dependency('json') if ruby_version < 19
34
35
  gem.add_development_dependency "rspec"
35
36
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.5
1
+ 1.0.8
@@ -23,6 +23,34 @@ module PreProcessor
23
23
  end
24
24
  end
25
25
 
26
# Parses a list of scientific names with several worker processes, using
# the +parallel+ gem to distribute the work, and returns the results as JSON.
class ParallelParser

  # processes_num - requested number of worker processes. Anything that
  #                 responds to +to_i+ is accepted (Integer, numeric String,
  #                 Float, nil). A value whose +to_i+ is not positive selects
  #                 the default described below.
  #
  # An explicit request is capped at one less than the number of CPUs and is
  # never lower than 1. By default two CPUs are left free when more than
  # three are available; otherwise a single process is used.
  def initialize(processes_num = nil)
    # Required here rather than at file load, as in the original code.
    require 'parallel'
    # Coerce once, so that the comparison below always sees an Integer
    # (a String such as "4" used to pass the guard and then break +min+).
    requested = processes_num.to_i
    @processes_num =
      if requested > 0
        # Lower bound of 1 keeps a single-CPU host consistent with the
        # one-process fallback of the default branch.
        [[requested, cpu_num - 1].min, 1].max
      else
        cpu_num > 3 ? cpu_num - 2 : 1
      end
  end

  # names_list - Array of name strings; duplicates are parsed only once.
  #
  # Returns a Hash that maps every unique name to the JSON string produced
  # for it.
  def parse(names_list)
    pairs = Parallel.map(names_list.uniq, :in_processes => @processes_num) do |name|
      [name, parse_process(name)]
    end
    pairs.each_with_object({}) { |(name, json), result| result[name] = json }
  end

  # Returns the processor count reported by Parallel, memoized per instance.
  def cpu_num
    @cpu_num ||= Parallel.processor_count
  end

  private

  # Parses one name and returns the result as a JSON string. Any
  # StandardError raised while parsing or serializing is deliberately turned
  # into a "Parser error" JSON record, so one bad name cannot abort the batch.
  def parse_process(name)
    parser = ScientificNameParser.new
    begin
      parser.parse(name).to_json
    rescue StandardError
      {'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
    end
  end
end
53
+
26
54
  # we can use these expressions when we are ready to parse virus names
27
55
  # class VirusParser
28
56
  # def initialize
@@ -403,7 +403,7 @@ grammar ScientificNameClean
403
403
  end
404
404
 
405
405
  rule rank
406
- ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
406
+ ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
407
407
  /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
408
408
  {
409
409
  def value
@@ -43,3 +43,31 @@ describe ScientificNameParser do
43
43
  parse('Nile virus')[:scientificName][:parser_version].should_not be_nil
44
44
  end
45
45
  end
46
+
47
+
48
describe ParallelParser do
  # Gathers the :name value of every entry yielded by read_test_file that
  # has one, without duplicates.
  def unique_test_names
    collected = []
    read_test_file { |entry| collected << entry[:name] if entry[:name] }
    collected.uniq
  end

  it "should find number of cpus" do
    ParallelParser.new.cpu_num.should > 0
  end

  it "should parse several names in parallel" do
    names = unique_test_names
    res = ParallelParser.new.parse(names)
    names.size.should > 100
    # every unique input name must come back as exactly one key
    res.keys.size.should == names.size
  end

  it "should parse several names in parallel with given num of processes" do
    names = unique_test_names
    res = ParallelParser.new(4).parse(names)
    names.size.should > 100
    # every unique input name must come back as exactly one key
    res.keys.size.should == names.size
  end
end
@@ -254,6 +254,8 @@ describe ScientificNameClean do
254
254
  pos(sn).should == {0=>["genus", 12], 13=>["species", 19], 25=>["infraspecies", 34], 35=>["author_word", 42], 44=>["author_word", 48], 49=>["author_word", 52], 53=>["year", 57]}
255
255
  sn = "Cassytha peninsularis J. Z. Weber var. flindersii"
256
256
  canonical(sn).should == "Cassytha peninsularis flindersii"
257
+ sn = "Prunus armeniaca convar. budae (Pénzes) Soó"
258
+ canonical(sn).should == "Prunus armeniaca budae"
257
259
  end
258
260
 
259
261
  it 'should parse unknown original authors (auct.)/(hort.)/(?)' do
@@ -109,6 +109,7 @@ Chlorocyperus glaber form. fasciculariforme (Lojac.) Soó|{"scientificName":{"pa
109
109
  Sphaerotheca fuliginea f. dahliae Movss. 1967|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Sphaerotheca fuliginea f. dahliae Movss. 1967","normalized":"Sphaerotheca fuliginea f. dahliae Movss. 1967","canonical":"Sphaerotheca fuliginea dahliae","hybrid":false,"details":[{"genus":{"string":"Sphaerotheca"},"species":{"string":"fuliginea"},"infraspecies":[{"string":"dahliae","rank":"f.","authorship":"Movss. 1967","basionymAuthorTeam":{"authorTeam":"Movss.","author":["Movss."],"year":"1967"}}]}],"positions":{"0":["genus",12],"16":["species",25],"36":["infraspecies",43],"47":["author_word",53],"58":["year",62]}}}
110
110
  Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay","normalized":"Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay","canonical":"Polypodium vulgare mantoniae","hybrid":false,"details":[{"genus":{"string":"Polypodium"},"species":{"string":"vulgare"},"infraspecies":[{"string":"mantoniae","rank":"nothosubsp.","authorship":"(Rothm.) Schidlay","combinationAuthorTeam":{"authorTeam":"Schidlay","author":["Schidlay"]},"basionymAuthorTeam":{"authorTeam":"Rothm.","author":["Rothm."]}}]}],"positions":{"0":["genus",10],"11":["species",18],"31":["infraspecies",40],"42":["author_word",48],"50":["author_word",58]}}}
111
111
  Allophylus amazonicus var amazonicus|{"scientificName":{"parsed":true, "verbatim":"Allophylus amazonicus var amazonicus", "normalized":"Allophylus amazonicus var amazonicus", "canonical":"Allophylus amazonicus amazonicus", "hybrid":false, "details":[{"genus":{"string":"Allophylus"}, "species":{"string":"amazonicus"}, "infraspecies":[{"string":"amazonicus", "rank":"var"}]}], "parser_version":"test_version", "parser_run":1, "positions":{"0":["genus", 10], "11":["species", 21], "26":["infraspecies", 36]}}}
112
+ Prunus armeniaca convar. budae (Pénzes) Soó|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Prunus armeniaca convar. budae (Pénzes) Soó", "normalized":"Prunus armeniaca convar. budae (Pénzes) Soó", "canonical":"Prunus armeniaca budae", "hybrid":false, "details":[{"genus":{"string":"Prunus"}, "species":{"string":"armeniaca"}, "infraspecies":[{"string":"budae", "rank":"convar.", "authorship":"(Pénzes) Soó", "combinationAuthorTeam":{"authorTeam":"Soó", "author":["Soó"]}, "basionymAuthorTeam":{"authorTeam":"Pénzes", "author":["Pénzes"]}}]}], "parser_run":1, "positions":{"0":["genus", 6], "7":["species", 16], "25":["infraspecies", 30], "32":["author_word", 38], "40":["author_word", 43]}}}
112
113
 
113
114
  #infraspecies_multiple
114
115
  Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall & D.E. Stuntz 1972|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall & D.E. Stuntz 1972","normalized":"Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall et D.E. Stuntz 1972","canonical":"Hydnellum scrobiculatum zonatum parvum","hybrid":false,"details":[{"genus":{"string":"Hydnellum"},"species":{"string":"scrobiculatum"},"infraspecies":[{"string":"zonatum","rank":"var."},{"string":"parvum","rank":"f.","authorship":"(Banker) D. Hall & D.E. Stuntz 1972","combinationAuthorTeam":{"authorTeam":"D. Hall & D.E. Stuntz","author":["D. Hall","D.E. Stuntz"],"year":"1972"},"basionymAuthorTeam":{"authorTeam":"Banker","author":["Banker"]}}]}],"positions":{"0":["genus",9],"10":["species",23],"29":["infraspecies",36],"40":["infraspecies",46],"48":["author_word",54],"56":["author_word",58],"59":["author_word",63],"66":["author_word",70],"71":["author_word",77],"78":["year",82]}}}
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biodiversity
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 7
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 5
10
- version: 1.0.5
9
+ - 8
10
+ version: 1.0.8
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dmitry Mozzherin
@@ -15,12 +15,24 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-08-09 00:00:00 Z
18
+ date: 2011-11-18 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
+ requirement: &id001 !ruby/object:Gem::Requirement
22
+ none: false
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ hash: 3
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ version_requirements: *id001
21
31
  name: treetop
22
32
  prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
33
+ type: :runtime
34
+ - !ruby/object:Gem::Dependency
35
+ requirement: &id002 !ruby/object:Gem::Requirement
24
36
  none: false
25
37
  requirements:
26
38
  - - ">="
@@ -29,12 +41,26 @@ dependencies:
29
41
  segments:
30
42
  - 0
31
43
  version: "0"
44
+ version_requirements: *id002
45
+ name: parallel
46
+ prerelease: false
32
47
  type: :runtime
33
- version_requirements: *id001
34
48
  - !ruby/object:Gem::Dependency
35
- name: json
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ hash: 3
55
+ segments:
56
+ - 0
57
+ version: "0"
58
+ version_requirements: *id003
59
+ name: jeweler
36
60
  prerelease: false
37
- requirement: &id002 !ruby/object:Gem::Requirement
61
+ type: :development
62
+ - !ruby/object:Gem::Dependency
63
+ requirement: &id004 !ruby/object:Gem::Requirement
38
64
  none: false
39
65
  requirements:
40
66
  - - ">="
@@ -43,12 +69,26 @@ dependencies:
43
69
  segments:
44
70
  - 0
45
71
  version: "0"
72
+ version_requirements: *id004
73
+ name: treetop
74
+ prerelease: false
46
75
  type: :runtime
47
- version_requirements: *id002
48
76
  - !ruby/object:Gem::Dependency
49
- name: rspec
77
+ requirement: &id005 !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ version_requirements: *id005
87
+ name: parallel
50
88
  prerelease: false
51
- requirement: &id003 !ruby/object:Gem::Requirement
89
+ type: :runtime
90
+ - !ruby/object:Gem::Dependency
91
+ requirement: &id006 !ruby/object:Gem::Requirement
52
92
  none: false
53
93
  requirements:
54
94
  - - ">="
@@ -57,8 +97,24 @@ dependencies:
57
97
  segments:
58
98
  - 0
59
99
  version: "0"
100
+ version_requirements: *id006
101
+ name: json
102
+ prerelease: false
103
+ type: :runtime
104
+ - !ruby/object:Gem::Dependency
105
+ requirement: &id007 !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ hash: 3
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ version_requirements: *id007
115
+ name: rspec
116
+ prerelease: false
60
117
  type: :development
61
- version_requirements: *id003
62
118
  description: Tools for biodiversity informatics
63
119
  email: dmozzherin@gmail.com
64
120
  executables:
@@ -71,6 +127,9 @@ extra_rdoc_files:
71
127
  - README.rdoc
72
128
  files:
73
129
  - .document
130
+ - .rvmrc
131
+ - Gemfile
132
+ - Gemfile.lock
74
133
  - LICENSE
75
134
  - README.rdoc
76
135
  - Rakefile
@@ -125,7 +184,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
184
  requirements: []
126
185
 
127
186
  rubyforge_project:
128
- rubygems_version: 1.8.6
187
+ rubygems_version: 1.8.10
129
188
  signing_key:
130
189
  specification_version: 3
131
190
  summary: Parser of scientific names