biodiversity 1.0.5 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use ruby-1.9.2-p290@biodiversity --create
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Fetch gems over HTTPS so that downloads are encrypted and the server is
# authenticated (the plain-HTTP endpoint offers neither).
source "https://rubygems.org"

# Gems available in every environment.
gem "treetop"
gem "parallel"

# Gems only needed while developing the library.
group :development do
  gem "jeweler"
end

# Gems only needed to run and debug the specs.
group :test do
  # The gem is named ruby-debug19, but the file to require is "ruby-debug".
  gem "ruby-debug19", :require => "ruby-debug"
  gem "rspec"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,47 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ archive-tar-minitar (0.5.2)
5
+ columnize (0.3.4)
6
+ diff-lcs (1.1.3)
7
+ git (1.2.5)
8
+ jeweler (1.6.4)
9
+ bundler (~> 1.0)
10
+ git (>= 1.2.5)
11
+ rake
12
+ linecache19 (0.5.12)
13
+ ruby_core_source (>= 0.1.4)
14
+ parallel (0.5.9)
15
+ polyglot (0.3.3)
16
+ rake (0.9.2.2)
17
+ rspec (2.7.0)
18
+ rspec-core (~> 2.7.0)
19
+ rspec-expectations (~> 2.7.0)
20
+ rspec-mocks (~> 2.7.0)
21
+ rspec-core (2.7.1)
22
+ rspec-expectations (2.7.0)
23
+ diff-lcs (~> 1.1.2)
24
+ rspec-mocks (2.7.0)
25
+ ruby-debug-base19 (0.11.25)
26
+ columnize (>= 0.3.1)
27
+ linecache19 (>= 0.5.11)
28
+ ruby_core_source (>= 0.1.4)
29
+ ruby-debug19 (0.11.6)
30
+ columnize (>= 0.3.1)
31
+ linecache19 (>= 0.5.11)
32
+ ruby-debug-base19 (>= 0.11.19)
33
+ ruby_core_source (0.1.5)
34
+ archive-tar-minitar (>= 0.5.2)
35
+ treetop (1.4.10)
36
+ polyglot
37
+ polyglot (>= 0.3.1)
38
+
39
+ PLATFORMS
40
+ ruby
41
+
42
+ DEPENDENCIES
43
+ jeweler
44
+ parallel
45
+ rspec
46
+ ruby-debug19
47
+ treetop
data/README.rdoc CHANGED
@@ -85,6 +85,11 @@ You can use it as a library
85
85
  # to get detailed information about elements of the name
86
86
  parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
87
87
 
88
+ # to parse using several CPUs (4 seem to be optimal)
89
+ parser = ParallelParser.new # ParallelParser.new(4) will try to run 4 processes if hardware allows
90
+ array_of_names = ["Betula alba", "Homo sapiens", ...]
91
+ parser.parse(array_of_names) # -> {"Betula alba" => "{scientificName...}", "Homo sapiens" => "{scientificName...}", ...}
92
+
88
93
  # to resolve lsid and get back RDF file
89
94
  LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
90
95
 
data/Rakefile CHANGED
@@ -30,6 +30,7 @@ begin
30
30
  gem.bindir = 'bin'
31
31
  gem.executables = ['nnparse', 'parserver']
32
32
  gem.add_dependency('treetop')
33
+ gem.add_dependency('parallel')
33
34
  gem.add_dependency('json') if ruby_version < 19
34
35
  gem.add_development_dependency "rspec"
35
36
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.5
1
+ 1.0.8
@@ -23,6 +23,34 @@ module PreProcessor
23
23
  end
24
24
  end
25
25
 
26
# Parses a list of scientific names by handing each name to a
# ScientificNameParser inside Parallel.map, which spreads the work over
# several processes.
class ParallelParser

  # Chooses how many worker processes Parallel.map will be asked to use.
  #
  # processes_num - optional requested number of processes. It is coerced
  #                 with #to_i, so 4, "4" and 4.0 are all treated as 4.
  #                 A positive request is capped at cpu_num - 1; when it is
  #                 omitted (or not positive) cpu_num - 2 is used on machines
  #                 with more than 3 CPUs and 1 otherwise.
  def initialize(processes_num = nil)
    require 'parallel'
    requested = processes_num.to_i
    @processes_num =
      if requested > 0
        # Never drop below one process: on a single-CPU host cpu_num - 1
        # is 0, whereas the default branch below always yields at least 1.
        [[requested, cpu_num - 1].min, 1].max
      else
        cpu_num > 3 ? cpu_num - 2 : 1
      end
  end

  # Parses every distinct name in names_list.
  #
  # names_list - a collection responding to #uniq (e.g. an Array of Strings).
  #
  # Returns a Hash that maps each unique name to the JSON string produced
  # for it by parse_process.
  def parse(names_list)
    pairs = Parallel.map(names_list.uniq, :in_processes => @processes_num) do |name|
      [name, parse_process(name)]
    end
    Hash[pairs]
  end

  # Returns the processor count reported by Parallel.processor_count,
  # memoized per instance.
  def cpu_num
    @cpu_num ||= Parallel.processor_count
  end

  private

  # Parses a single name and serializes the result to JSON. If parsing or
  # serialization raises a StandardError, a JSON document marking the name
  # as unparsed is returned instead, so one bad name cannot abort the batch.
  def parse_process(name)
    parser = ScientificNameParser.new
    begin
      parser.parse(name).to_json
    rescue StandardError
      {'scientificName' => {'parsed' => false, 'verbatim' => name, 'error' => 'Parser error'}}.to_json
    end
  end
end
53
+
26
54
  # we can use these expressions when we are ready to parse virus names
27
55
  # class VirusParser
28
56
  # def initialize
@@ -403,7 +403,7 @@ grammar ScientificNameClean
403
403
  end
404
404
 
405
405
  rule rank
406
- ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
406
+ ("morph."/"f.sp."/"B"/"ssp."/"mut."/"nat"/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"α"
407
407
  /"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
408
408
  {
409
409
  def value
@@ -43,3 +43,31 @@ describe ScientificNameParser do
43
43
  parse('Nile virus')[:scientificName][:parser_version].should_not be_nil
44
44
  end
45
45
  end
46
+
47
+
48
describe ParallelParser do
  # Gathers the distinct names yielded by read_test_file, skipping entries
  # that carry no :name value.
  def unique_test_names
    collected = []
    read_test_file do |entry|
      name = entry[:name]
      collected.push(name) if name
    end
    collected.uniq
  end

  it "should find number of cpus" do
    ParallelParser.new.cpu_num.should > 0
  end

  it "should parse several names in parallel" do
    names = unique_test_names
    res = ParallelParser.new.parse(names)
    names.size.should > 100
    # One result entry is expected for every distinct input name.
    res.keys.size.should == names.size
  end

  it "should parse several names in parallel with given num of processes" do
    names = unique_test_names
    res = ParallelParser.new(4).parse(names)
    names.size.should > 100
    # One result entry is expected for every distinct input name.
    res.keys.size.should == names.size
  end
end
@@ -254,6 +254,8 @@ describe ScientificNameClean do
254
254
  pos(sn).should == {0=>["genus", 12], 13=>["species", 19], 25=>["infraspecies", 34], 35=>["author_word", 42], 44=>["author_word", 48], 49=>["author_word", 52], 53=>["year", 57]}
255
255
  sn = "Cassytha peninsularis J. Z. Weber var. flindersii"
256
256
  canonical(sn).should == "Cassytha peninsularis flindersii"
257
+ sn = "Prunus armeniaca convar. budae (Pénzes) Soó"
258
+ canonical(sn).should == "Prunus armeniaca budae"
257
259
  end
258
260
 
259
261
  it 'should parse unknown original authors (auct.)/(hort.)/(?)' do
@@ -109,6 +109,7 @@ Chlorocyperus glaber form. fasciculariforme (Lojac.) Soó|{"scientificName":{"pa
109
109
  Sphaerotheca fuliginea f. dahliae Movss. 1967|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Sphaerotheca fuliginea f. dahliae Movss. 1967","normalized":"Sphaerotheca fuliginea f. dahliae Movss. 1967","canonical":"Sphaerotheca fuliginea dahliae","hybrid":false,"details":[{"genus":{"string":"Sphaerotheca"},"species":{"string":"fuliginea"},"infraspecies":[{"string":"dahliae","rank":"f.","authorship":"Movss. 1967","basionymAuthorTeam":{"authorTeam":"Movss.","author":["Movss."],"year":"1967"}}]}],"positions":{"0":["genus",12],"16":["species",25],"36":["infraspecies",43],"47":["author_word",53],"58":["year",62]}}}
110
110
  Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay","normalized":"Polypodium vulgare nothosubsp. mantoniae (Rothm.) Schidlay","canonical":"Polypodium vulgare mantoniae","hybrid":false,"details":[{"genus":{"string":"Polypodium"},"species":{"string":"vulgare"},"infraspecies":[{"string":"mantoniae","rank":"nothosubsp.","authorship":"(Rothm.) Schidlay","combinationAuthorTeam":{"authorTeam":"Schidlay","author":["Schidlay"]},"basionymAuthorTeam":{"authorTeam":"Rothm.","author":["Rothm."]}}]}],"positions":{"0":["genus",10],"11":["species",18],"31":["infraspecies",40],"42":["author_word",48],"50":["author_word",58]}}}
111
111
  Allophylus amazonicus var amazonicus|{"scientificName":{"parsed":true, "verbatim":"Allophylus amazonicus var amazonicus", "normalized":"Allophylus amazonicus var amazonicus", "canonical":"Allophylus amazonicus amazonicus", "hybrid":false, "details":[{"genus":{"string":"Allophylus"}, "species":{"string":"amazonicus"}, "infraspecies":[{"string":"amazonicus", "rank":"var"}]}], "parser_version":"test_version", "parser_run":1, "positions":{"0":["genus", 10], "11":["species", 21], "26":["infraspecies", 36]}}}
112
+ Prunus armeniaca convar. budae (Pénzes) Soó|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Prunus armeniaca convar. budae (Pénzes) Soó", "normalized":"Prunus armeniaca convar. budae (Pénzes) Soó", "canonical":"Prunus armeniaca budae", "hybrid":false, "details":[{"genus":{"string":"Prunus"}, "species":{"string":"armeniaca"}, "infraspecies":[{"string":"budae", "rank":"convar.", "authorship":"(Pénzes) Soó", "combinationAuthorTeam":{"authorTeam":"Soó", "author":["Soó"]}, "basionymAuthorTeam":{"authorTeam":"Pénzes", "author":["Pénzes"]}}]}], "parser_run":1, "positions":{"0":["genus", 6], "7":["species", 16], "25":["infraspecies", 30], "32":["author_word", 38], "40":["author_word", 43]}}}
112
113
 
113
114
  #infraspecies_multiple
114
115
  Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall & D.E. Stuntz 1972|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall & D.E. Stuntz 1972","normalized":"Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall et D.E. Stuntz 1972","canonical":"Hydnellum scrobiculatum zonatum parvum","hybrid":false,"details":[{"genus":{"string":"Hydnellum"},"species":{"string":"scrobiculatum"},"infraspecies":[{"string":"zonatum","rank":"var."},{"string":"parvum","rank":"f.","authorship":"(Banker) D. Hall & D.E. Stuntz 1972","combinationAuthorTeam":{"authorTeam":"D. Hall & D.E. Stuntz","author":["D. Hall","D.E. Stuntz"],"year":"1972"},"basionymAuthorTeam":{"authorTeam":"Banker","author":["Banker"]}}]}],"positions":{"0":["genus",9],"10":["species",23],"29":["infraspecies",36],"40":["infraspecies",46],"48":["author_word",54],"56":["author_word",58],"59":["author_word",63],"66":["author_word",70],"71":["author_word",77],"78":["year",82]}}}
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biodiversity
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 7
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 5
10
- version: 1.0.5
9
+ - 8
10
+ version: 1.0.8
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dmitry Mozzherin
@@ -15,12 +15,24 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-08-09 00:00:00 Z
18
+ date: 2011-11-18 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
+ requirement: &id001 !ruby/object:Gem::Requirement
22
+ none: false
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ hash: 3
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ version_requirements: *id001
21
31
  name: treetop
22
32
  prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
33
+ type: :runtime
34
+ - !ruby/object:Gem::Dependency
35
+ requirement: &id002 !ruby/object:Gem::Requirement
24
36
  none: false
25
37
  requirements:
26
38
  - - ">="
@@ -29,12 +41,26 @@ dependencies:
29
41
  segments:
30
42
  - 0
31
43
  version: "0"
44
+ version_requirements: *id002
45
+ name: parallel
46
+ prerelease: false
32
47
  type: :runtime
33
- version_requirements: *id001
34
48
  - !ruby/object:Gem::Dependency
35
- name: json
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ hash: 3
55
+ segments:
56
+ - 0
57
+ version: "0"
58
+ version_requirements: *id003
59
+ name: jeweler
36
60
  prerelease: false
37
- requirement: &id002 !ruby/object:Gem::Requirement
61
+ type: :development
62
+ - !ruby/object:Gem::Dependency
63
+ requirement: &id004 !ruby/object:Gem::Requirement
38
64
  none: false
39
65
  requirements:
40
66
  - - ">="
@@ -43,12 +69,26 @@ dependencies:
43
69
  segments:
44
70
  - 0
45
71
  version: "0"
72
+ version_requirements: *id004
73
+ name: treetop
74
+ prerelease: false
46
75
  type: :runtime
47
- version_requirements: *id002
48
76
  - !ruby/object:Gem::Dependency
49
- name: rspec
77
+ requirement: &id005 !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ version_requirements: *id005
87
+ name: parallel
50
88
  prerelease: false
51
- requirement: &id003 !ruby/object:Gem::Requirement
89
+ type: :runtime
90
+ - !ruby/object:Gem::Dependency
91
+ requirement: &id006 !ruby/object:Gem::Requirement
52
92
  none: false
53
93
  requirements:
54
94
  - - ">="
@@ -57,8 +97,24 @@ dependencies:
57
97
  segments:
58
98
  - 0
59
99
  version: "0"
100
+ version_requirements: *id006
101
+ name: json
102
+ prerelease: false
103
+ type: :runtime
104
+ - !ruby/object:Gem::Dependency
105
+ requirement: &id007 !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ hash: 3
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ version_requirements: *id007
115
+ name: rspec
116
+ prerelease: false
60
117
  type: :development
61
- version_requirements: *id003
62
118
  description: Tools for biodiversity informatics
63
119
  email: dmozzherin@gmail.com
64
120
  executables:
@@ -71,6 +127,9 @@ extra_rdoc_files:
71
127
  - README.rdoc
72
128
  files:
73
129
  - .document
130
+ - .rvmrc
131
+ - Gemfile
132
+ - Gemfile.lock
74
133
  - LICENSE
75
134
  - README.rdoc
76
135
  - Rakefile
@@ -125,7 +184,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
184
  requirements: []
126
185
 
127
186
  rubyforge_project:
128
- rubygems_version: 1.8.6
187
+ rubygems_version: 1.8.10
129
188
  signing_key:
130
189
  specification_version: 3
131
190
  summary: Parser of scientific names