biodiversity 3.5.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +9 -6
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +1 -6
  6. data/CHANGELOG +3 -0
  7. data/Gemfile +2 -0
  8. data/README.md +37 -178
  9. data/Rakefile +15 -48
  10. data/biodiversity.gemspec +18 -21
  11. data/clib/linux/libgnparser.h +93 -0
  12. data/clib/linux/libgnparser.so +0 -0
  13. data/clib/mac/libgnparser.h +93 -0
  14. data/clib/mac/libgnparser.so +0 -0
  15. data/lib/biodiversity.rb +4 -9
  16. data/lib/biodiversity/parser.rb +65 -281
  17. data/lib/biodiversity/version.rb +8 -1
  18. data/spec/lib/biodiversity_spec.rb +9 -0
  19. data/spec/lib/parser_spec.rb +38 -0
  20. data/spec/spec_helper.rb +4 -81
  21. metadata +27 -102
  22. data/.byebug_history +0 -18
  23. data/.document +0 -5
  24. data/examples/socket_client.rb +0 -25
  25. data/lib/biodiversity/guid.rb +0 -1
  26. data/lib/biodiversity/guid/lsid.rb +0 -16
  27. data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
  28. data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
  29. data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
  30. data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
  31. data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
  32. data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
  33. data/spec/biodiversity_spec.rb +0 -11
  34. data/spec/files/test_data.txt +0 -490
  35. data/spec/files/todo.txt +0 -55
  36. data/spec/guid/lsid.spec.rb +0 -15
  37. data/spec/parser/scientific_name_canonical_spec.rb +0 -36
  38. data/spec/parser/scientific_name_clean_spec.rb +0 -1137
  39. data/spec/parser/scientific_name_dirty_spec.rb +0 -165
  40. data/spec/parser/scientific_name_spec.rb +0 -193
@@ -1,165 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- describe ScientificNameDirty do
4
- before(:all) do
5
- set_parser(ScientificNameDirtyParser.new)
6
- end
7
-
8
- it 'parses clean names' do
9
- expect(parse('Betula verucosa (L.) Bar. 1899')).to_not be_nil
10
- end
11
-
12
- it 'parses double parenthesis' do
13
- sn = 'Eichornia crassipes ( (Martius) ) Solms-Laub.'
14
- expect(parse(sn)).to_not be_nil
15
- expect(value(sn)).to eq 'Eichornia crassipes (Martius) Solms-Laub.'
16
- expect(details(sn)).to eq [{ genus: { string: 'Eichornia' },
17
- species: { string: 'crassipes',
18
- authorship: '( (Martius) ) Solms-Laub.',
19
- combinationAuthorTeam: { authorTeam: 'Solms-Laub.',
20
- author: ['Solms-Laub.'] },
21
- basionymAuthorTeam: { authorTeam: 'Martius',
22
- author: ['Martius'] } } }]
23
- expect(pos(sn)).to eq({ 0 => ['genus', 9], 10 => ['species', 19],
24
- 23 => ['author_word', 30],
25
- 34 => ['author_word', 45] })
26
- end
27
-
28
- it 'parses year without author' do
29
- sn = 'Acarospora cratericola 1929'
30
- expect(parse(sn)).to_not be_nil
31
- expect(pos(sn)).to eq({ 0 => ['genus', 10],
32
- 11 => ['species', 22], 23 => ['year', 27] })
33
- expect(details(sn)).to eq [{ genus: { string: 'Acarospora' },
34
- species: { string: 'cratericola',
35
- year: '1929' } }]
36
- end
37
-
38
- it 'parses double years' do
39
- sn = 'Tridentella tangeroae Bruce, 1987-92'
40
- expect(parse(sn)).to_not be_nil
41
- expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
42
- 22 => ['author_word', 27], 29 => ['year', 36] })
43
- expect(details(sn)).to eq [{ genus: { string: 'Tridentella' },
44
- species: { string: 'tangeroae', authorship: 'Bruce, 1987-92',
45
- basionymAuthorTeam: { authorTeam: 'Bruce', author: ['Bruce'],
46
- year: '1987-92' } } }]
47
- end
48
-
49
- it 'parses dirty years' do
50
- expect(parse('Tridentella tangeroae Bruce, 1988B')).to_not be_nil
51
- expect(parse('Tridentella tangeroae Bruce, 1988b')).to_not be_nil
52
- expect(parse('Tridentella tangeroae Bruce, 1988d')).to_not be_nil
53
- sn = 'Tridentella tangeroae Bruce, 198?'
54
- expect(parse(sn)).to_not be_nil
55
- expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
56
- 22 => ['author_word', 27], 29 => ['year', 33] })
57
- end
58
-
59
- it 'parses year with page number' do
60
- sn = 'Gymnodactylus irregularis WERMUTH 1965: 54'
61
- expect(parse(sn)).to_not be_nil
62
- expect(value(sn)).to eq 'Gymnodactylus irregularis Wermuth 1965'
63
- expect(details(sn)).to eq [{ genus: { string: 'Gymnodactylus' },
64
- species: { string: 'irregularis', authorship: 'WERMUTH 1965: 54',
65
- basionymAuthorTeam: { authorTeam: 'WERMUTH', author: ['Wermuth'],
66
- year: '1965' } } }]
67
- expect(pos(sn)).to eq({ 0 => ['genus', 13], 14 => ['species', 25],
68
- 26 => ['author_word', 33], 34 => ['year', 38] })
69
- end
70
-
71
- it 'parses year with []' do
72
- expect(parse('Anthoscopus Cabanis [1851]')).to_not be_nil
73
- expect(value('Anthoscopus Cabanis [185?]')).
74
- to eq 'Anthoscopus Cabanis (185?)'
75
- expect(parse('Anthoscopus Cabanis [1851?]')).to_not be_nil
76
- expect(value('Anthoscopus Cabanis [1851]')).
77
- to eq 'Anthoscopus Cabanis (1851)'
78
- sn = 'Anthoscopus Cabanis [1851?]'
79
- expect(value(sn)).to eq 'Anthoscopus Cabanis (1851?)'
80
- expect(details(sn)).to eq [{ uninomial: { string: 'Anthoscopus',
81
- authorship: 'Cabanis [1851?]', basionymAuthorTeam:
82
- { authorTeam: 'Cabanis', author: ['Cabanis'],
83
- approximate_year: '(1851?)' } } }]
84
- expect(pos(sn)).to eq({ 0 => ['uninomial', 11],
85
- 12 => ['author_word', 19], 21 => ['year', 26] })
86
- sn = 'Trismegistia monodii Ando, 1973 [1974]'
87
- expect(parse(sn)).to_not be_nil
88
-
89
- #should it be 'Trismegistia monodii Ando 1973 (1974)' instead?
90
- expect(value(sn)).to eq 'Trismegistia monodii Ando 1973 (1974)'
91
-
92
- expect(details(sn)).to eq [{ genus: { string: 'Trismegistia' },
93
- species: { string: 'monodii', authorship: 'Ando, 1973 [1974]',
94
- basionymAuthorTeam: { authorTeam: 'Ando', author: ['Ando'],
95
- year: '1973', approximate_year: '(1974)' } } }]
96
- expect(pos(sn)).to eq({ 0 => ['genus', 12], 13 => ['species', 20],
97
- 21 => ['author_word', 25], 27 => ['year', 31], 33 => ['year', 37] })
98
- expect(parse('Zygaena witti Wiegel [1973]')).to_not be_nil
99
- sn = 'Deyeuxia coarctata Kunth, 1815 [1816]'
100
- expect(parse(sn)).to_not be_nil
101
- expect(pos(sn)).to eq({ 0 => ['genus', 8], 9 => ['species', 18],
102
- 19 => ['author_word', 24], 26 => ['year', 30],
103
- 32 => ['year', 36] })
104
- end
105
-
106
- it 'parses new stuff' do
107
- sn = 'Zoropsis (TaKeoa) nishimurai Yaginuma, 1971' #skipping for now
108
- sn = 'Campylobacter pyloridis Marshall et al.1985.'
109
- expect(details(sn)).to eq [{ genus: { string: 'Campylobacter' },
110
- species: { string: 'pyloridis', authorship: 'Marshall et al.1985.',
111
- basionymAuthorTeam: { authorTeam: 'Marshall et al.',
112
- author: ['Marshall et al.'], year: '1985' } } }]
113
- sn = 'Beijerinckia derxii venezuelae corrig. Thompson and Skerman, 1981'
114
- expect(details(sn)).to eq [{ genus: { string: 'Beijerinckia' },
115
- species: { string: 'derxii' }, infraspecies: [{ string: 'venezuelae',
116
- rank: 'n/a', authorship: 'Thompson and Skerman, 1981',
117
- basionymAuthorTeam: { authorTeam: 'Thompson and Skerman',
118
- author: ['Thompson', 'Skerman'], year: '1981' } }] }]
119
- expect(details('Streptomyces parvisporogenes ignotus 1960')).
120
- to eq [{ genus: { string: 'Streptomyces' },
121
- species: { string: 'parvisporogenes' },
122
- infraspecies: [{ string: 'ignotus', rank: 'n/a', year: '1960' }] }]
123
- expect(details('Oscillaria caviae Simons 1920, according to Simons 1922')).
124
- to eq [{ genus: { string: 'Oscillaria' }, species: { string: 'caviae',
125
- authorship: 'Simons 1920', basionymAuthorTeam:
126
- { authorTeam: 'Simons', author: ['Simons'], year: '1920' } } }]
127
- sn = 'Bacterium monocytogenes hominis\'\' Nyfeldt 1932'
128
- expect(details(sn)).to eq [{ genus: { string: 'Bacterium' },
129
- species: { string: 'monocytogenes' },
130
- infraspecies: [{ string: 'hominis', rank: 'n/a' }] }]
131
- sn = 'Choriozopella trägårdhi Lawrence, 1947'
132
- expect(details(sn)).to eq [{ genus: { string: 'Choriozopella' },
133
- species: { string: 'tragardhi', authorship: 'Lawrence, 1947',
134
- basionymAuthorTeam: { authorTeam: 'Lawrence', author: ['Lawrence'],
135
- year: '1947' } } }]
136
- sn = 'Sparassus françoisi Simon, 1898'
137
- expect(details(sn)).to eq [{ genus: { string: 'Sparassus' },
138
- species: { string: 'francoisi', authorship: 'Simon, 1898',
139
- basionymAuthorTeam: { authorTeam: 'Simon', author: ['Simon'],
140
- year: '1898' } } }]
141
- sn = 'Dyarcyops birói Kulczynski, 1908'
142
- expect(details(sn)).to eq [{ genus: { string: 'Dyarcyops' },
143
- species: { string: 'biroi', authorship: 'Kulczynski, 1908',
144
- basionymAuthorTeam: { authorTeam: 'Kulczynski',
145
- author: ['Kulczynski'], year: '1908' } } }]
146
- end
147
-
148
- it 'parses names with common utf-8 charactes' do
149
- names = ['Rühlella','Sténométope laevissimus Bibron 1855',
150
- 'Döringina Ihering 1929'].each do |name|
151
- expect(parse(name)).to_not be_nil
152
- end
153
- expect(details('Hirsutëlla mâle')).to eq [{ genus: { string: 'Hirsutella' },
154
- species: { string: 'male' } }]
155
- expect(details('Triticum repens vulgäre')).
156
- to eq [{ genus: { string: 'Triticum' }, species: { string: 'repens' },
157
- infraspecies: [{ string: 'vulgare', rank: 'n/a' }] }]
158
- end
159
-
160
- # AsterophUa japonica
161
- # AsyTuktus ridiculw Parent 1931
162
- # AtremOEa Staud 1870
163
-
164
-
165
- end
@@ -1,193 +0,0 @@
1
- # encoding: utf-8
2
-
3
- #NOTE: this spec needs compiled treetop files.
4
-
5
- describe ScientificNameParser do
6
- before(:all) do
7
- set_parser(ScientificNameParser.new)
8
- end
9
-
10
- it "returns version number" do
11
- expect(ScientificNameParser.version).to match /^\d+\.\d+\.\d+/
12
- end
13
-
14
- it "fixes cases" do
15
- names = [
16
- ["QUERCUS ALBA", "Quercus alba"],
17
- ["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
18
- ["QÜERCUS", "Qüercus"],
19
- ["PARDOSA MOéSTA", "Pardosa moésta"],
20
- ]
21
- names.each do |name, capitalization|
22
- expect(ScientificNameParser::fix_case(name)).to eq capitalization
23
- end
24
- end
25
-
26
- it "generates standardized json" do
27
- read_test_file do |y|
28
- expect(JSON.load(json(y[:name]))).to eq JSON.
29
- load(y[:jsn]) unless y[:comment]
30
- end
31
- end
32
-
33
-
34
- # it "generates new test_file" do
35
- # new_test = open(File.expand_path(dir +
36
- # "../../spec/parser/test_data_new.txt"),"w")
37
- # read_test_file do |y|
38
- # if y[:comment]
39
- # new_test.write y[:comment]
40
- # else
41
- # name = y[:name]
42
- # jsn = json(y[:name])# rescue puts(y[:name])
43
- # new_test.write("#{name}|#{jsn}\n")
44
- # end
45
- # end
46
- # end
47
-
48
- it "generates reasonable output if parser failed" do
49
- sn = "ddd sljlkj 3223452432"
50
- expect(json(sn)).to eq "{\"scientificName\":" \
51
- "{\"id\":\"3ebf93d9-b62a-5198-8715-4c8302f0a5d7\",\"parsed\":false," \
52
- "\"parser_version\":\"test_version\"," \
53
- "\"verbatim\":\"ddd sljlkj 3223452432\"}}"
54
- end
55
-
56
- it "shows version when the flag :show_version set to true" do
57
- expect(parse("Homo sapiens")[:scientificName][:parser_version]).
58
- to_not be_nil
59
- end
60
-
61
- it "shows version for not spelled names" do
62
- expect(parse("not_a_name")[:scientificName][:parser_version]).to_not be_nil
63
- end
64
-
65
- it "generates version for viruses" do
66
- expect(parse("Nile virus")[:scientificName][:parser_version]).to_not be_nil
67
- end
68
- end
69
-
70
- describe "ScientificNameParser with ranked canonicals" do
71
- before(:all) do
72
- @parser = ScientificNameParser.new(canonical_with_rank: true)
73
- end
74
-
75
- it "does not influence output for uninomials and binomials" do
76
- data = [
77
- ["Ekbainacanthus Yakowlew 1902","Ekbainacanthus"],
78
- ["Ekboarmia sagnesi herrerai Exposito 2007",
79
- "Ekboarmia sagnesi herrerai"],
80
- ["Ekboarmia holli Oberthür", "Ekboarmia holli"]]
81
-
82
- data.each do |d|
83
- parsed = @parser.parse(d[0])[:scientificName][:canonical]
84
- expect(parsed).to eq d[1]
85
- end
86
- end
87
-
88
- it "preserves rank for ranked multinomials" do
89
- data = [
90
- ["Cola cordifolia var. puberula A. Chev.",
91
- "Cola cordifolia var. puberula"],
92
- ["Abies homolepis forma umbilicata (Mayr) Schelle",
93
- "Abies homolepis forma umbilicata"],
94
- ["Quercus ilex ssp. ballota (Desf.) Samp",
95
- "Quercus ilex ssp. ballota"],
96
- ["Physarum globuliferum forma. flavum Leontyev & Dudka",
97
- "Physarum globuliferum forma. flavum"]
98
- ]
99
- data.each do |d|
100
- parsed = @parser.parse(d[0])[:scientificName][:canonical]
101
- expect(parsed).to eq d[1]
102
- end
103
- end
104
- end
105
-
106
- describe ".add_rank_to_canonical" do
107
- subject(:parser) { ScientificNameParser.new }
108
-
109
- it "adds rank to infraspecies with rank" do
110
- data = [
111
- ["Cola cordifolia var. puberula A. Chev.",
112
- "Cola cordifolia puberula",
113
- "Cola cordifolia var. puberula"],
114
- ["Abies homolepis forma umbilicata (Mayr) Schelle",
115
- "Abies homolepis umbilicata",
116
- "Abies homolepis forma umbilicata"],
117
- ["Quercus ilex ssp. ballota (Desf.) Samp",
118
- "Quercus ilex ballota",
119
- "Quercus ilex ssp. ballota"],
120
- ["Physarum globuliferum forma. flavum Leontyev & Dudka",
121
- "Physarum globuliferum flavum",
122
- "Physarum globuliferum forma. flavum"]
123
- ]
124
- data.each do |d|
125
- parsed = parser.parse(d[0])
126
- canonical1 = parsed[:scientificName][:canonical]
127
- expect(canonical1).to eq d[1]
128
- ScientificNameParser.add_rank_to_canonical(parsed)
129
- canonical2 = parsed[:scientificName][:canonical]
130
- expect(canonical2).to eq d[2]
131
- end
132
- end
133
-
134
- it "returns preprocessing tail if it exists" do
135
- sn = "Stenometope laevissimus sensu Eschmeyer 2004"
136
- res = parser.parse(sn)
137
- expect(res).to_not be_nil
138
- expect(res[:scientificName][:tail]).to eq "sensu Eschmeyer 2004"
139
- end
140
-
141
- it "does not work for hybrids yet" do
142
- data = [["Corda X cordiflora var. puberula",
143
- "Corda cordiflora puberula"]]
144
- data.each do |d|
145
- parsed = parser.parse(d[0])
146
- canonical1 = parsed[:scientificName][:canonical]
147
- expect(canonical1).to eq d[1]
148
- ScientificNameParser.add_rank_to_canonical(parsed)
149
- canonical2 = parsed[:scientificName][:canonical]
150
- expect(canonical2).to eq d[1]
151
- end
152
- end
153
- end
154
-
155
- describe ParallelParser do
156
- it "finds number of cpus" do
157
- pparser = ParallelParser.new
158
- expect(pparser.cpu_num).to be > 0
159
- end
160
-
161
- it "parses several names in parallel" do
162
- names = []
163
- read_test_file { |n| names << (n[:name]) if n[:name] }
164
- names.uniq!
165
- pparser = ParallelParser.new
166
- res = pparser.parse(names)
167
- expect(names.size).to be > 100
168
- expect(res.keys.size).to eq names.size
169
- end
170
-
171
- it "parses several names in parallel with given num of processes" do
172
- names = []
173
- read_test_file { |n| names << (n[:name]) if n[:name] }
174
- names.uniq!
175
- pparser = ParallelParser.new(4)
176
- res = pparser.parse(names)
177
- expect(names.size).to be > 100
178
- expect(res.keys.size).to eq names.size
179
- end
180
-
181
- it "has parsed name in native ruby format and in returned as \
182
- a hash with name as a key and parsed data as value" do
183
- names = []
184
- read_test_file { |n| names << (n[:name]) if n[:name] }
185
- names.uniq!
186
- pparser = ParallelParser.new(4)
187
- res = pparser.parse(names)
188
- names.each_with_index do |name, i|
189
- expect(res[name].is_a?(Hash)).to be true
190
- expect(res[name][:scientificName][:verbatim]).to eq name
191
- end
192
- end
193
- end