biodiversity 3.5.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +9 -6
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +1 -6
  6. data/CHANGELOG +3 -0
  7. data/Gemfile +2 -0
  8. data/README.md +37 -178
  9. data/Rakefile +15 -48
  10. data/biodiversity.gemspec +18 -21
  11. data/clib/linux/libgnparser.h +93 -0
  12. data/clib/linux/libgnparser.so +0 -0
  13. data/clib/mac/libgnparser.h +93 -0
  14. data/clib/mac/libgnparser.so +0 -0
  15. data/lib/biodiversity.rb +4 -9
  16. data/lib/biodiversity/parser.rb +65 -281
  17. data/lib/biodiversity/version.rb +8 -1
  18. data/spec/lib/biodiversity_spec.rb +9 -0
  19. data/spec/lib/parser_spec.rb +38 -0
  20. data/spec/spec_helper.rb +4 -81
  21. metadata +27 -102
  22. data/.byebug_history +0 -18
  23. data/.document +0 -5
  24. data/examples/socket_client.rb +0 -25
  25. data/lib/biodiversity/guid.rb +0 -1
  26. data/lib/biodiversity/guid/lsid.rb +0 -16
  27. data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
  28. data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
  29. data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
  30. data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
  31. data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
  32. data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
  33. data/spec/biodiversity_spec.rb +0 -11
  34. data/spec/files/test_data.txt +0 -490
  35. data/spec/files/todo.txt +0 -55
  36. data/spec/guid/lsid.spec.rb +0 -15
  37. data/spec/parser/scientific_name_canonical_spec.rb +0 -36
  38. data/spec/parser/scientific_name_clean_spec.rb +0 -1137
  39. data/spec/parser/scientific_name_dirty_spec.rb +0 -165
  40. data/spec/parser/scientific_name_spec.rb +0 -193
@@ -1,165 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- describe ScientificNameDirty do
4
- before(:all) do
5
- set_parser(ScientificNameDirtyParser.new)
6
- end
7
-
8
- it 'parses clean names' do
9
- expect(parse('Betula verucosa (L.) Bar. 1899')).to_not be_nil
10
- end
11
-
12
- it 'parses double parenthesis' do
13
- sn = 'Eichornia crassipes ( (Martius) ) Solms-Laub.'
14
- expect(parse(sn)).to_not be_nil
15
- expect(value(sn)).to eq 'Eichornia crassipes (Martius) Solms-Laub.'
16
- expect(details(sn)).to eq [{ genus: { string: 'Eichornia' },
17
- species: { string: 'crassipes',
18
- authorship: '( (Martius) ) Solms-Laub.',
19
- combinationAuthorTeam: { authorTeam: 'Solms-Laub.',
20
- author: ['Solms-Laub.'] },
21
- basionymAuthorTeam: { authorTeam: 'Martius',
22
- author: ['Martius'] } } }]
23
- expect(pos(sn)).to eq({ 0 => ['genus', 9], 10 => ['species', 19],
24
- 23 => ['author_word', 30],
25
- 34 => ['author_word', 45] })
26
- end
27
-
28
- it 'parses year without author' do
29
- sn = 'Acarospora cratericola 1929'
30
- expect(parse(sn)).to_not be_nil
31
- expect(pos(sn)).to eq({ 0 => ['genus', 10],
32
- 11 => ['species', 22], 23 => ['year', 27] })
33
- expect(details(sn)).to eq [{ genus: { string: 'Acarospora' },
34
- species: { string: 'cratericola',
35
- year: '1929' } }]
36
- end
37
-
38
- it 'parses double years' do
39
- sn = 'Tridentella tangeroae Bruce, 1987-92'
40
- expect(parse(sn)).to_not be_nil
41
- expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
42
- 22 => ['author_word', 27], 29 => ['year', 36] })
43
- expect(details(sn)).to eq [{ genus: { string: 'Tridentella' },
44
- species: { string: 'tangeroae', authorship: 'Bruce, 1987-92',
45
- basionymAuthorTeam: { authorTeam: 'Bruce', author: ['Bruce'],
46
- year: '1987-92' } } }]
47
- end
48
-
49
- it 'parses dirty years' do
50
- expect(parse('Tridentella tangeroae Bruce, 1988B')).to_not be_nil
51
- expect(parse('Tridentella tangeroae Bruce, 1988b')).to_not be_nil
52
- expect(parse('Tridentella tangeroae Bruce, 1988d')).to_not be_nil
53
- sn = 'Tridentella tangeroae Bruce, 198?'
54
- expect(parse(sn)).to_not be_nil
55
- expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
56
- 22 => ['author_word', 27], 29 => ['year', 33] })
57
- end
58
-
59
- it 'parses year with page number' do
60
- sn = 'Gymnodactylus irregularis WERMUTH 1965: 54'
61
- expect(parse(sn)).to_not be_nil
62
- expect(value(sn)).to eq 'Gymnodactylus irregularis Wermuth 1965'
63
- expect(details(sn)).to eq [{ genus: { string: 'Gymnodactylus' },
64
- species: { string: 'irregularis', authorship: 'WERMUTH 1965: 54',
65
- basionymAuthorTeam: { authorTeam: 'WERMUTH', author: ['Wermuth'],
66
- year: '1965' } } }]
67
- expect(pos(sn)).to eq({ 0 => ['genus', 13], 14 => ['species', 25],
68
- 26 => ['author_word', 33], 34 => ['year', 38] })
69
- end
70
-
71
- it 'parses year with []' do
72
- expect(parse('Anthoscopus Cabanis [1851]')).to_not be_nil
73
- expect(value('Anthoscopus Cabanis [185?]')).
74
- to eq 'Anthoscopus Cabanis (185?)'
75
- expect(parse('Anthoscopus Cabanis [1851?]')).to_not be_nil
76
- expect(value('Anthoscopus Cabanis [1851]')).
77
- to eq 'Anthoscopus Cabanis (1851)'
78
- sn = 'Anthoscopus Cabanis [1851?]'
79
- expect(value(sn)).to eq 'Anthoscopus Cabanis (1851?)'
80
- expect(details(sn)).to eq [{ uninomial: { string: 'Anthoscopus',
81
- authorship: 'Cabanis [1851?]', basionymAuthorTeam:
82
- { authorTeam: 'Cabanis', author: ['Cabanis'],
83
- approximate_year: '(1851?)' } } }]
84
- expect(pos(sn)).to eq({ 0 => ['uninomial', 11],
85
- 12 => ['author_word', 19], 21 => ['year', 26] })
86
- sn = 'Trismegistia monodii Ando, 1973 [1974]'
87
- expect(parse(sn)).to_not be_nil
88
-
89
- #should it be 'Trismegistia monodii Ando 1973 (1974)' instead?
90
- expect(value(sn)).to eq 'Trismegistia monodii Ando 1973 (1974)'
91
-
92
- expect(details(sn)).to eq [{ genus: { string: 'Trismegistia' },
93
- species: { string: 'monodii', authorship: 'Ando, 1973 [1974]',
94
- basionymAuthorTeam: { authorTeam: 'Ando', author: ['Ando'],
95
- year: '1973', approximate_year: '(1974)' } } }]
96
- expect(pos(sn)).to eq({ 0 => ['genus', 12], 13 => ['species', 20],
97
- 21 => ['author_word', 25], 27 => ['year', 31], 33 => ['year', 37] })
98
- expect(parse('Zygaena witti Wiegel [1973]')).to_not be_nil
99
- sn = 'Deyeuxia coarctata Kunth, 1815 [1816]'
100
- expect(parse(sn)).to_not be_nil
101
- expect(pos(sn)).to eq({ 0 => ['genus', 8], 9 => ['species', 18],
102
- 19 => ['author_word', 24], 26 => ['year', 30],
103
- 32 => ['year', 36] })
104
- end
105
-
106
- it 'parses new stuff' do
107
- sn = 'Zoropsis (TaKeoa) nishimurai Yaginuma, 1971' #skipping for now
108
- sn = 'Campylobacter pyloridis Marshall et al.1985.'
109
- expect(details(sn)).to eq [{ genus: { string: 'Campylobacter' },
110
- species: { string: 'pyloridis', authorship: 'Marshall et al.1985.',
111
- basionymAuthorTeam: { authorTeam: 'Marshall et al.',
112
- author: ['Marshall et al.'], year: '1985' } } }]
113
- sn = 'Beijerinckia derxii venezuelae corrig. Thompson and Skerman, 1981'
114
- expect(details(sn)).to eq [{ genus: { string: 'Beijerinckia' },
115
- species: { string: 'derxii' }, infraspecies: [{ string: 'venezuelae',
116
- rank: 'n/a', authorship: 'Thompson and Skerman, 1981',
117
- basionymAuthorTeam: { authorTeam: 'Thompson and Skerman',
118
- author: ['Thompson', 'Skerman'], year: '1981' } }] }]
119
- expect(details('Streptomyces parvisporogenes ignotus 1960')).
120
- to eq [{ genus: { string: 'Streptomyces' },
121
- species: { string: 'parvisporogenes' },
122
- infraspecies: [{ string: 'ignotus', rank: 'n/a', year: '1960' }] }]
123
- expect(details('Oscillaria caviae Simons 1920, according to Simons 1922')).
124
- to eq [{ genus: { string: 'Oscillaria' }, species: { string: 'caviae',
125
- authorship: 'Simons 1920', basionymAuthorTeam:
126
- { authorTeam: 'Simons', author: ['Simons'], year: '1920' } } }]
127
- sn = 'Bacterium monocytogenes hominis\'\' Nyfeldt 1932'
128
- expect(details(sn)).to eq [{ genus: { string: 'Bacterium' },
129
- species: { string: 'monocytogenes' },
130
- infraspecies: [{ string: 'hominis', rank: 'n/a' }] }]
131
- sn = 'Choriozopella trägårdhi Lawrence, 1947'
132
- expect(details(sn)).to eq [{ genus: { string: 'Choriozopella' },
133
- species: { string: 'tragardhi', authorship: 'Lawrence, 1947',
134
- basionymAuthorTeam: { authorTeam: 'Lawrence', author: ['Lawrence'],
135
- year: '1947' } } }]
136
- sn = 'Sparassus françoisi Simon, 1898'
137
- expect(details(sn)).to eq [{ genus: { string: 'Sparassus' },
138
- species: { string: 'francoisi', authorship: 'Simon, 1898',
139
- basionymAuthorTeam: { authorTeam: 'Simon', author: ['Simon'],
140
- year: '1898' } } }]
141
- sn = 'Dyarcyops birói Kulczynski, 1908'
142
- expect(details(sn)).to eq [{ genus: { string: 'Dyarcyops' },
143
- species: { string: 'biroi', authorship: 'Kulczynski, 1908',
144
- basionymAuthorTeam: { authorTeam: 'Kulczynski',
145
- author: ['Kulczynski'], year: '1908' } } }]
146
- end
147
-
148
- it 'parses names with common utf-8 charactes' do
149
- names = ['Rühlella','Sténométope laevissimus Bibron 1855',
150
- 'Döringina Ihering 1929'].each do |name|
151
- expect(parse(name)).to_not be_nil
152
- end
153
- expect(details('Hirsutëlla mâle')).to eq [{ genus: { string: 'Hirsutella' },
154
- species: { string: 'male' } }]
155
- expect(details('Triticum repens vulgäre')).
156
- to eq [{ genus: { string: 'Triticum' }, species: { string: 'repens' },
157
- infraspecies: [{ string: 'vulgare', rank: 'n/a' }] }]
158
- end
159
-
160
- # AsterophUa japonica
161
- # AsyTuktus ridiculw Parent 1931
162
- # AtremOEa Staud 1870
163
-
164
-
165
- end
@@ -1,193 +0,0 @@
1
- # encoding: utf-8
2
-
3
- #NOTE: this spec needs compiled treetop files.
4
-
5
- describe ScientificNameParser do
6
- before(:all) do
7
- set_parser(ScientificNameParser.new)
8
- end
9
-
10
- it "returns version number" do
11
- expect(ScientificNameParser.version).to match /^\d+\.\d+\.\d+/
12
- end
13
-
14
- it "fixes cases" do
15
- names = [
16
- ["QUERCUS ALBA", "Quercus alba"],
17
- ["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
18
- ["QÜERCUS", "Qüercus"],
19
- ["PARDOSA MOéSTA", "Pardosa moésta"],
20
- ]
21
- names.each do |name, capitalization|
22
- expect(ScientificNameParser::fix_case(name)).to eq capitalization
23
- end
24
- end
25
-
26
- it "generates standardized json" do
27
- read_test_file do |y|
28
- expect(JSON.load(json(y[:name]))).to eq JSON.
29
- load(y[:jsn]) unless y[:comment]
30
- end
31
- end
32
-
33
-
34
- # it "generates new test_file" do
35
- # new_test = open(File.expand_path(dir +
36
- # "../../spec/parser/test_data_new.txt"),"w")
37
- # read_test_file do |y|
38
- # if y[:comment]
39
- # new_test.write y[:comment]
40
- # else
41
- # name = y[:name]
42
- # jsn = json(y[:name])# rescue puts(y[:name])
43
- # new_test.write("#{name}|#{jsn}\n")
44
- # end
45
- # end
46
- # end
47
-
48
- it "generates reasonable output if parser failed" do
49
- sn = "ddd sljlkj 3223452432"
50
- expect(json(sn)).to eq "{\"scientificName\":" \
51
- "{\"id\":\"3ebf93d9-b62a-5198-8715-4c8302f0a5d7\",\"parsed\":false," \
52
- "\"parser_version\":\"test_version\"," \
53
- "\"verbatim\":\"ddd sljlkj 3223452432\"}}"
54
- end
55
-
56
- it "shows version when the flag :show_version set to true" do
57
- expect(parse("Homo sapiens")[:scientificName][:parser_version]).
58
- to_not be_nil
59
- end
60
-
61
- it "shows version for not spelled names" do
62
- expect(parse("not_a_name")[:scientificName][:parser_version]).to_not be_nil
63
- end
64
-
65
- it "generates version for viruses" do
66
- expect(parse("Nile virus")[:scientificName][:parser_version]).to_not be_nil
67
- end
68
- end
69
-
70
- describe "ScientificNameParser with ranked canonicals" do
71
- before(:all) do
72
- @parser = ScientificNameParser.new(canonical_with_rank: true)
73
- end
74
-
75
- it "does not influence output for uninomials and binomials" do
76
- data = [
77
- ["Ekbainacanthus Yakowlew 1902","Ekbainacanthus"],
78
- ["Ekboarmia sagnesi herrerai Exposito 2007",
79
- "Ekboarmia sagnesi herrerai"],
80
- ["Ekboarmia holli Oberthür", "Ekboarmia holli"]]
81
-
82
- data.each do |d|
83
- parsed = @parser.parse(d[0])[:scientificName][:canonical]
84
- expect(parsed).to eq d[1]
85
- end
86
- end
87
-
88
- it "preserves rank for ranked multinomials" do
89
- data = [
90
- ["Cola cordifolia var. puberula A. Chev.",
91
- "Cola cordifolia var. puberula"],
92
- ["Abies homolepis forma umbilicata (Mayr) Schelle",
93
- "Abies homolepis forma umbilicata"],
94
- ["Quercus ilex ssp. ballota (Desf.) Samp",
95
- "Quercus ilex ssp. ballota"],
96
- ["Physarum globuliferum forma. flavum Leontyev & Dudka",
97
- "Physarum globuliferum forma. flavum"]
98
- ]
99
- data.each do |d|
100
- parsed = @parser.parse(d[0])[:scientificName][:canonical]
101
- expect(parsed).to eq d[1]
102
- end
103
- end
104
- end
105
-
106
- describe ".add_rank_to_canonical" do
107
- subject(:parser) { ScientificNameParser.new }
108
-
109
- it "adds rank to infraspecies with rank" do
110
- data = [
111
- ["Cola cordifolia var. puberula A. Chev.",
112
- "Cola cordifolia puberula",
113
- "Cola cordifolia var. puberula"],
114
- ["Abies homolepis forma umbilicata (Mayr) Schelle",
115
- "Abies homolepis umbilicata",
116
- "Abies homolepis forma umbilicata"],
117
- ["Quercus ilex ssp. ballota (Desf.) Samp",
118
- "Quercus ilex ballota",
119
- "Quercus ilex ssp. ballota"],
120
- ["Physarum globuliferum forma. flavum Leontyev & Dudka",
121
- "Physarum globuliferum flavum",
122
- "Physarum globuliferum forma. flavum"]
123
- ]
124
- data.each do |d|
125
- parsed = parser.parse(d[0])
126
- canonical1 = parsed[:scientificName][:canonical]
127
- expect(canonical1).to eq d[1]
128
- ScientificNameParser.add_rank_to_canonical(parsed)
129
- canonical2 = parsed[:scientificName][:canonical]
130
- expect(canonical2).to eq d[2]
131
- end
132
- end
133
-
134
- it "returns preprocessing tail if it exists" do
135
- sn = "Stenometope laevissimus sensu Eschmeyer 2004"
136
- res = parser.parse(sn)
137
- expect(res).to_not be_nil
138
- expect(res[:scientificName][:tail]).to eq "sensu Eschmeyer 2004"
139
- end
140
-
141
- it "does not work for hybrids yet" do
142
- data = [["Corda X cordiflora var. puberula",
143
- "Corda cordiflora puberula"]]
144
- data.each do |d|
145
- parsed = parser.parse(d[0])
146
- canonical1 = parsed[:scientificName][:canonical]
147
- expect(canonical1).to eq d[1]
148
- ScientificNameParser.add_rank_to_canonical(parsed)
149
- canonical2 = parsed[:scientificName][:canonical]
150
- expect(canonical2).to eq d[1]
151
- end
152
- end
153
- end
154
-
155
- describe ParallelParser do
156
- it "finds number of cpus" do
157
- pparser = ParallelParser.new
158
- expect(pparser.cpu_num).to be > 0
159
- end
160
-
161
- it "parses several names in parallel" do
162
- names = []
163
- read_test_file { |n| names << (n[:name]) if n[:name] }
164
- names.uniq!
165
- pparser = ParallelParser.new
166
- res = pparser.parse(names)
167
- expect(names.size).to be > 100
168
- expect(res.keys.size).to eq names.size
169
- end
170
-
171
- it "parses several names in parallel with given num of processes" do
172
- names = []
173
- read_test_file { |n| names << (n[:name]) if n[:name] }
174
- names.uniq!
175
- pparser = ParallelParser.new(4)
176
- res = pparser.parse(names)
177
- expect(names.size).to be > 100
178
- expect(res.keys.size).to eq names.size
179
- end
180
-
181
- it "has parsed name in native ruby format and in returned as \
182
- a hash with name as a key and parsed data as value" do
183
- names = []
184
- read_test_file { |n| names << (n[:name]) if n[:name] }
185
- names.uniq!
186
- pparser = ParallelParser.new(4)
187
- res = pparser.parse(names)
188
- names.each_with_index do |name, i|
189
- expect(res[name].is_a?(Hash)).to be true
190
- expect(res[name][:scientificName][:verbatim]).to eq name
191
- end
192
- end
193
- end