biodiversity19 1.0.12 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -0
- data/Gemfile +3 -1
- data/Gemfile.lock +0 -15
- data/VERSION +1 -1
- data/lib/biodiversity/parser.rb +7 -2
- data/lib/biodiversity/parser/scientific_name_clean.treetop +1 -1
- data/spec/parser/test_data.txt +13 -0
- metadata +3 -2
data/CHANGELOG
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
1.0.13 -- canonical forms for cf. and aff. qualifiers are modified: canonical for
|
2
|
+
'Aus cf. bus' is now 'Aus bus'; canonical for 'Aus aff. bus' is now 'Aus'.
|
3
|
+
Ranks at the end of the name like 'var', 'ssp', 'spp' are considered junk and
|
4
|
+
are ignored
|
5
|
+
|
6
|
+
1.0.12 -- fixed a bug that prevented 'Cucurbita pepo' from being parsed correctly,
|
7
|
+
f., forma, fr. are now treated as any other ranks.
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,16 +1,12 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
archive-tar-minitar (0.5.2)
|
5
|
-
columnize (0.3.4)
|
6
4
|
diff-lcs (1.1.3)
|
7
5
|
git (1.2.5)
|
8
6
|
jeweler (1.6.4)
|
9
7
|
bundler (~> 1.0)
|
10
8
|
git (>= 1.2.5)
|
11
9
|
rake
|
12
|
-
linecache19 (0.5.12)
|
13
|
-
ruby_core_source (>= 0.1.4)
|
14
10
|
parallel (0.5.9)
|
15
11
|
polyglot (0.3.3)
|
16
12
|
rake (0.9.2.2)
|
@@ -22,16 +18,6 @@ GEM
|
|
22
18
|
rspec-expectations (2.7.0)
|
23
19
|
diff-lcs (~> 1.1.2)
|
24
20
|
rspec-mocks (2.7.0)
|
25
|
-
ruby-debug-base19 (0.11.25)
|
26
|
-
columnize (>= 0.3.1)
|
27
|
-
linecache19 (>= 0.5.11)
|
28
|
-
ruby_core_source (>= 0.1.4)
|
29
|
-
ruby-debug19 (0.11.6)
|
30
|
-
columnize (>= 0.3.1)
|
31
|
-
linecache19 (>= 0.5.11)
|
32
|
-
ruby-debug-base19 (>= 0.11.19)
|
33
|
-
ruby_core_source (0.1.5)
|
34
|
-
archive-tar-minitar (>= 0.5.2)
|
35
21
|
treetop (1.4.10)
|
36
22
|
polyglot
|
37
23
|
polyglot (>= 0.3.1)
|
@@ -43,5 +29,4 @@ DEPENDENCIES
|
|
43
29
|
jeweler
|
44
30
|
parallel
|
45
31
|
rspec
|
46
|
-
ruby-debug19
|
47
32
|
treetop
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.12
|
1
|
+
1.0.13
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -12,12 +12,17 @@ module PreProcessor
|
|
12
12
|
TAXON_CONCEPTS2 = /\s+(\(?s\.\s?s\.|\(?s\.\s?l\.|\(?s\.\s?str\.|\(?s\.\s?lat\.|sec\.|sec|near)\b.*$/
|
13
13
|
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
|
14
14
|
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
-
|
15
|
+
COMPARATORS = /\s+(aff\.|aff)\b.*$/i
|
16
|
+
CF_COMPARATOR = /\s+(cf\.|cf)\s+/i
|
17
|
+
LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|sensu|new|non|nec|cf|ssp|subsp|subgen|hybrid|hort\.|hort)\s*$/i
|
16
18
|
|
17
19
|
def self.clean(a_string)
|
18
|
-
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
20
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2, TAXON_CONCEPTS3, NOMEN_CONCEPTS, COMPARATORS, LAST_WORD_JUNK].each do |i|
|
19
21
|
a_string = a_string.gsub(i, '')
|
20
22
|
end
|
23
|
+
[CF_COMPARATOR].each do |i|
|
24
|
+
a_string = a_string.gsub(i, ' ')
|
25
|
+
end
|
21
26
|
a_string = a_string.tr('ſ','s') #old 's'
|
22
27
|
a_string
|
23
28
|
end
|
data/lib/biodiversity/parser/scientific_name_clean.treetop
CHANGED
@@ -401,7 +401,7 @@ grammar ScientificNameClean
|
|
401
401
|
end
|
402
402
|
}
|
403
403
|
end
|
404
|
-
|
404
|
+
|
405
405
|
rule rank
|
406
406
|
("morph."/"f.sp."/"B"/"ssp."/"ssp"/"mut."/"nat"/"nothosubsp."/"convar."/"pseudovar."/"sect."/"ser."/"var."/"subvar."/ "[var.]" /"var"/"subsp."/"subsp"/"subf."/"race"/"forma"/"form."/"form"/"fo."/"f."/"α"/"ββ"/"β"/"γ"/"δ"/"ε"/"φ"/"θ"/"μ"/"a."/"b."/"c."/"d."/"e."/"g."/"k."/"****"/"**"/"*")
|
407
407
|
{
|
data/spec/parser/test_data.txt
CHANGED
@@ -125,6 +125,19 @@ Hydnellum scrobiculatum var. zonatum f. parvum (Banker) D. Hall & D.E. Stuntz 19
|
|
125
125
|
Senecio fuchsii C.C.Gmel. subsp. fuchsii var. expansus (Boiss. & Heldr.) Hayek|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Senecio fuchsii C.C.Gmel. subsp. fuchsii var. expansus (Boiss. & Heldr.) Hayek", "normalized":"Senecio fuchsii C.C.Gmel. subsp. fuchsii var. expansus (Boiss. et Heldr.) Hayek", "canonical":"Senecio fuchsii fuchsii expansus", "hybrid":false, "details":[{"genus":{"string":"Senecio"}, "species":{"string":"fuchsii", "authorship":"C.C.Gmel.", "basionymAuthorTeam":{"authorTeam":"C.C.Gmel.", "author":["C.C.Gmel."]}}, "infraspecies":[{"string":"fuchsii", "rank":"subsp."}, {"string":"expansus", "rank":"var.", "authorship":"(Boiss. & Heldr.) Hayek", "combinationAuthorTeam":{"authorTeam":"Hayek", "author":["Hayek"]}, "basionymAuthorTeam":{"authorTeam":"Boiss. & Heldr.", "author":["Boiss.", "Heldr."]}}]}], "parser_run":1, "positions":{"0":["genus", 7], "8":["species", 15], "16":["author_word", 25], "26":["infraspecific_type", 32], "33":["infraspecies", 40], "41":["infraspecific_type", 45], "46":["infraspecies", 54], "56":["author_word", 62], "65":["author_word", 71], "73":["author_word", 78]}}}
|
126
126
|
Senecio fuchsii C.C.Gmel. subsp. fuchsii var. fuchsii|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Senecio fuchsii C.C.Gmel. subsp. fuchsii var. fuchsii", "normalized":"Senecio fuchsii C.C.Gmel. subsp. fuchsii var. fuchsii", "canonical":"Senecio fuchsii fuchsii fuchsii", "hybrid":false, "details":[{"genus":{"string":"Senecio"}, "species":{"string":"fuchsii", "authorship":"C.C.Gmel.", "basionymAuthorTeam":{"authorTeam":"C.C.Gmel.", "author":["C.C.Gmel."]}}, "infraspecies":[{"string":"fuchsii", "rank":"subsp."}, {"string":"fuchsii", "rank":"var."}]}], "parser_run":1, "positions":{"0":["genus", 7], "8":["species", 15], "16":["author_word", 25], "26":["infraspecific_type", 32], "33":["infraspecies", 40], "41":["infraspecific_type", 45], "46":["infraspecies", 53]}}}
|
127
127
|
|
128
|
+
|
129
|
+
#species and infraspecies without epithets, comparisons
|
130
|
+
Alviniconcha aff alba|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alviniconcha aff alba", "normalized":"Alviniconcha", "canonical":"Alviniconcha", "hybrid":false, "details":[{"uninomial":{"string":"Alviniconcha"}}], "parser_run":1, "positions":{"0":["uninomial", 12]}}}
|
131
|
+
Alviniconcha aff. alba|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alviniconcha aff. alba", "normalized":"Alviniconcha", "canonical":"Alviniconcha", "hybrid":false, "details":[{"uninomial":{"string":"Alviniconcha"}}], "parser_run":1, "positions":{"0":["uninomial", 12]}}}
|
132
|
+
Alviniconcha cf. alba|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alviniconcha cf. alba", "normalized":"Alviniconcha alba", "canonical":"Alviniconcha alba", "hybrid":false, "details":[{"genus":{"string":"Alviniconcha"}, "species":{"string":"alba"}}], "parser_run":1, "positions":{"0":["genus", 12], "13":["species", 17]}}}
|
133
|
+
Alviniconcha cf alba|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alviniconcha cf alba", "normalized":"Alviniconcha alba", "canonical":"Alviniconcha alba", "hybrid":false, "details":[{"genus":{"string":"Alviniconcha"}, "species":{"string":"alba"}}], "parser_run":1, "positions":{"0":["genus", 12], "13":["species", 17]}}}
|
134
|
+
Alyxia reinwardti var|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alyxia reinwardti var", "normalized":"Alyxia reinwardti", "canonical":"Alyxia reinwardti", "hybrid":false, "details":[{"genus":{"string":"Alyxia"}, "species":{"string":"reinwardti"}}], "parser_run":1, "positions":{"0":["genus", 6], "7":["species", 17]}}}
|
135
|
+
Alyxia reinwardti var.|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alyxia reinwardti var.", "normalized":"Alyxia reinwardti", "canonical":"Alyxia reinwardti", "hybrid":false, "details":[{"genus":{"string":"Alyxia"}, "species":{"string":"reinwardti"}}], "parser_run":1, "positions":{"0":["genus", 6], "7":["species", 17]}}}
|
136
|
+
Alyxia reinwardti ssp|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alyxia reinwardti ssp", "normalized":"Alyxia reinwardti", "canonical":"Alyxia reinwardti", "hybrid":false, "details":[{"genus":{"string":"Alyxia"}, "species":{"string":"reinwardti"}}], "parser_run":1, "positions":{"0":["genus", 6], "7":["species", 17]}}}
|
137
|
+
Alyxia reinwardti ssp.|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alyxia reinwardti ssp.", "normalized":"Alyxia reinwardti ssp.", "canonical":"Alyxia reinwardti", "hybrid":false, "details":[{"genus":{"string":"Alyxia"}, "species":{"string":"reinwardti"}, "status":"ssp."}], "parser_run":1, "positions":{"0":["genus", 6], "7":["species", 17]}}}
|
138
|
+
Alaria spp|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alaria spp", "normalized":"Alaria", "canonical":"Alaria", "hybrid":false, "details":[{"uninomial":{"string":"Alaria"}}], "parser_run":1, "positions":{"0":["uninomial", 6]}}}
|
139
|
+
Alaria spp.|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Alaria spp.", "normalized":"Alaria", "canonical":"Alaria", "hybrid":false, "details":[{"uninomial":{"string":"Alaria"}}], "parser_run":1, "positions":{"0":["uninomial", 6]}}}
|
140
|
+
|
128
141
|
#unknown authorship
|
129
142
|
Tragacantha leporina (?) Kuntze|{"scientificName":{"parsed":true, "parser_version":"test_version", "parser_run":1,"verbatim":"Tragacantha leporina (?) Kuntze","normalized":"Tragacantha leporina (?) Kuntze","canonical":"Tragacantha leporina","hybrid":false,"details":[{"genus":{"string":"Tragacantha"},"species":{"string":"leporina","authorship":"(?) Kuntze","combinationAuthorTeam":{"authorTeam":"Kuntze","author":["Kuntze"]},"basionymAuthorTeam":{"authorTeam":"(?)","author":["?"]}}}],"positions":{"0":["genus",11],"12":["species",20],"22":["unknown_author",23],"25":["author_word",31]}}}
|
130
143
|
Lachenalia tricolor var. nelsonii (auct.) Baker|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Lachenalia tricolor var. nelsonii (auct.) Baker", "normalized":"Lachenalia tricolor var. nelsonii (auct.) Baker", "canonical":"Lachenalia tricolor nelsonii", "hybrid":false, "details":[{"genus":{"string":"Lachenalia"}, "species":{"string":"tricolor"}, "infraspecies":[{"string":"nelsonii", "rank":"var.", "authorship":"(auct.) Baker", "combinationAuthorTeam":{"authorTeam":"Baker", "author":["Baker"]}, "basionymAuthorTeam":{"authorTeam":"auct.", "author":["unknown"]}}]}], "parser_run":1, "positions":{"0":["genus", 10], "11":["species", 19], "20":["infraspecific_type", 24], "25":["infraspecies", 33], "35":["unknown_author", 40], "42":["author_word", 47]}}}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biodiversity19
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.12
|
4
|
+
version: 1.0.13
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-06-
|
12
|
+
date: 2012-06-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: treetop
|
@@ -119,6 +119,7 @@ extra_rdoc_files:
|
|
119
119
|
files:
|
120
120
|
- .document
|
121
121
|
- .rvmrc
|
122
|
+
- CHANGELOG
|
122
123
|
- Gemfile
|
123
124
|
- Gemfile.lock
|
124
125
|
- LICENSE
|