dwca_hunter 0.5.5 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +50 -77
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +31 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +93 -91
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -127
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +27 -34
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
data/lib/dwca_hunter/resources/paleodb_harvester.rb (new file):

```diff
@@ -0,0 +1,140 @@
+class PaleodbHarvester
+  def initialize(download_dir)
+    @dir = File.join(download_dir, "json")
+    FileUtils.mkdir_p(@dir)
+    @in_dir = download_dir
+    @taxa_csv = CSV.open(File.join(@in_dir, "taxa.csv"), headers: true)
+    @refs_csv = CSV.open(File.join(@in_dir, "refs.csv"), headers: true)
+    @taxa_refs_csv = CSV.open(File.join(@in_dir, "taxa_refs.csv"), headers: true)
+    @occurences_csv = CSV.open(File.join(@in_dir, "occurences.csv"), headers: true)
+  end
+
+  def taxa
+    # "orig_no","taxon_no","record_type","flags","taxon_rank",
+    # "taxon_name","difference","accepted_no","accepted_rank",
+    # "accepted_name","parent_no","reference_no","is_extant","n_occs"
+    taxa = {}
+    name2id = {}
+    @taxa_csv.each do |r|
+      r = strip(r)
+      taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
+                              rank: r["taxon_rank"], name: r["taxon_name"],
+                              auth: r["taxon_attr"],
+                              extinct: extinct(r["is_extant"]),
+                              vernacular: r["common_name"],
+                              annot: r["difference"], acc_id: r["accepted_no"],
+                              acc_rank: r["accepted_rank"],
+                              acc_name: r["accepted_name"], ecol: ecol(r),
+                              parent_id: r["parent_no"], ref: r["reference_no"],
+                              occs_num: r["n_occs"], enterer: enterer(r) }
+
+      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
+    end
+    f = open(File.join(@dir, "taxa.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(taxa))
+    f.close
+    f = open(File.join(@dir, "name_id.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(name2id))
+    f.close
+  end
+
+  def enterer(r)
+    res = [r["enterer"], r["modifier"]].map(&:to_s)
+          .map(&:strip).uniq.select { |e| e != "" }
+    res.empty? ? "" : res.join(", ")
+  end
+
+
+  def extinct(val)
+    val == "extinct" ? 1 : 0
+  end
+
+  def ecol(row)
+    row = strip row
+    "#{row['life_habit']} #{row['diet']}"
+  end
+
+  def refs
+    # "reference_no","record_type","ref_type","author1init","author1last",
+    # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
+    # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
+    # "language","doi"
+
+    # {"id":31671,"orig":true,"author":"Hahn, C. W.",
+    # "year":1834,"title":"Die wanzenartigen Insecten.",
+    # "details":"C. H. Zeh, Nurnberg. 2: 33--120.",
+    # "distribution":"Germany","comment":"n. sp."}
+    refs = {}
+    @refs_csv.each do |r|
+      r = strip r
+      authorship, author = authors(r)
+      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
+                                  authorship: authorship,
+                                  year: r["pubyr"], title: r["reftitle"],
+                                  details: details(r) }
+    end
+    f = open(File.join(@dir, "refs.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(refs))
+    f.close
+  end
+
+  def authors(row)
+    row = strip row
+    au = ["#{row['author1init']} #{row['author1last']}".strip,
+          "#{row['author2init']} #{row['author2last']}".strip,
+          "#{row['otherauthors']}".strip]
+    au = au.select { |a| !a.empty? }.map { |a| a.gsub(/[\s]{2,}/, " ").strip }
+    [au[0..1].join(", "), au.join(", ")]
+  end
+
+  def details(row)
+    row = strip row
+    ref = "#{row['pubtitle']}"
+    ref << " #{row['pubno']}" unless row['pubno'].empty?
+    ref << ": #{row['firstpage']}" unless row['firstpage'].empty?
+    ref << "--#{row['lastpage']}" unless row['lastpage'].empty?
+    ref << " (#{row["doi"]})" unless row['doi'].empty?
+    ref.gsub(/[\s]{2,}/, " ").strip
+  end
+
+  def taxa_refs
+    tr = {}
+    @taxa_refs_csv.each do |r|
+      r = strip r
+      row = { acc_id: r["accepted_no"], name: r["accepted_name"],
+              ref_id: r["reference_no"] }
+      if tr.key? r["accepted_no"]
+        tr[r["accepted_no"]] << row
+      else
+        tr[r["accepted_no"]] = [row]
+      end
+    end
+    f = open(File.join(@dir, "taxa_refs.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(tr))
+    f.close
+  end
+
+  def occurences
+    occ = {}
+    @occurences_csv.each_with_index do |r, i|
+      r = strip r
+      row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
+              state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
+      if occ.key? r["accepted_no"]
+        occ[r["accepted_no"]] << row
+      else
+        occ[r["accepted_no"]] = [row]
+      end
+    end
+    f = open(File.join(@dir, "occurences.json"), "w:utf-8")
+    f.write(JSON.pretty_generate(occ))
+    f.close
+  end
+
+  def strip(row)
+    row.each_with_object({}) do |(k, v), h|
+      h[k] = v.nil? ? nil : v.strip
+    end
+  end
+end
+
```
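Taken together, the new `PaleodbHarvester` simply re-serializes four PaleoBioDB CSV dumps (`taxa.csv`, `refs.csv`, `taxa_refs.csv`, `occurences.csv`) as pretty-printed JSON under a `json/` subdirectory of the download directory. A minimal driver sketch, assuming the CSV dumps already sit in a local `paleobiodb/` directory; the driver itself (and the `require_relative` path) is hypothetical and not part of the gem:

```ruby
require "csv"
require "fileutils"
require "json"
require_relative "paleodb_harvester" # load path assumed for this sketch

harvester = PaleodbHarvester.new("paleobiodb") # directory holding the four CSV dumps
harvester.taxa       # writes json/taxa.json and json/name_id.json
harvester.refs       # writes json/refs.json
harvester.taxa_refs  # writes json/taxa_refs.json
harvester.occurences # writes json/occurences.json
```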
data/lib/dwca_hunter/resources/sherborn.rb (new file):

```diff
@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+
+module DwcaHunter
+  class ResourceSherborn < DwcaHunter::Resource
+    def initialize(opts = {})
+      @command = "sherborn"
+      @title = "Index Animalium"
+      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
+      @UUID = "05ad6ca2-fc37-47f4-983a-72e535420e28"
+      @download_path = File.join(Dir.tmpdir,
+                                 "dwca_hunter",
+                                 "sherborn",
+                                 "data.csv")
+      @synonyms = []
+      @names = []
+      @vernaculars = []
+      @extensions = []
+      @synonyms_hash = {}
+      @vernaculars_hash = {}
+      super(opts)
+    end
+
+    def download
+      puts "Downloading."
+      `curl -s -L #{@url} -o #{@download_path}`
+    end
+
+    def unpack; end
+
+    def make_dwca
+      DwcaHunter.logger_write(object_id, "Extracting data")
+      get_names
+      generate_dwca
+    end
+
+    private
+
+    def get_names
+      Dir.chdir(@download_dir)
+      collect_names
+    end
+
+    def collect_names
+      dupes = {}
+      @names_index = {}
+      file = CSV.open(File.join(@download_dir, "data.csv"),
+                      headers: false, col_sep: "\t")
+      file.each_with_index do |row, i|
+        next if dupes.key?(row[1])
+
+        dupes[row[1]] = true
+        taxon_id = row[0]
+        name_string = row[1]
+
+        @names << { taxon_id: taxon_id,
+                    name_string: name_string }
+        puts "Processed %s names" % i if i % 10_000 == 0
+      end
+    end
+
+    def generate_dwca
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
+      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                "http://rs.tdwg.org/dwc/terms/scientificName",
+                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+      @names.each do |n|
+        @core << [n[:taxon_id], n[:name_string], "ICZN"]
+      end
+
+      @eml = {
+        id: @uuid,
+        title: @title,
+        authors: [
+          { first_name: "Charles Davies",
+            last_name: "Sherborn" }
+        ],
+        metadata_providers: [
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@gmail.com" }
+        ],
+        abstract: "Index Animalium is a monumental work that covers " \
+                  "400 000 zoological names registered by science " \
+                  "between 1758 and 1850",
+        url: @url
+      }
+      super
+    end
+  end
+end
```
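For orientation, `collect_names` reads the tab-separated `data.csv` with an identifier in column 0 and a name string in column 1, skipping duplicate name strings, and `generate_dwca` turns every kept row into a three-column core record. A small illustration with a made-up input row (the row contents below are hypothetical):

```ruby
# One line of data.csv, already split by CSV on tabs (hypothetical values):
row = ["1", "Aaages scalpturatus Fall, 1901"]

# The record ResourceSherborn#generate_dwca appends to @core for that row:
core_record = [row[0], row[1], "ICZN"]
# => ["1", "Aaages scalpturatus Fall, 1901", "ICZN"]
# Columns: dwc:taxonID, dwc:scientificName, dwc:nomenclaturalCode
```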
data/lib/dwca_hunter/resources/wikispecies.rb:

```diff
@@ -1,16 +1,17 @@
-#
+# frozen_string_literal: true
+
 module DwcaHunter
   class ResourceWikispecies < DwcaHunter::Resource
     def initialize(opts = {})
-      @wikisp_path = File.join(Dir.tmpdir,
-      @problems_file = open(File.join(Dir.tmpdir,
+      @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
+      @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
       @command = "wikispecies"
-      @title =
-      @url =
-
+      @title = "Wikispecies"
+      @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
+             "specieswiki-latest-pages-articles.xml.bz2"
       @url = opts[:url] if opts[:url]
-      @uuid =
-      @download_path = File.join(@wikisp_path,
+      @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
+      @download_path = File.join(@wikisp_path, "data.xml.bz2")
       @data = []
       @templates = {}
       @taxon_ids = {}
@@ -19,7 +20,7 @@ module DwcaHunter
       @extensions = []
       @re = {
         page_start: /^\s*\<page\>\s*$/,
-        page_end:
+        page_end: %r{^\s*\</page\>\s*$},
         template: /Template:/i,
         template_link: /\{\{([^\}]*)\}\}/,
         vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
@@ -27,6 +28,11 @@ module DwcaHunter
       super(opts)
     end

+    def download
+      puts "Downloading from the source"
+      `curl -L #{@url} -o #{@download_path}`
+    end
+
     def unpack
       unpack_bz2
     end
@@ -37,22 +43,22 @@ module DwcaHunter
       generate_dwca
     end

-
+    private

     def enrich_data
-      DwcaHunter
-
+      DwcaHunter.logger_write(object_id,
+                              "Extracting data from xml file...")
       Dir.chdir(@download_dir)
-      f = open(
+      f = open("data.xml", "r:utf-8")
       page_on = false
-      page =
+      page = ""
       page_num = 0
       f.each do |l|
         if l.match(@re[:page_start])
-          page
+          page += l
           page_on = true
         elsif page_on
-          page
+          page += l
           if l.match(@re[:page_end])
             page_on = false
             page_xml = Nokogiri::XML.parse(page)
@@ -61,22 +67,22 @@ module DwcaHunter
             process_species(page_xml)
             page_num += 1
             if page_num % BATCH_SIZE == 0
-              DwcaHunter
-
+              DwcaHunter.logger_write(object_id,
+                                      "Traversed %s pages" % page_num)
             end
-            page =
+            page = ""
             @page_title = nil
             @page_id = nil
           end
         end
       end
-      DwcaHunter
-
+      DwcaHunter.logger_write(object_id,
+                              "Extracted total %s pages" % page_num)
      f.close
    end

    def extend_classification
-      DwcaHunter
+      DwcaHunter.logger_write(object_id, "Extending classifications")
      @data.each_with_index do |d, i|
        unless d[:classificationPath].empty?
          n = 50
@@ -98,19 +104,21 @@ module DwcaHunter
        # d[:classificationPath] = d[:classificationPath].join("|").
        # gsub("Main Page", "Life")
        if i % BATCH_SIZE == 0 && i > 0
-          DwcaHunter
-
+          DwcaHunter.logger_write(object_id,
+                                  "Extended %s classifications" % i)
        end
      end
    end

    def update_tree(path)
      path = path.dup
-      return if @paths.
+      return if @paths.key?(path.join("|"))
+
      (0...path.size).each do |i|
        subpath = path[0..i]
-        subpath_string = subpath.join(
-        next if @paths.
+        subpath_string = subpath.join("|")
+        next if @paths.key?(subpath_string)
+
        name = subpath.pop
        tree_element = subpath.inject(@tree) { |res, n| res[n] }
        tree_element[name] = {}
@@ -119,27 +127,29 @@ module DwcaHunter
    end

    def process_template(x)
-      name = page_title(x).gsub!(@re[:template],
-      text = x.xpath(
+      name = page_title(x).gsub!(@re[:template], "").strip
+      text = x.xpath("//text").text.strip
      parent_name = text.match(@re[:template_link])
      if parent_name
        return if parent_name[1].match(/\#if/)
+
        list = parent_name[1].split("|")
-        if list.size == 1
-
-
-
-
-
+        parent_name = if list.size == 1
+                        list[0]
+                      elsif list[0].match(/Taxonav/i)
+                        list[1]
+                      else
+                        list[0]
+                      end
      end
-      name.gsub!(/_/,
-      parent_name
+      name.gsub!(/_/, " ")
+      parent_name&.gsub!(/_/, " ")
      @templates[name] = { parentName: parent_name, id: page_id(x) }
    end

    def process_species(x)
      return if page_title(x).match(/Wikispecies/i)
+
      items = find_species_components(x)
      if items
        @data << {
@@ -147,7 +157,8 @@ module DwcaHunter
          canonicalForm: page_title(x),
          scientificName: page_title(x),
          classificationPath: [],
-          vernacularNames: []
+          vernacularNames: []
+        }
        get_full_scientific_name(items)
        get_vernacular_names(items)
        init_classification_path(items)
@@ -155,8 +166,8 @@ module DwcaHunter
    end

    def get_full_scientific_name(items)
-      if items[
-        if name = items[
+      if items["name"]
+        if name = items["name"][0]
          @data[-1][:scientificName] = parse_name(name, @data[-1])
        else
          @problems_file.write("%s\n" % @data[-1][:canonicalForm])
@@ -165,19 +176,20 @@ module DwcaHunter
    end

    def get_vernacular_names(items)
-      if items[
-        vn_string = items[
+      if items["vernacular names"] && !items["vernacular names"].empty?
+        vn_string = items["vernacular names"].join("")
        vn = vn_string.match(@re[:vernacular_names])
        if vn
          vn_list = vn[1].strip.split("|")
          vnames = []
          vn_list.each do |item|
-            language, name = item.split("=").map
-
-
-
-
-
+            language, name = item.split("=").map(&:strip)
+            next unless language && name && language.size < 4 && name.valid_encoding?
+
+            vnames << {
+              name: name,
+              language: language
+            }
          end

          @data[-1][:vernacularNames] = vnames
@@ -186,26 +198,26 @@ module DwcaHunter
    end

    def init_classification_path(items)
-
-
-
-
-
-
-
-
-
-
-      end
+      # ignore non-template links
+      items["taxonavigation"]&.each do |line|
+        line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
+        next unless template_link = line.match(@re[:template_link])
+
+        template_link = template_link[1].
+                        strip.gsub(/Template:/, "").gsub(/_/, " ")
+        unless template_link.match(/\|/)
+          @data[-1][:classificationPath] << template_link
+          break
        end
      end
    end

    def find_species_components(x)
-      items = get_items(x.xpath(
-      is_taxon_item = items.
-        items.
+      items = get_items(x.xpath("//text").text)
+      is_taxon_item = items.key?("name") ||
+                      items.key?("taxonavigation")
      return nil unless is_taxon_item
+
      items
    end
@@ -214,7 +226,7 @@ module DwcaHunter
      items = {}
      current_item = nil
      txt.split("\n").each do |l|
-        item =
+        item = l.match(/[\=]+([^\=]+)[\=]+/)
        if item
          current_item = item[1].strip.downcase
          items[current_item] = []
@@ -226,11 +238,11 @@ module DwcaHunter
    end

    def page_title(x)
-      @page_title ||= x.xpath(
+      @page_title ||= x.xpath("//title").first.text
    end

    def page_id(x)
-      @page_id ||= x.xpath(
+      @page_id ||= x.xpath("//id").first.text
    end

    def template?(page_xml)
@@ -238,110 +250,113 @@ module DwcaHunter
    end

    def parse_name(name_string, taxa)
-      name_string.gsub!(
+      name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
      name_string = name_string.strip
      old_l = name_string.dup
-      name_string.gsub!
+      name_string.gsub!(/^\*\s*/, "")
      name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
      name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
-      name_string.gsub!(/[']{2,}/,
-      name_string.gsub!(/["]{2,}/,
-      name_string.gsub!(/\:\s*\d.*$/,
-      name_string.gsub!(/,\s*\[RSD\]/i,
-      name_string.gsub!(/^\s*†\s*/,
-      name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/,
+      name_string.gsub!(/[']{2,}/, " ")
+      name_string.gsub!(/["]{2,}/, " ")
+      name_string.gsub!(/\:\s*\d.*$/, "")
+      name_string.gsub!(/,\s*\[RSD\]/i, "")
+      name_string.gsub!(/^\s*†\s*/, "")
+      name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
      # name_string = DwcaHunter::XML.unescape(name_string)
-      name_string.gsub!(/\<nowiki\>.*$/,
-      name_string.gsub!(
-      name_string.gsub!(/^\s*\†\s*/,
-      name_string.gsub!(/ /,
-      name_string.gsub!(/\s+/,
+      name_string.gsub!(/\<nowiki\>.*$/, "")
+      name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
+      name_string.gsub!(/^\s*\†\s*/, "")
+      name_string.gsub!(/ /, " ")
+      name_string.gsub!(/\s+/, " ")
      name_string = name_string.strip
      # puts "%s---%s" % [name_string, old_l]
-
+      name_string
    end

    def generate_dwca
-      DwcaHunter
-
+      DwcaHunter.logger_write(object_id,
+                              "Creating DarwinCore Archive file")
      @core = [
-        [
-
-
-
-        'http://rs.tdwg.org/dwc/terms/higherClassification',
-        'http://purl.org/dc/terms/source']
+        ["http://rs.tdwg.org/dwc/terms/taxonID",
+         "http://rs.tdwg.org/dwc/terms/scientificName",
+         "http://globalnames.org/terms/canonicalForm",
+         "http://purl.org/dc/terms/source"]
      ]
-      DwcaHunter
+      DwcaHunter.logger_write(object_id, "Assembling Core Data")
      count = 0
      @data.map do |d|
        count += 1
        if count % BATCH_SIZE == 0
-          DwcaHunter
-
+          DwcaHunter.logger_write(object_id,
+                                  "Traversing %s core data record" % count)
        end
-        taxon_id =
-
-
-
+        taxon_id = begin
+          (d[:classificationPath].empty? ?
+            d[:taxonId] :
+            @templates[d[:classificationPath].
+              last][:id])
+        rescue StandardError
+          d[:taxonId]
+        end
        @taxon_ids[d[:taxonId]] = taxon_id
-        parentNameUsageId =
-
-
-
-
+        parentNameUsageId = begin
+          (d[:classificationPath].size > 1 ?
+            @templates[d[:classificationPath][-2]][:id] :
+            nil)
+        rescue StandardError
+          nil
+        end
+        url = "http://species.wikimedia.org/wiki/" +
+              URI.encode(d[:canonicalForm].gsub(" ", "_"))
        path = d[:classificationPath]
        path.pop if path[-1] == d[:canonicalForm]
-        canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/,
-        scientific_name =
+        canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
+        scientific_name = d[:scientificName] == d[:canonicalForm] ?
          canonical_form :
          d[:scientificName]
        @core << [taxon_id,
                  scientific_name,
-                  parentNameUsageId,
                  canonical_form,
-                  path.join('|'),
                  url]
      end
      @extensions << { data: [[
-
-
-
-        ]], file_name:
-      DwcaHunter
-
+        "http://rs.tdwg.org/dwc/terms/TaxonID",
+        "http://rs.tdwg.org/dwc/terms/vernacularName",
+        "http://purl.org/dc/terms/language"
+      ]], file_name: "vernacular_names.txt" }
+      DwcaHunter.logger_write(object_id,
+        "Creating verncaular name extension for DarwinCore Archive file")
      count = 0
      @data.each do |d|
        count += 1
        if count % BATCH_SIZE == 0
-          DwcaHunter
-
+          DwcaHunter.logger_write(object_id,
+                                  "Traversing %s extension data record" % count)
        end
        d[:vernacularNames].each do |vn|
-          taxon_id = @taxon_ids[d[:taxonId]]
-          if taxon_id
-            @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
-          end
+          taxon_id = @taxon_ids[d[:taxonId]] || nil
+          @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
        end
      end
      @eml = {
        id: @uuid,
        title: @title,
-        license:
+        license: "http://creativecommons.org/licenses/by-sa/3.0/",
        authors: [
-          { first_name:
-            last_name:
-            email:
-            url:
-
+          { first_name: "Stephen",
+            last_name: "Thorpe",
+            email: "stephen_thorpe@yahoo.co.nz",
+            url: "http://species.wikimedia.org/wiki/Main_Page" }
+        ],
+        abstract: "The free species directory that anyone can edit.",
        metadata_providers: [
-          { first_name:
-            last_name:
-            email:
-
+          { first_name: "Dmitry",
+            last_name: "Mozzherin",
+            email: "dmozzherin@mbl.edu" }
+        ],
+        url: "http://species.wikimedia.org/wiki/Main_Page"
      }
      super
    end
-
    end
 end
```