dwca_hunter 0.5.5 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +50 -77
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +31 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +93 -91
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -127
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +27 -34
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
# Converts the CSV files found in a download directory (taxa.csv, refs.csv,
# taxa_refs.csv and occurences.csv) into pretty-printed JSON files written to
# the "json" subdirectory of that directory.
class PaleodbHarvester
  # Creates "<download_dir>/json" and opens the four input CSV files with
  # their header rows enabled.
  #
  # @param download_dir [String] directory that holds the input CSV files
  def initialize(download_dir)
    @dir = File.join(download_dir, "json")
    FileUtils.mkdir_p(@dir)
    @in_dir = download_dir
    @taxa_csv = CSV.open(File.join(@in_dir, "taxa.csv"), headers: true)
    @refs_csv = CSV.open(File.join(@in_dir, "refs.csv"), headers: true)
    @taxa_refs_csv = CSV.open(File.join(@in_dir, "taxa_refs.csv"), headers: true)
    @occurences_csv = CSV.open(File.join(@in_dir, "occurences.csv"), headers: true)
  end

  # Reads taxa.csv and writes two files: taxa.json (records keyed by
  # "taxon_no") and name_id.json (taxon name => ids).
  #
  # Columns read: "orig_no","taxon_no","taxon_rank","taxon_name",
  # "taxon_attr","is_extant","common_name","difference","accepted_no",
  # "accepted_rank","accepted_name","parent_no","reference_no","n_occs",
  # plus the columns used by #ecol and #enterer.
  #
  # @return [nil]
  def taxa
    taxa = {}
    name2id = {}
    @taxa_csv.each do |r|
      r = strip(r)
      taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
                              rank: r["taxon_rank"], name: r["taxon_name"],
                              auth: r["taxon_attr"],
                              extinct: extinct(r["is_extant"]),
                              vernacular: r["common_name"],
                              annot: r["difference"], acc_id: r["accepted_no"],
                              acc_rank: r["accepted_rank"],
                              acc_name: r["accepted_name"], ecol: ecol(r),
                              parent_id: r["parent_no"], ref: r["reference_no"],
                              occs_num: r["n_occs"], enterer: enterer(r) }

      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
    end
    write_json("taxa.json", taxa)
    write_json("name_id.json", name2id)
  end

  # Joins the distinct, non-blank "enterer" and "modifier" values of a row
  # with ", ". Returns "" when both are blank or missing.
  def enterer(r)
    [r["enterer"], r["modifier"]].map { |e| e.to_s.strip }
                                 .uniq
                                 .reject(&:empty?)
                                 .join(", ")
  end

  # Maps the "is_extant" column value to a flag: 1 for "extinct", 0 otherwise.
  def extinct(val)
    val == "extinct" ? 1 : 0
  end

  # Returns "<life_habit> <diet>" for a row (missing values render as empty).
  def ecol(row)
    row = strip row
    "#{row['life_habit']} #{row['diet']}"
  end

  # Reads refs.csv and writes refs.json keyed by "reference_no".
  #
  # Each record holds: id, author (all authors), authorship (first two
  # authors), year ("pubyr"), title ("reftitle") and details (see #details).
  #
  # @return [nil]
  def refs
    refs = {}
    @refs_csv.each do |r|
      r = strip r
      authorship, author = authors(r)
      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
                                  authorship: authorship,
                                  year: r["pubyr"], title: r["reftitle"],
                                  details: details(r) }
    end
    write_json("refs.json", refs)
  end

  # Builds author strings from "author1init"/"author1last",
  # "author2init"/"author2last" and "otherauthors".
  #
  # @return [Array(String, String)] the first two authors joined with ", "
  #   and all authors joined with ", "
  def authors(row)
    row = strip row
    au = ["#{row['author1init']} #{row['author1last']}",
          "#{row['author2init']} #{row['author2last']}",
          row["otherauthors"].to_s].map(&:strip)
    au = au.reject(&:empty?).map { |a| a.gsub(/\s{2,}/, " ").strip }
    [au[0..1].join(", "), au.join(", ")]
  end

  # Builds the "details" string of a reference:
  # "<pubtitle> <pubno>: <firstpage>--<lastpage> (<doi>)", leaving out every
  # part whose column is blank or missing.
  def details(row)
    row = strip row
    # String.new guarantees a mutable buffer for the appends below.
    ref = String.new(row["pubtitle"].to_s)
    # CSV yields nil for unquoted empty cells, so test for nil as well as "".
    ref << " #{row['pubno']}" unless blank?(row["pubno"])
    ref << ": #{row['firstpage']}" unless blank?(row["firstpage"])
    ref << "--#{row['lastpage']}" unless blank?(row["lastpage"])
    ref << " (#{row['doi']})" unless blank?(row["doi"])
    ref.gsub(/\s{2,}/, " ").strip
  end

  # Reads taxa_refs.csv and writes taxa_refs.json: "accepted_no" => list of
  # { acc_id, name, ref_id } records, in file order.
  #
  # @return [nil]
  def taxa_refs
    tr = {}
    @taxa_refs_csv.each do |r|
      r = strip r
      row = { acc_id: r["accepted_no"], name: r["accepted_name"],
              ref_id: r["reference_no"] }
      (tr[r["accepted_no"]] ||= []) << row
    end
    write_json("taxa_refs.json", tr)
  end

  # Reads occurences.csv and writes occurences.json: "accepted_no" => list of
  # { id, name, country, state, age_min, age_max } records, in file order.
  #
  # @return [nil]
  def occurences
    occ = {}
    @occurences_csv.each do |r|
      r = strip r
      row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
              state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
      (occ[r["accepted_no"]] ||= []) << row
    end
    write_json("occurences.json", occ)
  end

  # Copies a row (CSV row or Hash) into a plain Hash whose values have
  # surrounding whitespace removed; nil values stay nil.
  def strip(row)
    row.each_with_object({}) do |(k, v), h|
      h[k] = v&.strip
    end
  end

  private

  # True when a CSV field is missing (nil) or empty.
  def blank?(value)
    value.nil? || value.empty?
  end

  # Writes +data+ as pretty-printed JSON to +file_name+ inside the json
  # directory. The block form closes the file even if writing raises.
  def write_json(file_name, data)
    File.open(File.join(@dir, file_name), "w:utf-8") do |f|
      f.write(JSON.pretty_generate(data))
    end
    nil
  end
end
|
140
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Resource that turns a tab-separated names file ("Index Animalium") into a
  # DarwinCore Archive. Downloading stores the file as data.csv; the first
  # column is used as taxon id and the second as the name string.
  class ResourceSherborn < DwcaHunter::Resource
    # Sets the resource metadata and download location, then hands +opts+ to
    # the parent class.
    def initialize(opts = {})
      @command = "sherborn"
      @title = "Index Animalium"
      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
      # Must be @uuid (lower case): that is the variable #generate_dwca reads
      # when it fills in the EML id.
      @uuid = "05ad6ca2-fc37-47f4-983a-72e535420e28"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "sherborn",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Downloads @url to @download_path with curl (silent, following redirects).
    def download
      puts "Downloading."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain text file.
    def unpack; end

    # Collects the names from the downloaded file and writes the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches to the download directory and collects the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Reads data.csv (tab-separated, no header row) and appends one
    # { taxon_id:, name_string: } entry to @names per distinct name string;
    # only the first row seen for a given name string is kept.
    def collect_names
      dupes = {}
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      # Block form closes the CSV handle when reading is done or fails.
      CSV.open(path, headers: false, col_sep: "\t") do |file|
        file.each_with_index do |row, i|
          next if dupes.key?(row[1])

          dupes[row[1]] = true
          @names << { taxon_id: row[0], name_string: row[1] }
          puts "Processed %s names" % i if (i % 10_000).zero?
        end
      end
    end

    # Builds the core table (taxonID, scientificName, nomenclaturalCode) and
    # the EML metadata, then lets the parent class produce the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], "ICZN"]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Charles Davies",
            last_name: "Sherborn" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Index Animalium is a monumental work that covers " \
                  "400 000 zoological names registered by science " \
                  "between 1758 and 1850",
        url: @url
      }
      super
    end
  end
end
|
@@ -1,16 +1,17 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceWikispecies < DwcaHunter::Resource
|
4
5
|
def initialize(opts = {})
|
5
|
-
@wikisp_path = File.join(Dir.tmpdir,
|
6
|
-
@problems_file = open(File.join(Dir.tmpdir,
|
6
|
+
@wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
|
7
|
+
@problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
7
8
|
@command = "wikispecies"
|
8
|
-
@title =
|
9
|
-
@url =
|
10
|
-
|
9
|
+
@title = "Wikispecies"
|
10
|
+
@url = "http://dumps.wikimedia.org/specieswiki/latest/" \
|
11
|
+
"specieswiki-latest-pages-articles.xml.bz2"
|
11
12
|
@url = opts[:url] if opts[:url]
|
12
|
-
@uuid =
|
13
|
-
@download_path = File.join(@wikisp_path,
|
13
|
+
@uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
|
14
|
+
@download_path = File.join(@wikisp_path, "data.xml.bz2")
|
14
15
|
@data = []
|
15
16
|
@templates = {}
|
16
17
|
@taxon_ids = {}
|
@@ -19,7 +20,7 @@ module DwcaHunter
|
|
19
20
|
@extensions = []
|
20
21
|
@re = {
|
21
22
|
page_start: /^\s*\<page\>\s*$/,
|
22
|
-
page_end:
|
23
|
+
page_end: %r{^\s*\</page\>\s*$},
|
23
24
|
template: /Template:/i,
|
24
25
|
template_link: /\{\{([^\}]*)\}\}/,
|
25
26
|
vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
|
@@ -27,6 +28,11 @@ module DwcaHunter
|
|
27
28
|
super(opts)
|
28
29
|
end
|
29
30
|
|
31
|
+
# Downloads @url to @download_path with curl, following redirects.
#
# @url may have been supplied by the caller through opts[:url], so the
# command is passed as an argument list and never goes through a shell:
# characters such as "&", ";" or spaces in the URL or the path cannot split
# or extend the command.
#
# @return [String] whatever curl printed to standard output
def download
  puts "Downloading from the source"
  IO.popen(["curl", "-L", @url, "-o", @download_path], &:read)
end
|
35
|
+
|
30
36
|
def unpack
|
31
37
|
unpack_bz2
|
32
38
|
end
|
@@ -37,22 +43,22 @@ module DwcaHunter
|
|
37
43
|
generate_dwca
|
38
44
|
end
|
39
45
|
|
40
|
-
|
46
|
+
private
|
41
47
|
|
42
48
|
def enrich_data
|
43
|
-
DwcaHunter
|
44
|
-
|
49
|
+
DwcaHunter.logger_write(object_id,
|
50
|
+
"Extracting data from xml file...")
|
45
51
|
Dir.chdir(@download_dir)
|
46
|
-
f = open(
|
52
|
+
f = open("data.xml", "r:utf-8")
|
47
53
|
page_on = false
|
48
|
-
page =
|
54
|
+
page = ""
|
49
55
|
page_num = 0
|
50
56
|
f.each do |l|
|
51
57
|
if l.match(@re[:page_start])
|
52
|
-
page
|
58
|
+
page += l
|
53
59
|
page_on = true
|
54
60
|
elsif page_on
|
55
|
-
page
|
61
|
+
page += l
|
56
62
|
if l.match(@re[:page_end])
|
57
63
|
page_on = false
|
58
64
|
page_xml = Nokogiri::XML.parse(page)
|
@@ -61,22 +67,22 @@ module DwcaHunter
|
|
61
67
|
process_species(page_xml)
|
62
68
|
page_num += 1
|
63
69
|
if page_num % BATCH_SIZE == 0
|
64
|
-
DwcaHunter
|
65
|
-
|
70
|
+
DwcaHunter.logger_write(object_id,
|
71
|
+
"Traversed %s pages" % page_num)
|
66
72
|
end
|
67
|
-
page =
|
73
|
+
page = ""
|
68
74
|
@page_title = nil
|
69
75
|
@page_id = nil
|
70
76
|
end
|
71
77
|
end
|
72
78
|
end
|
73
|
-
DwcaHunter
|
74
|
-
|
79
|
+
DwcaHunter.logger_write(object_id,
|
80
|
+
"Extracted total %s pages" % page_num)
|
75
81
|
f.close
|
76
82
|
end
|
77
83
|
|
78
84
|
def extend_classification
|
79
|
-
DwcaHunter
|
85
|
+
DwcaHunter.logger_write(object_id, "Extending classifications")
|
80
86
|
@data.each_with_index do |d, i|
|
81
87
|
unless d[:classificationPath].empty?
|
82
88
|
n = 50
|
@@ -98,19 +104,21 @@ module DwcaHunter
|
|
98
104
|
# d[:classificationPath] = d[:classificationPath].join("|").
|
99
105
|
# gsub("Main Page", "Life")
|
100
106
|
if i % BATCH_SIZE == 0 && i > 0
|
101
|
-
DwcaHunter
|
102
|
-
|
107
|
+
DwcaHunter.logger_write(object_id,
|
108
|
+
"Extended %s classifications" % i)
|
103
109
|
end
|
104
110
|
end
|
105
111
|
end
|
106
112
|
|
107
113
|
def update_tree(path)
|
108
114
|
path = path.dup
|
109
|
-
return if @paths.
|
115
|
+
return if @paths.key?(path.join("|"))
|
116
|
+
|
110
117
|
(0...path.size).each do |i|
|
111
118
|
subpath = path[0..i]
|
112
|
-
subpath_string = subpath.join(
|
113
|
-
next if @paths.
|
119
|
+
subpath_string = subpath.join("|")
|
120
|
+
next if @paths.key?(subpath_string)
|
121
|
+
|
114
122
|
name = subpath.pop
|
115
123
|
tree_element = subpath.inject(@tree) { |res, n| res[n] }
|
116
124
|
tree_element[name] = {}
|
@@ -119,27 +127,29 @@ module DwcaHunter
|
|
119
127
|
end
|
120
128
|
|
121
129
|
# Records a template page in @templates, keyed by the page title with the
# @re[:template] match removed and underscores replaced by spaces.
#
# The parent name comes from the first {{...}} link in the page text: for a
# link split on "|" into several parts whose first part matches /Taxonav/i
# the second part is used, otherwise the first part. Pages whose first link
# contains "#if" are skipped. A page without a link, or with an empty link,
# is stored with a nil parent name.
#
# @param x page XML node; must respond to xpath("//text").text
# @return [Hash, nil] the stored entry, or nil when the page is skipped
def process_template(x)
  # Non-destructive gsub on purpose: gsub! returns nil when nothing is
  # replaced and would also edit the memoized page title in place.
  name = page_title(x).gsub(@re[:template], "").strip.tr("_", " ")
  text = x.xpath("//text").text.strip
  parent_name = nil
  if (link = text.match(@re[:template_link]))
    return if link[1].match?(/\#if/)

    parts = link[1].split("|")
    taxonav = parts.size > 1 && parts[0].match?(/Taxonav/i)
    # parts is empty for "{{}}", which leaves parent_name nil.
    parent_name = (taxonav ? parts[1] : parts[0])&.tr("_", " ")
  end
  @templates[name] = { parentName: parent_name, id: page_id(x) }
end
|
140
149
|
|
141
150
|
def process_species(x)
|
142
151
|
return if page_title(x).match(/Wikispecies/i)
|
152
|
+
|
143
153
|
items = find_species_components(x)
|
144
154
|
if items
|
145
155
|
@data << {
|
@@ -147,7 +157,8 @@ module DwcaHunter
|
|
147
157
|
canonicalForm: page_title(x),
|
148
158
|
scientificName: page_title(x),
|
149
159
|
classificationPath: [],
|
150
|
-
vernacularNames: []
|
160
|
+
vernacularNames: []
|
161
|
+
}
|
151
162
|
get_full_scientific_name(items)
|
152
163
|
get_vernacular_names(items)
|
153
164
|
init_classification_path(items)
|
@@ -155,8 +166,8 @@ module DwcaHunter
|
|
155
166
|
end
|
156
167
|
|
157
168
|
def get_full_scientific_name(items)
|
158
|
-
if items[
|
159
|
-
if name = items[
|
169
|
+
if items["name"]
|
170
|
+
if name = items["name"][0]
|
160
171
|
@data[-1][:scientificName] = parse_name(name, @data[-1])
|
161
172
|
else
|
162
173
|
@problems_file.write("%s\n" % @data[-1][:canonicalForm])
|
@@ -165,19 +176,20 @@ module DwcaHunter
|
|
165
176
|
end
|
166
177
|
|
167
178
|
def get_vernacular_names(items)
|
168
|
-
if items[
|
169
|
-
vn_string = items[
|
179
|
+
if items["vernacular names"] && !items["vernacular names"].empty?
|
180
|
+
vn_string = items["vernacular names"].join("")
|
170
181
|
vn = vn_string.match(@re[:vernacular_names])
|
171
182
|
if vn
|
172
183
|
vn_list = vn[1].strip.split("|")
|
173
184
|
vnames = []
|
174
185
|
vn_list.each do |item|
|
175
|
-
language, name = item.split("=").map
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
186
|
+
language, name = item.split("=").map(&:strip)
|
187
|
+
next unless language && name && language.size < 4 && name.valid_encoding?
|
188
|
+
|
189
|
+
vnames << {
|
190
|
+
name: name,
|
191
|
+
language: language
|
192
|
+
}
|
181
193
|
end
|
182
194
|
|
183
195
|
@data[-1][:vernacularNames] = vnames
|
@@ -186,26 +198,26 @@ module DwcaHunter
|
|
186
198
|
end
|
187
199
|
|
188
200
|
# Seeds the classification path of the newest @data record with the first
# usable template link found in the "taxonavigation" lines.
#
# For each line, [[...]] wiki links are dropped first, then the first {{...}}
# link is taken, "Template:" is removed and underscores become spaces. A
# link that contains "|" is skipped; the first one without "|" is appended
# to @data[-1][:classificationPath] and the search stops.
#
# The lines are not modified: cleaning works on copies, so the strings held
# by +items+ stay as parsed.
#
# @param items [Hash] section name => array of text lines
def init_classification_path(items)
  Array(items["taxonavigation"]).each do |line|
    cleaned = line.gsub(/\[\[.*\]\]/, "") # ignore non-template links
    link = cleaned[@re[:template_link], 1]
    next unless link

    template_name = link.strip.gsub("Template:", "").tr("_", " ")
    next if template_name.include?("|")

    @data[-1][:classificationPath] << template_name
    break
  end
end
|
203
214
|
|
204
215
|
def find_species_components(x)
|
205
|
-
items = get_items(x.xpath(
|
206
|
-
is_taxon_item = items.
|
207
|
-
items.
|
216
|
+
items = get_items(x.xpath("//text").text)
|
217
|
+
is_taxon_item = items.key?("name") ||
|
218
|
+
items.key?("taxonavigation")
|
208
219
|
return nil unless is_taxon_item
|
220
|
+
|
209
221
|
items
|
210
222
|
end
|
211
223
|
|
@@ -214,7 +226,7 @@ module DwcaHunter
|
|
214
226
|
items = {}
|
215
227
|
current_item = nil
|
216
228
|
txt.split("\n").each do |l|
|
217
|
-
item =
|
229
|
+
item = l.match(/[\=]+([^\=]+)[\=]+/)
|
218
230
|
if item
|
219
231
|
current_item = item[1].strip.downcase
|
220
232
|
items[current_item] = []
|
@@ -226,11 +238,11 @@ module DwcaHunter
|
|
226
238
|
end
|
227
239
|
|
228
240
|
def page_title(x)
|
229
|
-
@page_title ||= x.xpath(
|
241
|
+
@page_title ||= x.xpath("//title").first.text
|
230
242
|
end
|
231
243
|
|
232
244
|
def page_id(x)
|
233
|
-
@page_id ||= x.xpath(
|
245
|
+
@page_id ||= x.xpath("//id").first.text
|
234
246
|
end
|
235
247
|
|
236
248
|
def template?(page_xml)
|
@@ -238,110 +250,113 @@ module DwcaHunter
|
|
238
250
|
end
|
239
251
|
|
240
252
|
def parse_name(name_string, taxa)
|
241
|
-
name_string.gsub!(
|
253
|
+
name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
|
242
254
|
name_string = name_string.strip
|
243
255
|
old_l = name_string.dup
|
244
|
-
name_string.gsub!
|
256
|
+
name_string.gsub!(/^\*\s*/, "")
|
245
257
|
name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
|
246
258
|
name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
|
247
|
-
name_string.gsub!(/[']{2,}/,
|
248
|
-
name_string.gsub!(/["]{2,}/,
|
249
|
-
name_string.gsub!(/\:\s*\d.*$/,
|
250
|
-
name_string.gsub!(/,\s*\[RSD\]/i,
|
251
|
-
name_string.gsub!(/^\s*†\s*/,
|
252
|
-
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/,
|
259
|
+
name_string.gsub!(/[']{2,}/, " ")
|
260
|
+
name_string.gsub!(/["]{2,}/, " ")
|
261
|
+
name_string.gsub!(/\:\s*\d.*$/, "")
|
262
|
+
name_string.gsub!(/,\s*\[RSD\]/i, "")
|
263
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
264
|
+
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
|
253
265
|
# name_string = DwcaHunter::XML.unescape(name_string)
|
254
|
-
name_string.gsub!(/\<nowiki\>.*$/,
|
255
|
-
name_string.gsub!(
|
256
|
-
name_string.gsub!(/^\s*\†\s*/,
|
257
|
-
name_string.gsub!(/ /,
|
258
|
-
name_string.gsub!(/\s+/,
|
266
|
+
name_string.gsub!(/\<nowiki\>.*$/, "")
|
267
|
+
name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
|
268
|
+
name_string.gsub!(/^\s*\†\s*/, "")
|
269
|
+
name_string.gsub!(/ /, " ")
|
270
|
+
name_string.gsub!(/\s+/, " ")
|
259
271
|
name_string = name_string.strip
|
260
272
|
# puts "%s---%s" % [name_string, old_l]
|
261
|
-
|
273
|
+
name_string
|
262
274
|
end
|
263
275
|
|
264
276
|
# Assembles the core table, the vernacular-name extension and the EML
# metadata from @data, then lets the parent class write the archive.
#
# Core columns: taxonID, scientificName, canonicalForm, source (page URL).
# The taxon id of a record is the id of the template named by the last
# element of its classification path; when the path is empty or no such
# template is known, the record's own :taxonId is used.
def generate_dwca
  DwcaHunter.logger_write(object_id,
                          "Creating DarwinCore Archive file")
  @core = [
    ["http://rs.tdwg.org/dwc/terms/taxonID",
     "http://rs.tdwg.org/dwc/terms/scientificName",
     "http://globalnames.org/terms/canonicalForm",
     "http://purl.org/dc/terms/source"]
  ]
  DwcaHunter.logger_write(object_id, "Assembling Core Data")
  count = 0
  @data.each do |d|
    count += 1
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing %s core data record" % count)
    end
    path = d[:classificationPath]
    # Explicit lookup instead of rescuing the NoMethodError that a missing
    # template used to trigger.
    template = path.empty? ? nil : @templates[path.last]
    taxon_id = template ? template[:id] : d[:taxonId]
    @taxon_ids[d[:taxonId]] = taxon_id
    # URI.encode no longer exists in Ruby 3.0+; the default parser's
    # #escape performs the same escaping.
    url = "http://species.wikimedia.org/wiki/" +
          URI::DEFAULT_PARSER.escape(d[:canonicalForm].gsub(" ", "_"))
    # Drops the record's own name from the end of its stored path.
    path.pop if path[-1] == d[:canonicalForm]
    canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
    scientific_name = if d[:scientificName] == d[:canonicalForm]
                        canonical_form
                      else
                        d[:scientificName]
                      end
    @core << [taxon_id,
              scientific_name,
              canonical_form,
              url]
  end
  @extensions << { data: [[
    "http://rs.tdwg.org/dwc/terms/TaxonID",
    "http://rs.tdwg.org/dwc/terms/vernacularName",
    "http://purl.org/dc/terms/language"
  ]], file_name: "vernacular_names.txt" }
  DwcaHunter.logger_write(object_id,
                          "Creating vernacular name extension for DarwinCore Archive file")
  count = 0
  @data.each do |d|
    count += 1
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing %s extension data record" % count)
    end
    taxon_id = @taxon_ids[d[:taxonId]]
    # Records without a resolved taxon id contribute no extension rows.
    next unless taxon_id

    d[:vernacularNames].each do |vn|
      @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
    end
  end
  @eml = {
    id: @uuid,
    title: @title,
    license: "http://creativecommons.org/licenses/by-sa/3.0/",
    authors: [
      { first_name: "Stephen",
        last_name: "Thorpe",
        email: "stephen_thorpe@yahoo.co.nz",
        url: "http://species.wikimedia.org/wiki/Main_Page" }
    ],
    abstract: "The free species directory that anyone can edit.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@mbl.edu" }
    ],
    url: "http://species.wikimedia.org/wiki/Main_Page"
  }
  super
end
|
345
|
-
|
346
361
|
end
|
347
362
|
end
|