dwca_hunter 0.5.3 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +11 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +90 -84
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +13 -12
- data/exe/dwcahunter +1 -5
- data/lib/dwca_hunter.rb +33 -0
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +115 -149
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ion.rb +98 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +186 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +166 -184
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +54 -32
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
# Converts the PaleoBioDB CSV dumps found in a download directory
# (taxa.csv, refs.csv, taxa_refs.csv, occurences.csv) into JSON files that
# are written to a "json" subdirectory of that same directory.
#
# Each CSV file is opened only while it is being converted and is closed
# afterwards, so every conversion method can be called repeatedly and yields
# the same output each time.
class PaleodbHarvester
  # @param download_dir [String] directory that holds the CSV files; the
  #   "json" output subdirectory is created inside it if missing.
  def initialize(download_dir)
    @dir = File.join(download_dir, "json")
    FileUtils.mkdir_p(@dir)
    @in_dir = download_dir
  end

  # Reads taxa.csv and writes two files: taxa.json (one record per
  # "taxon_no") and name_id.json (taxon name => taxon/accepted ids).
  #
  # Columns listed by the original author:
  # "orig_no","taxon_no","record_type","flags","taxon_rank",
  # "taxon_name","difference","accepted_no","accepted_rank",
  # "accepted_name","parent_no","reference_no","is_extant","n_occs"
  #
  # @return [nil]
  def taxa
    taxa = {}
    name2id = {}
    each_row("taxa.csv") do |r|
      taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
                              rank: r["taxon_rank"], name: r["taxon_name"],
                              auth: r["taxon_attr"],
                              extinct: extinct(r["is_extant"]),
                              vernacular: r["common_name"],
                              annot: r["difference"], acc_id: r["accepted_no"],
                              acc_rank: r["accepted_rank"],
                              acc_name: r["accepted_name"], ecol: ecol(r),
                              parent_id: r["parent_no"], ref: r["reference_no"],
                              occs_num: r["n_occs"], enterer: enterer(r) }

      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
    end
    write_json("taxa.json", taxa)
    write_json("name_id.json", name2id)
  end

  # Joins the distinct, non-blank "enterer" and "modifier" values of a row.
  #
  # @param r [#[]] row addressed by column name
  # @return [String] comma-separated names, or "" when both are blank
  def enterer(r)
    [r["enterer"], r["modifier"]]
      .map { |person| person.to_s.strip }
      .uniq
      .reject(&:empty?)
      .join(", ")
  end

  # @param val [String, nil] value of the "is_extant" column
  # @return [Integer] 1 when val is exactly "extinct", otherwise 0
  def extinct(val)
    val == "extinct" ? 1 : 0
  end

  # @param row [#each] row with "life_habit" and "diet" columns
  # @return [String] both values separated by a single space
  def ecol(row)
    row = strip row
    "#{row['life_habit']} #{row['diet']}"
  end

  # Reads refs.csv and writes refs.json keyed by "reference_no".
  #
  # Columns listed by the original author:
  # "reference_no","record_type","ref_type","author1init","author1last",
  # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
  # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
  # "language","doi"
  #
  # Example record noted by the original author:
  # {"id":31671,"orig":true,"author":"Hahn, C. W.",
  # "year":1834,"title":"Die wanzenartigen Insecten.",
  # "details":"C. H. Zeh, Nurnberg. 2: 33--120.",
  # "distribution":"Germany","comment":"n. sp."}
  #
  # @return [nil]
  def refs
    refs = {}
    each_row("refs.csv") do |r|
      authorship, author = authors(r)
      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
                                  authorship: authorship,
                                  year: r["pubyr"], title: r["reftitle"],
                                  details: details(r) }
    end
    write_json("refs.json", refs)
  end

  # Builds author strings from the author columns of a reference row.
  #
  # @param row [#each] reference row
  # @return [Array(String, String)] the first two authors joined by ", ",
  #   followed by all authors joined by ", "
  def authors(row)
    row = strip row
    au = ["#{row['author1init']} #{row['author1last']}",
          "#{row['author2init']} #{row['author2last']}",
          row["otherauthors"].to_s]
    au = au.map(&:strip).reject(&:empty?)
           .map { |a| a.gsub(/[\s]{2,}/, " ").strip }
    [au[0..1].join(", "), au.join(", ")]
  end

  # Builds the "details" string of a reference: publication title followed
  # by the issue number, page range and DOI when those are present.
  #
  # Blank fields may arrive as nil (empty unquoted CSV cells) as well as "",
  # so every field is normalised with to_s before the emptiness check.
  #
  # @param row [#each] reference row
  # @return [String]
  def details(row)
    row = strip row
    ref = row["pubtitle"].to_s.dup
    ref << " #{row['pubno']}" unless row["pubno"].to_s.empty?
    ref << ": #{row['firstpage']}" unless row["firstpage"].to_s.empty?
    ref << "--#{row['lastpage']}" unless row["lastpage"].to_s.empty?
    ref << " (#{row['doi']})" unless row["doi"].to_s.empty?
    ref.gsub(/[\s]{2,}/, " ").strip
  end

  # Reads taxa_refs.csv and writes taxa_refs.json: for every "accepted_no"
  # the list of its rows (accepted id, accepted name, reference id).
  #
  # @return [nil]
  def taxa_refs
    tr = {}
    each_row("taxa_refs.csv") do |r|
      row = { acc_id: r["accepted_no"], name: r["accepted_name"],
              ref_id: r["reference_no"] }
      (tr[r["accepted_no"]] ||= []) << row
    end
    write_json("taxa_refs.json", tr)
  end

  # Reads occurences.csv and writes occurences.json: for every "accepted_no"
  # the list of its occurrence rows (country, state, min/max age).
  # The file and method names keep the spelling used by the data files.
  #
  # @return [nil]
  def occurences
    occ = {}
    each_row("occurences.csv") do |r|
      row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
              state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
      (occ[r["accepted_no"]] ||= []) << row
    end
    write_json("occurences.json", occ)
  end

  # Copies a row into a plain Hash with surrounding whitespace removed from
  # every value; nil values stay nil.
  #
  # @param row [#each] anything yielding [key, value] pairs
  # @return [Hash]
  def strip(row)
    row.each_with_object({}) do |(k, v), h|
      h[k] = v.nil? ? nil : v.strip
    end
  end

  private

  # Yields every data row of a CSV file from the input directory as a
  # whitespace-stripped Hash; the file is closed when iteration ends.
  def each_row(file_name)
    CSV.foreach(File.join(@in_dir, file_name), headers: true) do |row|
      yield strip(row)
    end
  end

  # Writes data as pretty-printed JSON into the output directory. The block
  # form guarantees the file is closed even if writing fails.
  def write_json(file_name, data)
    File.open(File.join(@dir, file_name), "w:utf-8") do |f|
      f.write(JSON.pretty_generate(data))
    end
    nil
  end
end
|
140
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Resource for Sherborn's "Index Animalium": downloads a tab-separated
  # file of names and converts it into a DarwinCore Archive.
  class ResourceSherborn < DwcaHunter::Resource
    # @param opts [Hash] options passed through to DwcaHunter::Resource
    def initialize(opts = {})
      @command = "sherborn"
      @title = "Index Animalium"
      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
      # Must be @uuid (lower case): generate_dwca reads @uuid for the EML id.
      @uuid = "05ad6ca2-fc37-47f4-983a-72e535420e28"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "sherborn",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the source file with curl into @download_path.
    def download
      puts "Downloading."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is a plain text file.
    def unpack; end

    # Collects names from the downloaded file and builds the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches to the download directory and collects the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Reads data.csv (tab-separated, no header row) and appends one entry to
    # @names per distinct name string (second column); the first column is
    # used as the taxon id. Rows whose name string was already seen are
    # skipped. The block form of CSV.open closes the file when done.
    def collect_names
      dupes = {}
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      CSV.open(path, headers: false, col_sep: "\t") do |file|
        file.each_with_index do |row, i|
          next if dupes.key?(row[1])

          dupes[row[1]] = true
          @names << { taxon_id: row[0], name_string: row[1] }
          puts "Processed #{i} names" if (i % 10_000).zero?
        end
      end
    end

    # Assembles the core table and EML metadata, then delegates the actual
    # archive creation to the superclass.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], "ICZN"]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Charles Davies",
            last_name: "Sherborn" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Index Animalium is a monumental work that covers " \
                  "400 000 zoological names registered by science " \
                  "between 1758 and 1850",
        url: @url
      }
      super
    end
  end
end
|
@@ -1,145 +1,113 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
4
|
+
# Wikispecies source
|
3
5
|
class ResourceWikispecies < DwcaHunter::Resource
|
4
|
-
def initialize(opts = {})
|
5
|
-
@wikisp_path = File.join(Dir.tmpdir,
|
6
|
-
@problems_file = open(File.join(Dir.tmpdir,
|
6
|
+
def initialize(opts = { download: true, unpack: true })
|
7
|
+
@wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
|
8
|
+
@problems_file = File.open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
7
9
|
@command = "wikispecies"
|
8
|
-
@title =
|
9
|
-
@url =
|
10
|
-
|
10
|
+
@title = "Wikispecies"
|
11
|
+
@url = "http://dumps.wikimedia.org/specieswiki/latest/" \
|
12
|
+
"specieswiki-latest-pages-articles.xml.bz2"
|
11
13
|
@url = opts[:url] if opts[:url]
|
12
|
-
@uuid =
|
13
|
-
@download_path = File.join(@wikisp_path,
|
14
|
+
@uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
|
15
|
+
@download_path = File.join(@wikisp_path, "data.xml.bz2")
|
14
16
|
@data = []
|
15
17
|
@templates = {}
|
16
18
|
@taxon_ids = {}
|
17
19
|
@tree = {}
|
18
20
|
@paths = {}
|
19
21
|
@extensions = []
|
22
|
+
@parser = Biodiversity::Parser
|
20
23
|
@re = {
|
21
|
-
page_start: /^\s
|
22
|
-
page_end:
|
24
|
+
page_start: /^\s*<page>\s*$/,
|
25
|
+
page_end: %r{^\s*</page>\s*$},
|
23
26
|
template: /Template:/i,
|
24
|
-
template_link: /\{\{([
|
25
|
-
vernacular_names: /\{\{\s*VN\s*\|([
|
27
|
+
template_link: /\{\{([^}]*)\}\}/,
|
28
|
+
vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
|
26
29
|
}
|
27
30
|
super(opts)
|
28
31
|
end
|
29
32
|
|
33
|
+
def download
|
34
|
+
puts "Downloading from the source"
|
35
|
+
`curl -L #{@url} -o #{@download_path}`
|
36
|
+
end
|
37
|
+
|
30
38
|
def unpack
|
31
39
|
unpack_bz2
|
32
40
|
end
|
33
41
|
|
34
42
|
def make_dwca
|
35
43
|
enrich_data
|
36
|
-
extend_classification
|
37
44
|
generate_dwca
|
38
45
|
end
|
39
46
|
|
40
|
-
|
47
|
+
private
|
41
48
|
|
42
49
|
def enrich_data
|
43
|
-
DwcaHunter
|
44
|
-
|
50
|
+
DwcaHunter.logger_write(object_id,
|
51
|
+
"Extracting data from xml file...")
|
45
52
|
Dir.chdir(@download_dir)
|
46
|
-
f = open(
|
53
|
+
f = open("data.xml", "r:utf-8")
|
47
54
|
page_on = false
|
48
|
-
page =
|
55
|
+
page = ""
|
49
56
|
page_num = 0
|
50
57
|
f.each do |l|
|
51
58
|
if l.match(@re[:page_start])
|
52
|
-
page
|
59
|
+
page += l
|
53
60
|
page_on = true
|
54
61
|
elsif page_on
|
55
|
-
page
|
62
|
+
page += l
|
56
63
|
if l.match(@re[:page_end])
|
57
64
|
page_on = false
|
58
65
|
page_xml = Nokogiri::XML.parse(page)
|
59
|
-
template?(page_xml)
|
60
|
-
process_template(page_xml)
|
66
|
+
if template?(page_xml)
|
67
|
+
process_template(page_xml)
|
68
|
+
else
|
61
69
|
process_species(page_xml)
|
70
|
+
end
|
62
71
|
page_num += 1
|
63
|
-
if page_num % BATCH_SIZE
|
64
|
-
DwcaHunter
|
65
|
-
|
72
|
+
if (page_num % BATCH_SIZE).zero?
|
73
|
+
DwcaHunter.logger_write(object_id,
|
74
|
+
"Traversed #{page_num} pages")
|
66
75
|
end
|
67
|
-
page =
|
76
|
+
page = ""
|
68
77
|
@page_title = nil
|
69
78
|
@page_id = nil
|
70
79
|
end
|
71
80
|
end
|
72
81
|
end
|
73
|
-
DwcaHunter
|
74
|
-
|
82
|
+
DwcaHunter.logger_write(object_id,
|
83
|
+
"Extracted total %s pages" % page_num)
|
75
84
|
f.close
|
76
85
|
end
|
77
86
|
|
78
|
-
def extend_classification
|
79
|
-
DwcaHunter::logger_write(self.object_id, 'Extending classifications')
|
80
|
-
@data.each_with_index do |d, i|
|
81
|
-
unless d[:classificationPath].empty?
|
82
|
-
n = 50
|
83
|
-
while n > 0
|
84
|
-
n -= 1
|
85
|
-
if n == 0
|
86
|
-
d[:classificationPath] = []
|
87
|
-
break
|
88
|
-
end
|
89
|
-
parent = @templates[d[:classificationPath].first]
|
90
|
-
if parent
|
91
|
-
d[:classificationPath].unshift(parent[:parentName])
|
92
|
-
else
|
93
|
-
update_tree(d[:classificationPath])
|
94
|
-
break
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
98
|
-
# d[:classificationPath] = d[:classificationPath].join("|").
|
99
|
-
# gsub("Main Page", "Life")
|
100
|
-
if i % BATCH_SIZE == 0 && i > 0
|
101
|
-
DwcaHunter::logger_write(self.object_id,
|
102
|
-
"Extended %s classifications" % i)
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def update_tree(path)
|
108
|
-
path = path.dup
|
109
|
-
return if @paths.has_key?(path.join('|'))
|
110
|
-
(0...path.size).each do |i|
|
111
|
-
subpath = path[0..i]
|
112
|
-
subpath_string = subpath.join('|')
|
113
|
-
next if @paths.has_key?(subpath_string)
|
114
|
-
name = subpath.pop
|
115
|
-
tree_element = subpath.inject(@tree) { |res, n| res[n] }
|
116
|
-
tree_element[name] = {}
|
117
|
-
@paths[subpath_string] = 1
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
87
|
def process_template(x)
|
122
|
-
name = page_title(x).gsub!(@re[:template],
|
123
|
-
text = x.xpath(
|
88
|
+
name = page_title(x).gsub!(@re[:template], "").strip
|
89
|
+
text = x.xpath("//text").text.strip
|
124
90
|
parent_name = text.match(@re[:template_link])
|
125
91
|
if parent_name
|
126
92
|
return if parent_name[1].match(/\#if/)
|
93
|
+
|
127
94
|
list = parent_name[1].split("|")
|
128
|
-
if list.size == 1
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
95
|
+
parent_name = if list.size == 1
|
96
|
+
list[0]
|
97
|
+
elsif list[0].match(/Taxonav/i)
|
98
|
+
list[1]
|
99
|
+
else
|
100
|
+
list[0]
|
101
|
+
end
|
135
102
|
end
|
136
|
-
name.gsub!(/_/,
|
137
|
-
parent_name
|
103
|
+
name.gsub!(/_/, " ")
|
104
|
+
parent_name&.gsub!(/_/, " ")
|
138
105
|
@templates[name] = { parentName: parent_name, id: page_id(x) }
|
139
106
|
end
|
140
107
|
|
141
108
|
def process_species(x)
|
142
109
|
return if page_title(x).match(/Wikispecies/i)
|
110
|
+
|
143
111
|
items = find_species_components(x)
|
144
112
|
if items
|
145
113
|
@data << {
|
@@ -147,37 +115,44 @@ module DwcaHunter
|
|
147
115
|
canonicalForm: page_title(x),
|
148
116
|
scientificName: page_title(x),
|
149
117
|
classificationPath: [],
|
150
|
-
vernacularNames: []
|
118
|
+
vernacularNames: []
|
119
|
+
}
|
151
120
|
get_full_scientific_name(items)
|
152
121
|
get_vernacular_names(items)
|
153
|
-
init_classification_path(items)
|
154
122
|
end
|
155
123
|
end
|
156
124
|
|
157
125
|
def get_full_scientific_name(items)
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
126
|
+
name_ary = items["{{int:name}}"]
|
127
|
+
|
128
|
+
if name_ary.nil? || name_ary.empty?
|
129
|
+
@problems_file.write("%s\n" % @data[-1][:canonicalForm])
|
130
|
+
return
|
131
|
+
end
|
132
|
+
|
133
|
+
name = name_ary[0]
|
134
|
+
name = parse_name(name, @data[-1])
|
135
|
+
if name != ""
|
136
|
+
@data[-1][:scientificName] = name
|
164
137
|
end
|
165
138
|
end
|
166
139
|
|
167
140
|
def get_vernacular_names(items)
|
168
|
-
|
169
|
-
|
141
|
+
vern = items["{{int:vernacular names}}"]
|
142
|
+
if vern.is_a?(Array) && vern.size.positive?
|
143
|
+
vn_string = vern.join("")
|
170
144
|
vn = vn_string.match(@re[:vernacular_names])
|
171
145
|
if vn
|
172
146
|
vn_list = vn[1].strip.split("|")
|
173
147
|
vnames = []
|
174
148
|
vn_list.each do |item|
|
175
|
-
language, name = item.split("=").map
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
149
|
+
language, name = item.split("=").map(&:strip)
|
150
|
+
next unless language && name && language.size < 4 && name.valid_encoding?
|
151
|
+
|
152
|
+
vnames << {
|
153
|
+
name: name,
|
154
|
+
language: language
|
155
|
+
}
|
181
156
|
end
|
182
157
|
|
183
158
|
@data[-1][:vernacularNames] = vnames
|
@@ -186,26 +161,26 @@ module DwcaHunter
|
|
186
161
|
end
|
187
162
|
|
188
163
|
def init_classification_path(items)
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
end
|
164
|
+
# ignore non-template links
|
165
|
+
items["taxonavigation"]&.each do |line|
|
166
|
+
line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
|
167
|
+
next unless template_link = line.match(@re[:template_link])
|
168
|
+
|
169
|
+
template_link = template_link[1].
|
170
|
+
strip.gsub(/Template:/, "").gsub(/_/, " ")
|
171
|
+
unless template_link.match(/\|/)
|
172
|
+
@data[-1][:classificationPath] << template_link
|
173
|
+
break
|
200
174
|
end
|
201
175
|
end
|
202
176
|
end
|
203
177
|
|
204
178
|
def find_species_components(x)
|
205
|
-
items = get_items(x.xpath(
|
206
|
-
is_taxon_item = items.
|
207
|
-
items.
|
179
|
+
items = get_items(x.xpath("//text").text)
|
180
|
+
is_taxon_item = items.key?("{{int:name}}") &&
|
181
|
+
items.key?("{{int:taxonavigation}}")
|
208
182
|
return nil unless is_taxon_item
|
183
|
+
|
209
184
|
items
|
210
185
|
end
|
211
186
|
|
@@ -214,7 +189,7 @@ module DwcaHunter
|
|
214
189
|
items = {}
|
215
190
|
current_item = nil
|
216
191
|
txt.split("\n").each do |l|
|
217
|
-
item =
|
192
|
+
item = l.match(/=+([^=]+)=+/)
|
218
193
|
if item
|
219
194
|
current_item = item[1].strip.downcase
|
220
195
|
items[current_item] = []
|
@@ -226,11 +201,11 @@ module DwcaHunter
|
|
226
201
|
end
|
227
202
|
|
228
203
|
def page_title(x)
|
229
|
-
@page_title ||= x.xpath(
|
204
|
+
@page_title ||= x.xpath("//title").first.text
|
230
205
|
end
|
231
206
|
|
232
207
|
def page_id(x)
|
233
|
-
@page_id ||= x.xpath(
|
208
|
+
@page_id ||= x.xpath("//id").first.text
|
234
209
|
end
|
235
210
|
|
236
211
|
def template?(page_xml)
|
@@ -238,110 +213,117 @@ module DwcaHunter
|
|
238
213
|
end
|
239
214
|
|
240
215
|
def parse_name(name_string, taxa)
|
241
|
-
name_string.gsub!(
|
216
|
+
name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
|
242
217
|
name_string = name_string.strip
|
243
218
|
old_l = name_string.dup
|
244
|
-
name_string.gsub!
|
219
|
+
name_string.gsub!(/^\*\s*/, "")
|
245
220
|
name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
|
246
|
-
name_string.gsub!(/\{\{([
|
247
|
-
name_string.gsub!(/
|
248
|
-
name_string.gsub!(/
|
249
|
-
name_string.gsub!(
|
250
|
-
name_string.gsub!(/,\s*\[RSD\]/i,
|
251
|
-
name_string.gsub!(/^\s*†\s*/,
|
252
|
-
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/,
|
221
|
+
name_string.gsub!(/\{\{([^}]+\|)?([^}]*)\}\}/, '\2')
|
222
|
+
name_string.gsub!(/'{2,}/, " ")
|
223
|
+
name_string.gsub!(/"{2,}/, " ")
|
224
|
+
name_string.gsub!(/:\s*\d.*$/, "")
|
225
|
+
name_string.gsub!(/,\s*\[RSD\]/i, "")
|
226
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
227
|
+
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
|
253
228
|
# name_string = DwcaHunter::XML.unescape(name_string)
|
254
|
-
name_string.gsub!(
|
255
|
-
name_string.gsub!(
|
256
|
-
name_string.gsub!(/^\s
|
257
|
-
name_string.gsub!(/ /,
|
258
|
-
name_string.gsub!(/\s+/,
|
259
|
-
|
260
|
-
|
261
|
-
|
229
|
+
name_string.gsub!(/<nowiki>.*$/, "")
|
230
|
+
name_string.gsub!(%r{<br\s*/?\s*>}, "")
|
231
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
232
|
+
name_string.gsub!(/ /, " ")
|
233
|
+
name_string.gsub!(/\s+/, " ")
|
234
|
+
res = name_string.strip
|
235
|
+
parsed = @parser.parse(res, simple: true)
|
236
|
+
if !["1","2"].include?(parsed[:quality])
|
237
|
+
return ""
|
238
|
+
end
|
239
|
+
res
|
262
240
|
end
|
263
241
|
|
264
242
|
def generate_dwca
|
265
|
-
DwcaHunter
|
266
|
-
|
243
|
+
DwcaHunter.logger_write(object_id,
|
244
|
+
"Creating DarwinCore Archive file")
|
267
245
|
@core = [
|
268
|
-
[
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
'http://rs.tdwg.org/dwc/terms/higherClassification',
|
273
|
-
'http://purl.org/dc/terms/source']
|
246
|
+
["http://rs.tdwg.org/dwc/terms/taxonID",
|
247
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
248
|
+
"http://globalnames.org/terms/canonicalForm",
|
249
|
+
"http://purl.org/dc/terms/source"]
|
274
250
|
]
|
275
|
-
DwcaHunter
|
251
|
+
DwcaHunter.logger_write(object_id, "Assembling Core Data")
|
276
252
|
count = 0
|
277
253
|
@data.map do |d|
|
278
254
|
count += 1
|
279
|
-
if count % BATCH_SIZE
|
280
|
-
DwcaHunter
|
281
|
-
|
255
|
+
if (count % BATCH_SIZE).zero?
|
256
|
+
DwcaHunter.logger_write(object_id,
|
257
|
+
"Traversing %s core data record" % count)
|
258
|
+
end
|
259
|
+
taxon_id = begin
|
260
|
+
(if d[:classificationPath].empty?
|
261
|
+
d[:taxonId]
|
262
|
+
else
|
263
|
+
@templates[d[:classificationPath].
|
264
|
+
last][:id]
|
265
|
+
end)
|
266
|
+
rescue StandardError
|
267
|
+
d[:taxonId]
|
282
268
|
end
|
283
|
-
taxon_id = (d[:classificationPath].empty? ?
|
284
|
-
d[:taxonId] :
|
285
|
-
@templates[d[:classificationPath].
|
286
|
-
last][:id]) rescue d[:taxonId]
|
287
269
|
@taxon_ids[d[:taxonId]] = taxon_id
|
288
|
-
parentNameUsageId =
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
270
|
+
parentNameUsageId = begin
|
271
|
+
(@templates[d[:classificationPath][-2]][:id] if d[:classificationPath].size > 1)
|
272
|
+
rescue StandardError
|
273
|
+
nil
|
274
|
+
end
|
275
|
+
url = "http://species.wikimedia.org/wiki/#{CGI.escape(d[:canonicalForm].gsub(' ', '_'))}"
|
293
276
|
path = d[:classificationPath]
|
294
277
|
path.pop if path[-1] == d[:canonicalForm]
|
295
|
-
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/,
|
296
|
-
scientific_name =
|
297
|
-
|
298
|
-
|
278
|
+
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
|
279
|
+
scientific_name = if d[:scientificName] == d[:canonicalForm]
|
280
|
+
canonical_form
|
281
|
+
else
|
282
|
+
d[:scientificName]
|
283
|
+
end
|
299
284
|
@core << [taxon_id,
|
300
285
|
scientific_name,
|
301
|
-
parentNameUsageId,
|
302
286
|
canonical_form,
|
303
|
-
path.join('|'),
|
304
287
|
url]
|
305
288
|
end
|
306
289
|
@extensions << { data: [[
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
]], file_name:
|
311
|
-
DwcaHunter
|
312
|
-
|
290
|
+
"http://rs.tdwg.org/dwc/terms/TaxonID",
|
291
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
292
|
+
"http://purl.org/dc/terms/language"
|
293
|
+
]], file_name: "vernacular_names.txt" }
|
294
|
+
DwcaHunter.logger_write(object_id,
|
295
|
+
"Creating verncaular name extension for DarwinCore Archive file")
|
313
296
|
count = 0
|
314
297
|
@data.each do |d|
|
315
298
|
count += 1
|
316
|
-
if count % BATCH_SIZE
|
317
|
-
DwcaHunter
|
318
|
-
|
299
|
+
if (count % BATCH_SIZE).zero?
|
300
|
+
DwcaHunter.logger_write(object_id,
|
301
|
+
"Traversing %s extension data record" % count)
|
319
302
|
end
|
320
303
|
d[:vernacularNames].each do |vn|
|
321
|
-
taxon_id = @taxon_ids[d[:taxonId]]
|
322
|
-
if taxon_id
|
323
|
-
@extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
|
324
|
-
end
|
304
|
+
taxon_id = @taxon_ids[d[:taxonId]] || nil
|
305
|
+
@extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
|
325
306
|
end
|
326
307
|
end
|
327
308
|
@eml = {
|
328
309
|
id: @uuid,
|
329
310
|
title: @title,
|
330
|
-
license:
|
311
|
+
license: "http://creativecommons.org/licenses/by-sa/3.0/",
|
331
312
|
authors: [
|
332
|
-
{ first_name:
|
333
|
-
last_name:
|
334
|
-
email:
|
335
|
-
url:
|
336
|
-
|
313
|
+
{ first_name: "Stephen",
|
314
|
+
last_name: "Thorpe",
|
315
|
+
email: "stephen_thorpe@yahoo.co.nz",
|
316
|
+
url: "http://species.wikimedia.org/wiki/Main_Page" }
|
317
|
+
],
|
318
|
+
abstract: "The free species directory that anyone can edit.",
|
337
319
|
metadata_providers: [
|
338
|
-
{ first_name:
|
339
|
-
last_name:
|
340
|
-
email:
|
341
|
-
|
320
|
+
{ first_name: "Dmitry",
|
321
|
+
last_name: "Mozzherin",
|
322
|
+
email: "dmozzherin@mbl.edu" }
|
323
|
+
],
|
324
|
+
url: "http://species.wikimedia.org/wiki/Main_Page"
|
342
325
|
}
|
343
326
|
super
|
344
327
|
end
|
345
|
-
|
346
328
|
end
|
347
329
|
end
|