dwca_hunter 0.5.3 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +11 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +90 -84
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +13 -12
- data/exe/dwcahunter +1 -5
- data/lib/dwca_hunter.rb +33 -0
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +115 -149
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ion.rb +98 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +186 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +166 -184
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +54 -32
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
# Converts the CSV files found in a download directory (taxa.csv, refs.csv,
# taxa_refs.csv, occurences.csv) into pretty-printed JSON files written to a
# "json" subdirectory of that same directory.
class PaleodbHarvester
  # @param download_dir [String] directory that holds the four input CSV
  #   files. A "json" subdirectory is created inside it for the output.
  # @raise [SystemCallError] if any of the four CSV files cannot be opened.
  def initialize(download_dir)
    @dir = File.join(download_dir, "json")
    FileUtils.mkdir_p(@dir)
    @in_dir = download_dir
    @taxa_csv = CSV.open(File.join(@in_dir, "taxa.csv"), headers: true)
    @refs_csv = CSV.open(File.join(@in_dir, "refs.csv"), headers: true)
    @taxa_refs_csv = CSV.open(File.join(@in_dir, "taxa_refs.csv"), headers: true)
    @occurences_csv = CSV.open(File.join(@in_dir, "occurences.csv"), headers: true)
  end

  # Reads taxa.csv and writes two files:
  #   taxa.json    - taxon_no => hash of taxon attributes
  #   name_id.json - taxon_name => { id:, acc_id: }
  #
  # Columns listed by the original author:
  # "orig_no","taxon_no","record_type","flags","taxon_rank",
  # "taxon_name","difference","accepted_no","accepted_rank",
  # "accepted_name","parent_no","reference_no","is_extant","n_occs"
  # The code additionally reads "taxon_attr", "common_name", "life_habit",
  # "diet", "enterer" and "modifier"; columns that are absent yield nil.
  #
  # @return [nil]
  def taxa
    taxa = {}
    name2id = {}
    @taxa_csv.each do |raw|
      r = strip(raw)
      taxa[r["taxon_no"]] = {
        t_id: r["orig_no"], id: r["taxon_no"],
        rank: r["taxon_rank"], name: r["taxon_name"],
        auth: r["taxon_attr"],
        extinct: extinct(r["is_extant"]),
        vernacular: r["common_name"],
        annot: r["difference"], acc_id: r["accepted_no"],
        acc_rank: r["accepted_rank"],
        acc_name: r["accepted_name"], ecol: ecol(r),
        parent_id: r["parent_no"], ref: r["reference_no"],
        occs_num: r["n_occs"], enterer: enterer(r)
      }
      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
    end
    write_json("taxa.json", taxa)
    write_json("name_id.json", name2id)
  end

  # Joins the distinct, non-blank "enterer" and "modifier" values of a row.
  #
  # @param r [#[]] row-like object indexed by column name
  # @return [String] comma-separated names, or "" when both are blank
  def enterer(r)
    [r["enterer"], r["modifier"]]
      .map { |e| e.to_s.strip }
      .uniq
      .reject(&:empty?)
      .join(", ")
  end

  # @param val [String, nil] value of the "is_extant" column
  # @return [Integer] 1 when val is exactly "extinct", otherwise 0
  def extinct(val)
    val == "extinct" ? 1 : 0
  end

  # @param row [#each] row of column/value pairs
  # @return [String] "life_habit" and "diet" separated by a single space
  #   (a lone " " when both are missing)
  def ecol(row)
    row = strip(row)
    "#{row['life_habit']} #{row['diet']}"
  end

  # Reads refs.csv and writes refs.json, keyed by reference_no. Each entry has
  # the keys id, author, authorship, year, title and details.
  #
  # Columns listed by the original author:
  # "reference_no","record_type","ref_type","author1init","author1last",
  # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
  # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
  # "language","doi"
  #
  # @return [nil]
  def refs
    refs = {}
    @refs_csv.each do |raw|
      r = strip(raw)
      authorship, author = authors(r)
      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
                                  authorship: authorship,
                                  year: r["pubyr"], title: r["reftitle"],
                                  details: details(r) }
    end
    write_json("refs.json", refs)
  end

  # Builds author strings from the author1*/author2*/otherauthors columns.
  #
  # @param row [#each] row of column/value pairs
  # @return [Array(String, String)] first element joins at most the first two
  #   non-empty author entries, second element joins all of them
  def authors(row)
    row = strip(row)
    au = ["#{row['author1init']} #{row['author1last']}",
          "#{row['author2init']} #{row['author2last']}",
          row["otherauthors"].to_s]
    au = au.map(&:strip).reject(&:empty?).map { |a| a.gsub(/\s{2,}/, " ").strip }
    [au[0..1].join(", "), au.join(", ")]
  end

  # Formats the publication part of a reference as
  # "pubtitle pubno: firstpage--lastpage (doi)", leaving out missing parts.
  # The "pubvol" column is not used.
  #
  # @param row [#each] row of column/value pairs
  # @return [String]
  def details(row)
    row = strip(row)
    # strip keeps nil for empty CSV cells, so normalise with to_s before
    # testing for emptiness instead of calling empty? on nil.
    field = ->(key) { row[key].to_s }
    ref = field["pubtitle"].dup
    ref << " #{field['pubno']}" unless field["pubno"].empty?
    ref << ": #{field['firstpage']}" unless field["firstpage"].empty?
    ref << "--#{field['lastpage']}" unless field["lastpage"].empty?
    ref << " (#{field['doi']})" unless field["doi"].empty?
    ref.gsub(/\s{2,}/, " ").strip
  end

  # Reads taxa_refs.csv and writes taxa_refs.json: accepted_no => list of
  # { acc_id:, name:, ref_id: } entries.
  #
  # @return [nil]
  def taxa_refs
    tr = {}
    @taxa_refs_csv.each do |raw|
      r = strip(raw)
      entry = { acc_id: r["accepted_no"], name: r["accepted_name"],
                ref_id: r["reference_no"] }
      (tr[r["accepted_no"]] ||= []) << entry
    end
    write_json("taxa_refs.json", tr)
  end

  # Reads occurences.csv and writes occurences.json: accepted_no => list of
  # { id:, name:, country:, state:, age_min:, age_max: } entries.
  #
  # @return [nil]
  def occurences
    occ = {}
    @occurences_csv.each do |raw|
      r = strip(raw)
      entry = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
                state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
      (occ[r["accepted_no"]] ||= []) << entry
    end
    write_json("occurences.json", occ)
  end

  # Copies a row into a plain Hash with surrounding whitespace removed from
  # every value; nil values stay nil.
  #
  # @param row [#each] anything yielding [key, value] pairs (CSV::Row, Hash)
  # @return [Hash]
  def strip(row)
    row.each_with_object({}) do |(k, v), h|
      h[k] = v&.strip
    end
  end

  private

  # Writes data as pretty-printed UTF-8 JSON into the output directory. The
  # block form closes the file even when generation or writing raises.
  #
  # @return [nil]
  def write_json(file_name, data)
    File.open(File.join(@dir, file_name), "w:utf-8") do |f|
      f.write(JSON.pretty_generate(data))
    end
    nil
  end
end
|
140
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Resource for Sherborn's "Index Animalium": downloads a tab-separated name
  # list and turns it into a DarwinCore Archive whose core holds taxon id,
  # scientific name and the nomenclatural code "ICZN".
  class ResourceSherborn < DwcaHunter::Resource
    # @param opts [Hash] options forwarded unchanged to DwcaHunter::Resource
    def initialize(opts = {})
      @command = "sherborn"
      @title = "Index Animalium"
      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
      # Must be spelled @uuid: generate_dwca reads @uuid for the EML id, and a
      # differently-cased variable name would leave that id nil.
      @uuid = "05ad6ca2-fc37-47f4-983a-72e535420e28"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "sherborn",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches @url into @download_path with curl (silent, following redirects).
    def download
      puts "Downloading."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain text file.
    def unpack; end

    # Reads the downloaded names and builds the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir, then collects names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Reads "data.csv" from @download_dir as header-less, tab-separated rows
    # (column 0: taxon id, column 1: name string) and appends one entry to
    # @names per distinct name string; later duplicates are skipped.
    def collect_names
      seen = {}
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      # Block form so the file handle is closed once reading finishes or fails.
      CSV.open(path, headers: false, col_sep: "\t") do |file|
        file.each_with_index do |row, i|
          next if seen.key?(row[1])

          seen[row[1]] = true
          @names << { taxon_id: row[0], name_string: row[1] }
          puts format("Processed %s names", i) if (i % 10_000).zero?
        end
      end
    end

    # Fills @core (header row plus one row per collected name) and @eml, then
    # hands over to the parent class implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], "ICZN"]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Charles Davies",
            last_name: "Sherborn" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Index Animalium is a monumental work that covers " \
                  "400 000 zoological names registered by science " \
                  "between 1758 and 1850",
        url: @url
      }
      super
    end
  end
end
|
@@ -1,145 +1,113 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
4
|
+
# Wikispecies source
|
3
5
|
class ResourceWikispecies < DwcaHunter::Resource
|
4
|
-
def initialize(opts = {})
|
5
|
-
@wikisp_path = File.join(Dir.tmpdir,
|
6
|
-
@problems_file = open(File.join(Dir.tmpdir,
|
6
|
+
def initialize(opts = { download: true, unpack: true })
|
7
|
+
@wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
|
8
|
+
@problems_file = File.open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
7
9
|
@command = "wikispecies"
|
8
|
-
@title =
|
9
|
-
@url =
|
10
|
-
|
10
|
+
@title = "Wikispecies"
|
11
|
+
@url = "http://dumps.wikimedia.org/specieswiki/latest/" \
|
12
|
+
"specieswiki-latest-pages-articles.xml.bz2"
|
11
13
|
@url = opts[:url] if opts[:url]
|
12
|
-
@uuid =
|
13
|
-
@download_path = File.join(@wikisp_path,
|
14
|
+
@uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
|
15
|
+
@download_path = File.join(@wikisp_path, "data.xml.bz2")
|
14
16
|
@data = []
|
15
17
|
@templates = {}
|
16
18
|
@taxon_ids = {}
|
17
19
|
@tree = {}
|
18
20
|
@paths = {}
|
19
21
|
@extensions = []
|
22
|
+
@parser = Biodiversity::Parser
|
20
23
|
@re = {
|
21
|
-
page_start: /^\s
|
22
|
-
page_end:
|
24
|
+
page_start: /^\s*<page>\s*$/,
|
25
|
+
page_end: %r{^\s*</page>\s*$},
|
23
26
|
template: /Template:/i,
|
24
|
-
template_link: /\{\{([
|
25
|
-
vernacular_names: /\{\{\s*VN\s*\|([
|
27
|
+
template_link: /\{\{([^}]*)\}\}/,
|
28
|
+
vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
|
26
29
|
}
|
27
30
|
super(opts)
|
28
31
|
end
|
29
32
|
|
33
|
+
def download
|
34
|
+
puts "Downloading from the source"
|
35
|
+
`curl -L #{@url} -o #{@download_path}`
|
36
|
+
end
|
37
|
+
|
30
38
|
def unpack
|
31
39
|
unpack_bz2
|
32
40
|
end
|
33
41
|
|
34
42
|
def make_dwca
|
35
43
|
enrich_data
|
36
|
-
extend_classification
|
37
44
|
generate_dwca
|
38
45
|
end
|
39
46
|
|
40
|
-
|
47
|
+
private
|
41
48
|
|
42
49
|
def enrich_data
|
43
|
-
DwcaHunter
|
44
|
-
|
50
|
+
DwcaHunter.logger_write(object_id,
|
51
|
+
"Extracting data from xml file...")
|
45
52
|
Dir.chdir(@download_dir)
|
46
|
-
f = open(
|
53
|
+
f = open("data.xml", "r:utf-8")
|
47
54
|
page_on = false
|
48
|
-
page =
|
55
|
+
page = ""
|
49
56
|
page_num = 0
|
50
57
|
f.each do |l|
|
51
58
|
if l.match(@re[:page_start])
|
52
|
-
page
|
59
|
+
page += l
|
53
60
|
page_on = true
|
54
61
|
elsif page_on
|
55
|
-
page
|
62
|
+
page += l
|
56
63
|
if l.match(@re[:page_end])
|
57
64
|
page_on = false
|
58
65
|
page_xml = Nokogiri::XML.parse(page)
|
59
|
-
template?(page_xml)
|
60
|
-
process_template(page_xml)
|
66
|
+
if template?(page_xml)
|
67
|
+
process_template(page_xml)
|
68
|
+
else
|
61
69
|
process_species(page_xml)
|
70
|
+
end
|
62
71
|
page_num += 1
|
63
|
-
if page_num % BATCH_SIZE
|
64
|
-
DwcaHunter
|
65
|
-
|
72
|
+
if (page_num % BATCH_SIZE).zero?
|
73
|
+
DwcaHunter.logger_write(object_id,
|
74
|
+
"Traversed #{page_num} pages")
|
66
75
|
end
|
67
|
-
page =
|
76
|
+
page = ""
|
68
77
|
@page_title = nil
|
69
78
|
@page_id = nil
|
70
79
|
end
|
71
80
|
end
|
72
81
|
end
|
73
|
-
DwcaHunter
|
74
|
-
|
82
|
+
DwcaHunter.logger_write(object_id,
|
83
|
+
"Extracted total %s pages" % page_num)
|
75
84
|
f.close
|
76
85
|
end
|
77
86
|
|
78
|
-
def extend_classification
|
79
|
-
DwcaHunter::logger_write(self.object_id, 'Extending classifications')
|
80
|
-
@data.each_with_index do |d, i|
|
81
|
-
unless d[:classificationPath].empty?
|
82
|
-
n = 50
|
83
|
-
while n > 0
|
84
|
-
n -= 1
|
85
|
-
if n == 0
|
86
|
-
d[:classificationPath] = []
|
87
|
-
break
|
88
|
-
end
|
89
|
-
parent = @templates[d[:classificationPath].first]
|
90
|
-
if parent
|
91
|
-
d[:classificationPath].unshift(parent[:parentName])
|
92
|
-
else
|
93
|
-
update_tree(d[:classificationPath])
|
94
|
-
break
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
98
|
-
# d[:classificationPath] = d[:classificationPath].join("|").
|
99
|
-
# gsub("Main Page", "Life")
|
100
|
-
if i % BATCH_SIZE == 0 && i > 0
|
101
|
-
DwcaHunter::logger_write(self.object_id,
|
102
|
-
"Extended %s classifications" % i)
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def update_tree(path)
|
108
|
-
path = path.dup
|
109
|
-
return if @paths.has_key?(path.join('|'))
|
110
|
-
(0...path.size).each do |i|
|
111
|
-
subpath = path[0..i]
|
112
|
-
subpath_string = subpath.join('|')
|
113
|
-
next if @paths.has_key?(subpath_string)
|
114
|
-
name = subpath.pop
|
115
|
-
tree_element = subpath.inject(@tree) { |res, n| res[n] }
|
116
|
-
tree_element[name] = {}
|
117
|
-
@paths[subpath_string] = 1
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
87
|
def process_template(x)
|
122
|
-
name = page_title(x).gsub!(@re[:template],
|
123
|
-
text = x.xpath(
|
88
|
+
name = page_title(x).gsub!(@re[:template], "").strip
|
89
|
+
text = x.xpath("//text").text.strip
|
124
90
|
parent_name = text.match(@re[:template_link])
|
125
91
|
if parent_name
|
126
92
|
return if parent_name[1].match(/\#if/)
|
93
|
+
|
127
94
|
list = parent_name[1].split("|")
|
128
|
-
if list.size == 1
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
95
|
+
parent_name = if list.size == 1
|
96
|
+
list[0]
|
97
|
+
elsif list[0].match(/Taxonav/i)
|
98
|
+
list[1]
|
99
|
+
else
|
100
|
+
list[0]
|
101
|
+
end
|
135
102
|
end
|
136
|
-
name.gsub!(/_/,
|
137
|
-
parent_name
|
103
|
+
name.gsub!(/_/, " ")
|
104
|
+
parent_name&.gsub!(/_/, " ")
|
138
105
|
@templates[name] = { parentName: parent_name, id: page_id(x) }
|
139
106
|
end
|
140
107
|
|
141
108
|
def process_species(x)
|
142
109
|
return if page_title(x).match(/Wikispecies/i)
|
110
|
+
|
143
111
|
items = find_species_components(x)
|
144
112
|
if items
|
145
113
|
@data << {
|
@@ -147,37 +115,44 @@ module DwcaHunter
|
|
147
115
|
canonicalForm: page_title(x),
|
148
116
|
scientificName: page_title(x),
|
149
117
|
classificationPath: [],
|
150
|
-
vernacularNames: []
|
118
|
+
vernacularNames: []
|
119
|
+
}
|
151
120
|
get_full_scientific_name(items)
|
152
121
|
get_vernacular_names(items)
|
153
|
-
init_classification_path(items)
|
154
122
|
end
|
155
123
|
end
|
156
124
|
|
157
125
|
def get_full_scientific_name(items)
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
126
|
+
name_ary = items["{{int:name}}"]
|
127
|
+
|
128
|
+
if name_ary.nil? || name_ary.empty?
|
129
|
+
@problems_file.write("%s\n" % @data[-1][:canonicalForm])
|
130
|
+
return
|
131
|
+
end
|
132
|
+
|
133
|
+
name = name_ary[0]
|
134
|
+
name = parse_name(name, @data[-1])
|
135
|
+
if name != ""
|
136
|
+
@data[-1][:scientificName] = name
|
164
137
|
end
|
165
138
|
end
|
166
139
|
|
167
140
|
def get_vernacular_names(items)
|
168
|
-
|
169
|
-
|
141
|
+
vern = items["{{int:vernacular names}}"]
|
142
|
+
if vern.is_a?(Array) && vern.size.positive?
|
143
|
+
vn_string = vern.join("")
|
170
144
|
vn = vn_string.match(@re[:vernacular_names])
|
171
145
|
if vn
|
172
146
|
vn_list = vn[1].strip.split("|")
|
173
147
|
vnames = []
|
174
148
|
vn_list.each do |item|
|
175
|
-
language, name = item.split("=").map
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
149
|
+
language, name = item.split("=").map(&:strip)
|
150
|
+
next unless language && name && language.size < 4 && name.valid_encoding?
|
151
|
+
|
152
|
+
vnames << {
|
153
|
+
name: name,
|
154
|
+
language: language
|
155
|
+
}
|
181
156
|
end
|
182
157
|
|
183
158
|
@data[-1][:vernacularNames] = vnames
|
@@ -186,26 +161,26 @@ module DwcaHunter
|
|
186
161
|
end
|
187
162
|
|
188
163
|
def init_classification_path(items)
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
end
|
164
|
+
# ignore non-template links
|
165
|
+
items["taxonavigation"]&.each do |line|
|
166
|
+
line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
|
167
|
+
next unless template_link = line.match(@re[:template_link])
|
168
|
+
|
169
|
+
template_link = template_link[1].
|
170
|
+
strip.gsub(/Template:/, "").gsub(/_/, " ")
|
171
|
+
unless template_link.match(/\|/)
|
172
|
+
@data[-1][:classificationPath] << template_link
|
173
|
+
break
|
200
174
|
end
|
201
175
|
end
|
202
176
|
end
|
203
177
|
|
204
178
|
def find_species_components(x)
|
205
|
-
items = get_items(x.xpath(
|
206
|
-
is_taxon_item = items.
|
207
|
-
items.
|
179
|
+
items = get_items(x.xpath("//text").text)
|
180
|
+
is_taxon_item = items.key?("{{int:name}}") &&
|
181
|
+
items.key?("{{int:taxonavigation}}")
|
208
182
|
return nil unless is_taxon_item
|
183
|
+
|
209
184
|
items
|
210
185
|
end
|
211
186
|
|
@@ -214,7 +189,7 @@ module DwcaHunter
|
|
214
189
|
items = {}
|
215
190
|
current_item = nil
|
216
191
|
txt.split("\n").each do |l|
|
217
|
-
item =
|
192
|
+
item = l.match(/=+([^=]+)=+/)
|
218
193
|
if item
|
219
194
|
current_item = item[1].strip.downcase
|
220
195
|
items[current_item] = []
|
@@ -226,11 +201,11 @@ module DwcaHunter
|
|
226
201
|
end
|
227
202
|
|
228
203
|
def page_title(x)
|
229
|
-
@page_title ||= x.xpath(
|
204
|
+
@page_title ||= x.xpath("//title").first.text
|
230
205
|
end
|
231
206
|
|
232
207
|
def page_id(x)
|
233
|
-
@page_id ||= x.xpath(
|
208
|
+
@page_id ||= x.xpath("//id").first.text
|
234
209
|
end
|
235
210
|
|
236
211
|
def template?(page_xml)
|
@@ -238,110 +213,117 @@ module DwcaHunter
|
|
238
213
|
end
|
239
214
|
|
240
215
|
def parse_name(name_string, taxa)
|
241
|
-
name_string.gsub!(
|
216
|
+
name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
|
242
217
|
name_string = name_string.strip
|
243
218
|
old_l = name_string.dup
|
244
|
-
name_string.gsub!
|
219
|
+
name_string.gsub!(/^\*\s*/, "")
|
245
220
|
name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
|
246
|
-
name_string.gsub!(/\{\{([
|
247
|
-
name_string.gsub!(/
|
248
|
-
name_string.gsub!(/
|
249
|
-
name_string.gsub!(
|
250
|
-
name_string.gsub!(/,\s*\[RSD\]/i,
|
251
|
-
name_string.gsub!(/^\s*†\s*/,
|
252
|
-
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/,
|
221
|
+
name_string.gsub!(/\{\{([^}]+\|)?([^}]*)\}\}/, '\2')
|
222
|
+
name_string.gsub!(/'{2,}/, " ")
|
223
|
+
name_string.gsub!(/"{2,}/, " ")
|
224
|
+
name_string.gsub!(/:\s*\d.*$/, "")
|
225
|
+
name_string.gsub!(/,\s*\[RSD\]/i, "")
|
226
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
227
|
+
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
|
253
228
|
# name_string = DwcaHunter::XML.unescape(name_string)
|
254
|
-
name_string.gsub!(
|
255
|
-
name_string.gsub!(
|
256
|
-
name_string.gsub!(/^\s
|
257
|
-
name_string.gsub!(/ /,
|
258
|
-
name_string.gsub!(/\s+/,
|
259
|
-
|
260
|
-
|
261
|
-
|
229
|
+
name_string.gsub!(/<nowiki>.*$/, "")
|
230
|
+
name_string.gsub!(%r{<br\s*/?\s*>}, "")
|
231
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
232
|
+
name_string.gsub!(/ /, " ")
|
233
|
+
name_string.gsub!(/\s+/, " ")
|
234
|
+
res = name_string.strip
|
235
|
+
parsed = @parser.parse(res, simple: true)
|
236
|
+
if !["1","2"].include?(parsed[:quality])
|
237
|
+
return ""
|
238
|
+
end
|
239
|
+
res
|
262
240
|
end
|
263
241
|
|
264
242
|
def generate_dwca
|
265
|
-
DwcaHunter
|
266
|
-
|
243
|
+
DwcaHunter.logger_write(object_id,
|
244
|
+
"Creating DarwinCore Archive file")
|
267
245
|
@core = [
|
268
|
-
[
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
'http://rs.tdwg.org/dwc/terms/higherClassification',
|
273
|
-
'http://purl.org/dc/terms/source']
|
246
|
+
["http://rs.tdwg.org/dwc/terms/taxonID",
|
247
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
248
|
+
"http://globalnames.org/terms/canonicalForm",
|
249
|
+
"http://purl.org/dc/terms/source"]
|
274
250
|
]
|
275
|
-
DwcaHunter
|
251
|
+
DwcaHunter.logger_write(object_id, "Assembling Core Data")
|
276
252
|
count = 0
|
277
253
|
@data.map do |d|
|
278
254
|
count += 1
|
279
|
-
if count % BATCH_SIZE
|
280
|
-
DwcaHunter
|
281
|
-
|
255
|
+
if (count % BATCH_SIZE).zero?
|
256
|
+
DwcaHunter.logger_write(object_id,
|
257
|
+
"Traversing %s core data record" % count)
|
258
|
+
end
|
259
|
+
taxon_id = begin
|
260
|
+
(if d[:classificationPath].empty?
|
261
|
+
d[:taxonId]
|
262
|
+
else
|
263
|
+
@templates[d[:classificationPath].
|
264
|
+
last][:id]
|
265
|
+
end)
|
266
|
+
rescue StandardError
|
267
|
+
d[:taxonId]
|
282
268
|
end
|
283
|
-
taxon_id = (d[:classificationPath].empty? ?
|
284
|
-
d[:taxonId] :
|
285
|
-
@templates[d[:classificationPath].
|
286
|
-
last][:id]) rescue d[:taxonId]
|
287
269
|
@taxon_ids[d[:taxonId]] = taxon_id
|
288
|
-
parentNameUsageId =
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
270
|
+
parentNameUsageId = begin
|
271
|
+
(@templates[d[:classificationPath][-2]][:id] if d[:classificationPath].size > 1)
|
272
|
+
rescue StandardError
|
273
|
+
nil
|
274
|
+
end
|
275
|
+
url = "http://species.wikimedia.org/wiki/#{CGI.escape(d[:canonicalForm].gsub(' ', '_'))}"
|
293
276
|
path = d[:classificationPath]
|
294
277
|
path.pop if path[-1] == d[:canonicalForm]
|
295
|
-
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/,
|
296
|
-
scientific_name =
|
297
|
-
|
298
|
-
|
278
|
+
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
|
279
|
+
scientific_name = if d[:scientificName] == d[:canonicalForm]
|
280
|
+
canonical_form
|
281
|
+
else
|
282
|
+
d[:scientificName]
|
283
|
+
end
|
299
284
|
@core << [taxon_id,
|
300
285
|
scientific_name,
|
301
|
-
parentNameUsageId,
|
302
286
|
canonical_form,
|
303
|
-
path.join('|'),
|
304
287
|
url]
|
305
288
|
end
|
306
289
|
@extensions << { data: [[
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
]], file_name:
|
311
|
-
DwcaHunter
|
312
|
-
|
290
|
+
"http://rs.tdwg.org/dwc/terms/TaxonID",
|
291
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
292
|
+
"http://purl.org/dc/terms/language"
|
293
|
+
]], file_name: "vernacular_names.txt" }
|
294
|
+
DwcaHunter.logger_write(object_id,
|
295
|
+
"Creating verncaular name extension for DarwinCore Archive file")
|
313
296
|
count = 0
|
314
297
|
@data.each do |d|
|
315
298
|
count += 1
|
316
|
-
if count % BATCH_SIZE
|
317
|
-
DwcaHunter
|
318
|
-
|
299
|
+
if (count % BATCH_SIZE).zero?
|
300
|
+
DwcaHunter.logger_write(object_id,
|
301
|
+
"Traversing %s extension data record" % count)
|
319
302
|
end
|
320
303
|
d[:vernacularNames].each do |vn|
|
321
|
-
taxon_id = @taxon_ids[d[:taxonId]]
|
322
|
-
if taxon_id
|
323
|
-
@extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
|
324
|
-
end
|
304
|
+
taxon_id = @taxon_ids[d[:taxonId]] || nil
|
305
|
+
@extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
|
325
306
|
end
|
326
307
|
end
|
327
308
|
@eml = {
|
328
309
|
id: @uuid,
|
329
310
|
title: @title,
|
330
|
-
license:
|
311
|
+
license: "http://creativecommons.org/licenses/by-sa/3.0/",
|
331
312
|
authors: [
|
332
|
-
{ first_name:
|
333
|
-
last_name:
|
334
|
-
email:
|
335
|
-
url:
|
336
|
-
|
313
|
+
{ first_name: "Stephen",
|
314
|
+
last_name: "Thorpe",
|
315
|
+
email: "stephen_thorpe@yahoo.co.nz",
|
316
|
+
url: "http://species.wikimedia.org/wiki/Main_Page" }
|
317
|
+
],
|
318
|
+
abstract: "The free species directory that anyone can edit.",
|
337
319
|
metadata_providers: [
|
338
|
-
{ first_name:
|
339
|
-
last_name:
|
340
|
-
email:
|
341
|
-
|
320
|
+
{ first_name: "Dmitry",
|
321
|
+
last_name: "Mozzherin",
|
322
|
+
email: "dmozzherin@mbl.edu" }
|
323
|
+
],
|
324
|
+
url: "http://species.wikimedia.org/wiki/Main_Page"
|
342
325
|
}
|
343
326
|
super
|
344
327
|
end
|
345
|
-
|
346
328
|
end
|
347
329
|
end
|