dwca_hunter 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +45 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +61 -83
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +9 -9
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +39 -8
- data/lib/dwca_hunter/resource.rb +5 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +121 -145
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/eol.rb +85 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ipni.rb +111 -0
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +46 -40
- data/files/birdlife_7.csv +0 -11862
- data/files/fishbase_taxon_cache.tsv +0 -81000
- data/files/reptile_checklist_2014_12.csv +0 -15158
- data/files/species-black.txt +0 -251
@@ -0,0 +1,140 @@
|
|
1
|
+
# Converts the CSV files of a Paleobiology Database download (taxa.csv,
# refs.csv, taxa_refs.csv and occurences.csv inside +download_dir+) into
# pretty-printed JSON files stored in the "json" subdirectory.
#
# Every public conversion method re-reads its CSV file from disk, so the
# methods can be called repeatedly and no file handle stays open between calls.
class PaleodbHarvester
  # @param download_dir [String] directory holding the source CSV files; a
  #   "json" subdirectory is created in it for the generated output.
  def initialize(download_dir)
    @in_dir = download_dir
    @dir = File.join(download_dir, "json")
    FileUtils.mkdir_p(@dir)
  end

  # Reads taxa.csv and writes two files:
  # * taxa.json    -- taxon records keyed by "taxon_no"
  # * name_id.json -- "taxon_name" => { id:, acc_id: } lookup
  #
  # Columns read: "orig_no", "taxon_no", "taxon_rank", "taxon_name",
  # "taxon_attr", "is_extant", "common_name", "difference", "accepted_no",
  # "accepted_rank", "accepted_name", "parent_no", "reference_no", "n_occs",
  # "life_habit", "diet", "enterer", "modifier".
  #
  # @return [nil]
  def taxa
    taxa = {}
    name2id = {}
    each_row("taxa.csv") do |r|
      taxa[r["taxon_no"]] = {
        t_id: r["orig_no"], id: r["taxon_no"],
        rank: r["taxon_rank"], name: r["taxon_name"],
        auth: r["taxon_attr"],
        extinct: extinct(r["is_extant"]),
        vernacular: r["common_name"],
        annot: r["difference"], acc_id: r["accepted_no"],
        acc_rank: r["accepted_rank"],
        acc_name: r["accepted_name"], ecol: ecol(r),
        parent_id: r["parent_no"], ref: r["reference_no"],
        occs_num: r["n_occs"], enterer: enterer(r)
      }
      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
    end
    write_json("taxa.json", taxa)
    write_json("name_id.json", name2id)
  end

  # Joins the distinct, non-blank "enterer" and "modifier" values of a row.
  #
  # @param r [#[]] row (CSV::Row or Hash) with "enterer"/"modifier" fields
  # @return [String] names separated by ", ", or "" when both are blank
  def enterer(r)
    [r["enterer"], r["modifier"]]
      .map { |e| e.to_s.strip }
      .uniq
      .reject(&:empty?)
      .join(", ")
  end

  # Maps the value of the "is_extant" column to a numeric flag.
  #
  # @param val [String, nil]
  # @return [Integer] 1 when +val+ is exactly "extinct", 0 otherwise
  def extinct(val)
    val == "extinct" ? 1 : 0
  end

  # Builds the ecology string from "life_habit" and "diet".
  #
  # @param row [Enumerable] row yielding [key, value] pairs
  # @return [String] both values separated by a single space (missing values
  #   are rendered as empty strings, so the result may be just " ")
  def ecol(row)
    row = strip(row)
    "#{row['life_habit']} #{row['diet']}"
  end

  # Reads refs.csv and writes refs.json, keyed by "reference_no".
  #
  # Columns read: "reference_no", "author1init", "author1last",
  # "author2init", "author2last", "otherauthors", "pubyr", "reftitle",
  # "pubtitle", "pubno", "firstpage", "lastpage", "doi".
  #
  # @return [nil]
  def refs
    refs = {}
    each_row("refs.csv") do |r|
      authorship, author = authors(r)
      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
                                  authorship: authorship,
                                  year: r["pubyr"], title: r["reftitle"],
                                  details: details(r) }
    end
    write_json("refs.json", refs)
  end

  # Assembles author strings of a reference row.
  #
  # @param row [Enumerable] reference row
  # @return [Array(String, String)] first element lists at most the first two
  #   non-empty author entries, second element lists all of them; entries
  #   are joined with ", "
  def authors(row)
    row = strip(row)
    au = ["#{row['author1init']} #{row['author1last']}",
          "#{row['author2init']} #{row['author2last']}",
          row["otherauthors"].to_s]
    au = au.map(&:strip).reject(&:empty?)
           .map { |a| a.gsub(/\s{2,}/, " ").strip }
    [au[0..1].join(", "), au.join(", ")]
  end

  # Builds the publication details string of a reference row in the form
  # "pubtitle pubno: firstpage--lastpage (doi)", skipping missing parts.
  #
  # Missing values may arrive as nil (unquoted empty CSV fields) as well as
  # "", so every field is normalised with +to_s+ before the emptiness check.
  #
  # @param row [Enumerable] reference row
  # @return [String]
  def details(row)
    row = strip(row)
    ref = row["pubtitle"].to_s.dup
    ref << " #{row['pubno']}" unless row["pubno"].to_s.empty?
    ref << ": #{row['firstpage']}" unless row["firstpage"].to_s.empty?
    ref << "--#{row['lastpage']}" unless row["lastpage"].to_s.empty?
    ref << " (#{row['doi']})" unless row["doi"].to_s.empty?
    ref.gsub(/\s{2,}/, " ").strip
  end

  # Reads taxa_refs.csv and writes taxa_refs.json: lists of
  # { acc_id:, name:, ref_id: } grouped by "accepted_no".
  #
  # @return [nil]
  def taxa_refs
    tr = {}
    each_row("taxa_refs.csv") do |r|
      row = { acc_id: r["accepted_no"], name: r["accepted_name"],
              ref_id: r["reference_no"] }
      (tr[r["accepted_no"]] ||= []) << row
    end
    write_json("taxa_refs.json", tr)
  end

  # Reads occurences.csv and writes occurences.json: lists of occurrence
  # records grouped by "accepted_no". (The "occurences" spelling is part of
  # the file names and the public method name, so it is kept as is.)
  #
  # @return [nil]
  def occurences
    occ = {}
    each_row("occurences.csv") do |r|
      row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
              state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
      (occ[r["accepted_no"]] ||= []) << row
    end
    write_json("occurences.json", occ)
  end

  # Copies a row into a plain Hash with surrounding whitespace removed from
  # every value; nil values stay nil.
  #
  # @param row [Enumerable] object yielding [key, value] pairs
  # @return [Hash]
  def strip(row)
    row.each_with_object({}) do |(k, v), h|
      h[k] = v&.strip
    end
  end

  private

  # Yields every data row of +file_name+ (located in the input directory) as
  # a whitespace-stripped Hash. The file is opened and closed on each call.
  def each_row(file_name)
    CSV.foreach(File.join(@in_dir, file_name), headers: true) do |row|
      yield strip(row)
    end
  end

  # Writes +data+ as pretty-printed UTF-8 JSON into the output directory.
  # The block form guarantees the handle is closed even if writing fails.
  def write_json(file_name, data)
    File.open(File.join(@dir, file_name), "w:utf-8") do |f|
      f.write(JSON.pretty_generate(data))
    end
    nil
  end
end
|
140
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Resource that turns Sherborn's "Index Animalium" name list into a
  # DarwinCore Archive. The source is downloaded as a single delimited file
  # whose first column is used as the taxon id and whose second column is
  # used as the name string.
  class ResourceSherborn < DwcaHunter::Resource
    # @param opts [Hash] options forwarded unchanged to DwcaHunter::Resource
    def initialize(opts = {})
      @command = "sherborn"
      @title = "Index Animalium"
      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
      # Stored as @uuid because generate_dwca reads @uuid for the EML id.
      @uuid = "05ad6ca2-fc37-47f4-983a-72e535420e28"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "sherborn",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Downloads the source file to @download_path with curl. Arguments are
    # passed as a list, so the URL and path are never interpreted by a shell.
    def download
      puts "Downloading."
      system("curl", "-s", "-L", @url, "-o", @download_path)
    end

    # Nothing to unpack: the download is a plain text file.
    def unpack; end

    # Extracts the names from the downloaded file and builds the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches into the download directory and collects the names.
    # NOTE(review): @download_dir is not assigned in this class; presumably
    # it is set by DwcaHunter::Resource -- confirm.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Reads data.csv (tab-separated, no header row) and appends one entry
    # to @names per distinct name string; later rows repeating an already
    # seen name string are skipped.
    def collect_names
      seen = {}
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      # Block form closes the file handle once all rows are consumed.
      CSV.open(path, headers: false, col_sep: "\t") do |csv|
        csv.each_with_index do |row, i|
          taxon_id, name_string = row[0], row[1]
          next if seen.key?(name_string)

          seen[name_string] = true
          @names << { taxon_id: taxon_id,
                      name_string: name_string }
          puts "Processed %s names" % i if (i % 10_000).zero?
        end
      end
    end

    # Fills @core (header row plus one row per collected name, all tagged
    # with the "ICZN" nomenclatural code) and @eml, then hands over to the
    # parent implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], "ICZN"]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Charles Davies",
            last_name: "Sherborn" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Index Animalium is a monumental work that covers " \
                  "400 000 zoological names registered by science " \
                  "between 1758 and 1850",
        url: @url
      }
      super
    end
  end
end
|
@@ -1,18 +1,17 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceWikispecies < DwcaHunter::Resource
|
4
5
|
def initialize(opts = {})
|
5
|
-
@
|
6
|
+
@wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
|
7
|
+
@problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
6
8
|
@command = "wikispecies"
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
|
9
|
+
@title = "Wikispecies"
|
10
|
+
@url = "http://dumps.wikimedia.org/specieswiki/latest/" \
|
11
|
+
"specieswiki-latest-pages-articles.xml.bz2"
|
10
12
|
@url = opts[:url] if opts[:url]
|
11
|
-
@uuid =
|
12
|
-
@download_path = File.join(
|
13
|
-
'dwca_hunter',
|
14
|
-
'wikispecies',
|
15
|
-
'data.xml.bz2')
|
13
|
+
@uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
|
14
|
+
@download_path = File.join(@wikisp_path, "data.xml.bz2")
|
16
15
|
@data = []
|
17
16
|
@templates = {}
|
18
17
|
@taxon_ids = {}
|
@@ -21,7 +20,7 @@ module DwcaHunter
|
|
21
20
|
@extensions = []
|
22
21
|
@re = {
|
23
22
|
page_start: /^\s*\<page\>\s*$/,
|
24
|
-
page_end:
|
23
|
+
page_end: %r{^\s*\</page\>\s*$},
|
25
24
|
template: /Template:/i,
|
26
25
|
template_link: /\{\{([^\}]*)\}\}/,
|
27
26
|
vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
|
@@ -29,6 +28,11 @@ module DwcaHunter
|
|
29
28
|
super(opts)
|
30
29
|
end
|
31
30
|
|
31
|
+
# Downloads the Wikispecies dump from @url to @download_path with curl.
# @url may be supplied by the caller through opts[:url], so the arguments
# are passed as a list instead of being interpolated into a shell string.
def download
  puts "Downloading from the source"
  system("curl", "-L", @url, "-o", @download_path)
end
|
35
|
+
|
32
36
|
def unpack
|
33
37
|
unpack_bz2
|
34
38
|
end
|
@@ -39,22 +43,22 @@ module DwcaHunter
|
|
39
43
|
generate_dwca
|
40
44
|
end
|
41
45
|
|
42
|
-
|
46
|
+
private
|
43
47
|
|
44
48
|
def enrich_data
|
45
|
-
DwcaHunter
|
46
|
-
|
49
|
+
DwcaHunter.logger_write(object_id,
|
50
|
+
"Extracting data from xml file...")
|
47
51
|
Dir.chdir(@download_dir)
|
48
|
-
f = open(
|
52
|
+
f = open("data.xml", "r:utf-8")
|
49
53
|
page_on = false
|
50
|
-
page =
|
54
|
+
page = ""
|
51
55
|
page_num = 0
|
52
56
|
f.each do |l|
|
53
57
|
if l.match(@re[:page_start])
|
54
|
-
page
|
58
|
+
page += l
|
55
59
|
page_on = true
|
56
60
|
elsif page_on
|
57
|
-
page
|
61
|
+
page += l
|
58
62
|
if l.match(@re[:page_end])
|
59
63
|
page_on = false
|
60
64
|
page_xml = Nokogiri::XML.parse(page)
|
@@ -63,22 +67,22 @@ module DwcaHunter
|
|
63
67
|
process_species(page_xml)
|
64
68
|
page_num += 1
|
65
69
|
if page_num % BATCH_SIZE == 0
|
66
|
-
DwcaHunter
|
67
|
-
|
70
|
+
DwcaHunter.logger_write(object_id,
|
71
|
+
"Traversed %s pages" % page_num)
|
68
72
|
end
|
69
|
-
page =
|
73
|
+
page = ""
|
70
74
|
@page_title = nil
|
71
75
|
@page_id = nil
|
72
76
|
end
|
73
77
|
end
|
74
78
|
end
|
75
|
-
DwcaHunter
|
76
|
-
|
79
|
+
DwcaHunter.logger_write(object_id,
|
80
|
+
"Extracted total %s pages" % page_num)
|
77
81
|
f.close
|
78
82
|
end
|
79
83
|
|
80
84
|
def extend_classification
|
81
|
-
DwcaHunter
|
85
|
+
DwcaHunter.logger_write(object_id, "Extending classifications")
|
82
86
|
@data.each_with_index do |d, i|
|
83
87
|
unless d[:classificationPath].empty?
|
84
88
|
n = 50
|
@@ -100,19 +104,21 @@ module DwcaHunter
|
|
100
104
|
# d[:classificationPath] = d[:classificationPath].join("|").
|
101
105
|
# gsub("Main Page", "Life")
|
102
106
|
if i % BATCH_SIZE == 0 && i > 0
|
103
|
-
DwcaHunter
|
104
|
-
|
107
|
+
DwcaHunter.logger_write(object_id,
|
108
|
+
"Extended %s classifications" % i)
|
105
109
|
end
|
106
110
|
end
|
107
111
|
end
|
108
112
|
|
109
113
|
def update_tree(path)
|
110
114
|
path = path.dup
|
111
|
-
return if @paths.
|
115
|
+
return if @paths.key?(path.join("|"))
|
116
|
+
|
112
117
|
(0...path.size).each do |i|
|
113
118
|
subpath = path[0..i]
|
114
|
-
subpath_string = subpath.join(
|
115
|
-
next if @paths.
|
119
|
+
subpath_string = subpath.join("|")
|
120
|
+
next if @paths.key?(subpath_string)
|
121
|
+
|
116
122
|
name = subpath.pop
|
117
123
|
tree_element = subpath.inject(@tree) { |res, n| res[n] }
|
118
124
|
tree_element[name] = {}
|
@@ -121,27 +127,29 @@ module DwcaHunter
|
|
121
127
|
end
|
122
128
|
|
123
129
|
def process_template(x)
|
124
|
-
name = page_title(x).gsub!(@re[:template],
|
125
|
-
text = x.xpath(
|
130
|
+
name = page_title(x).gsub!(@re[:template], "").strip
|
131
|
+
text = x.xpath("//text").text.strip
|
126
132
|
parent_name = text.match(@re[:template_link])
|
127
133
|
if parent_name
|
128
134
|
return if parent_name[1].match(/\#if/)
|
135
|
+
|
129
136
|
list = parent_name[1].split("|")
|
130
|
-
if list.size == 1
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
+
parent_name = if list.size == 1
|
138
|
+
list[0]
|
139
|
+
elsif list[0].match(/Taxonav/i)
|
140
|
+
list[1]
|
141
|
+
else
|
142
|
+
list[0]
|
143
|
+
end
|
137
144
|
end
|
138
|
-
name.gsub!(/_/,
|
139
|
-
parent_name
|
145
|
+
name.gsub!(/_/, " ")
|
146
|
+
parent_name&.gsub!(/_/, " ")
|
140
147
|
@templates[name] = { parentName: parent_name, id: page_id(x) }
|
141
148
|
end
|
142
149
|
|
143
150
|
def process_species(x)
|
144
151
|
return if page_title(x).match(/Wikispecies/i)
|
152
|
+
|
145
153
|
items = find_species_components(x)
|
146
154
|
if items
|
147
155
|
@data << {
|
@@ -149,7 +157,8 @@ module DwcaHunter
|
|
149
157
|
canonicalForm: page_title(x),
|
150
158
|
scientificName: page_title(x),
|
151
159
|
classificationPath: [],
|
152
|
-
vernacularNames: []
|
160
|
+
vernacularNames: []
|
161
|
+
}
|
153
162
|
get_full_scientific_name(items)
|
154
163
|
get_vernacular_names(items)
|
155
164
|
init_classification_path(items)
|
@@ -157,8 +166,8 @@ module DwcaHunter
|
|
157
166
|
end
|
158
167
|
|
159
168
|
def get_full_scientific_name(items)
|
160
|
-
if items[
|
161
|
-
if name = items[
|
169
|
+
if items["name"]
|
170
|
+
if name = items["name"][0]
|
162
171
|
@data[-1][:scientificName] = parse_name(name, @data[-1])
|
163
172
|
else
|
164
173
|
@problems_file.write("%s\n" % @data[-1][:canonicalForm])
|
@@ -167,19 +176,20 @@ module DwcaHunter
|
|
167
176
|
end
|
168
177
|
|
169
178
|
def get_vernacular_names(items)
|
170
|
-
if items[
|
171
|
-
vn_string = items[
|
179
|
+
if items["vernacular names"] && !items["vernacular names"].empty?
|
180
|
+
vn_string = items["vernacular names"].join("")
|
172
181
|
vn = vn_string.match(@re[:vernacular_names])
|
173
182
|
if vn
|
174
183
|
vn_list = vn[1].strip.split("|")
|
175
184
|
vnames = []
|
176
185
|
vn_list.each do |item|
|
177
|
-
language, name = item.split("=").map
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
186
|
+
language, name = item.split("=").map(&:strip)
|
187
|
+
next unless language && name && language.size < 4 && name.valid_encoding?
|
188
|
+
|
189
|
+
vnames << {
|
190
|
+
name: name,
|
191
|
+
language: language
|
192
|
+
}
|
183
193
|
end
|
184
194
|
|
185
195
|
@data[-1][:vernacularNames] = vnames
|
@@ -188,26 +198,26 @@ module DwcaHunter
|
|
188
198
|
end
|
189
199
|
|
190
200
|
def init_classification_path(items)
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
end
|
201
|
+
# ignore non-template links
|
202
|
+
items["taxonavigation"]&.each do |line|
|
203
|
+
line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
|
204
|
+
next unless template_link = line.match(@re[:template_link])
|
205
|
+
|
206
|
+
template_link = template_link[1].
|
207
|
+
strip.gsub(/Template:/, "").gsub(/_/, " ")
|
208
|
+
unless template_link.match(/\|/)
|
209
|
+
@data[-1][:classificationPath] << template_link
|
210
|
+
break
|
202
211
|
end
|
203
212
|
end
|
204
213
|
end
|
205
214
|
|
206
215
|
def find_species_components(x)
|
207
|
-
items = get_items(x.xpath(
|
208
|
-
is_taxon_item = items.
|
209
|
-
items.
|
216
|
+
items = get_items(x.xpath("//text").text)
|
217
|
+
is_taxon_item = items.key?("name") ||
|
218
|
+
items.key?("taxonavigation")
|
210
219
|
return nil unless is_taxon_item
|
220
|
+
|
211
221
|
items
|
212
222
|
end
|
213
223
|
|
@@ -216,7 +226,7 @@ module DwcaHunter
|
|
216
226
|
items = {}
|
217
227
|
current_item = nil
|
218
228
|
txt.split("\n").each do |l|
|
219
|
-
item =
|
229
|
+
item = l.match(/[\=]+([^\=]+)[\=]+/)
|
220
230
|
if item
|
221
231
|
current_item = item[1].strip.downcase
|
222
232
|
items[current_item] = []
|
@@ -228,11 +238,11 @@ module DwcaHunter
|
|
228
238
|
end
|
229
239
|
|
230
240
|
def page_title(x)
|
231
|
-
@page_title ||= x.xpath(
|
241
|
+
@page_title ||= x.xpath("//title").first.text
|
232
242
|
end
|
233
243
|
|
234
244
|
def page_id(x)
|
235
|
-
@page_id ||= x.xpath(
|
245
|
+
@page_id ||= x.xpath("//id").first.text
|
236
246
|
end
|
237
247
|
|
238
248
|
def template?(page_xml)
|
@@ -240,110 +250,113 @@ module DwcaHunter
|
|
240
250
|
end
|
241
251
|
|
242
252
|
def parse_name(name_string, taxa)
|
243
|
-
name_string.gsub!(
|
253
|
+
name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
|
244
254
|
name_string = name_string.strip
|
245
255
|
old_l = name_string.dup
|
246
|
-
name_string.gsub!
|
256
|
+
name_string.gsub!(/^\*\s*/, "")
|
247
257
|
name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
|
248
258
|
name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
|
249
|
-
name_string.gsub!(/[']{2,}/,
|
250
|
-
name_string.gsub!(/["]{2,}/,
|
251
|
-
name_string.gsub!(/\:\s*\d.*$/,
|
252
|
-
name_string.gsub!(/,\s*\[RSD\]/i,
|
253
|
-
name_string.gsub!(/^\s*†\s*/,
|
254
|
-
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/,
|
259
|
+
name_string.gsub!(/[']{2,}/, " ")
|
260
|
+
name_string.gsub!(/["]{2,}/, " ")
|
261
|
+
name_string.gsub!(/\:\s*\d.*$/, "")
|
262
|
+
name_string.gsub!(/,\s*\[RSD\]/i, "")
|
263
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
264
|
+
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
|
255
265
|
# name_string = DwcaHunter::XML.unescape(name_string)
|
256
|
-
name_string.gsub!(/\<nowiki\>.*$/,
|
257
|
-
name_string.gsub!(
|
258
|
-
name_string.gsub!(/^\s*\†\s*/,
|
259
|
-
name_string.gsub!(/ /,
|
260
|
-
name_string.gsub!(/\s+/,
|
266
|
+
name_string.gsub!(/\<nowiki\>.*$/, "")
|
267
|
+
name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
|
268
|
+
name_string.gsub!(/^\s*\†\s*/, "")
|
269
|
+
name_string.gsub!(/ /, " ")
|
270
|
+
name_string.gsub!(/\s+/, " ")
|
261
271
|
name_string = name_string.strip
|
262
272
|
# puts "%s---%s" % [name_string, old_l]
|
263
|
-
|
273
|
+
name_string
|
264
274
|
end
|
265
275
|
|
266
276
|
def generate_dwca
|
267
|
-
DwcaHunter
|
268
|
-
|
277
|
+
DwcaHunter.logger_write(object_id,
|
278
|
+
"Creating DarwinCore Archive file")
|
269
279
|
@core = [
|
270
|
-
[
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
'http://rs.tdwg.org/dwc/terms/higherClassification',
|
275
|
-
'http://purl.org/dc/terms/source']
|
280
|
+
["http://rs.tdwg.org/dwc/terms/taxonID",
|
281
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
282
|
+
"http://globalnames.org/terms/canonicalForm",
|
283
|
+
"http://purl.org/dc/terms/source"]
|
276
284
|
]
|
277
|
-
DwcaHunter
|
285
|
+
DwcaHunter.logger_write(object_id, "Assembling Core Data")
|
278
286
|
count = 0
|
279
287
|
@data.map do |d|
|
280
288
|
count += 1
|
281
289
|
if count % BATCH_SIZE == 0
|
282
|
-
DwcaHunter
|
283
|
-
|
290
|
+
DwcaHunter.logger_write(object_id,
|
291
|
+
"Traversing %s core data record" % count)
|
284
292
|
end
|
285
|
-
taxon_id =
|
286
|
-
|
287
|
-
|
288
|
-
|
293
|
+
taxon_id = begin
|
294
|
+
(d[:classificationPath].empty? ?
|
295
|
+
d[:taxonId] :
|
296
|
+
@templates[d[:classificationPath].
|
297
|
+
last][:id])
|
298
|
+
rescue StandardError
|
299
|
+
d[:taxonId]
|
300
|
+
end
|
289
301
|
@taxon_ids[d[:taxonId]] = taxon_id
|
290
|
-
parentNameUsageId =
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
302
|
+
parentNameUsageId = begin
|
303
|
+
(d[:classificationPath].size > 1 ?
|
304
|
+
@templates[d[:classificationPath][-2]][:id] :
|
305
|
+
nil)
|
306
|
+
rescue StandardError
|
307
|
+
nil
|
308
|
+
end
|
309
|
+
url = "http://species.wikimedia.org/wiki/" +
|
310
|
+
URI.encode(d[:canonicalForm].gsub(" ", "_"))
|
295
311
|
path = d[:classificationPath]
|
296
312
|
path.pop if path[-1] == d[:canonicalForm]
|
297
|
-
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/,
|
298
|
-
scientific_name =
|
313
|
+
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
|
314
|
+
scientific_name = d[:scientificName] == d[:canonicalForm] ?
|
299
315
|
canonical_form :
|
300
316
|
d[:scientificName]
|
301
317
|
@core << [taxon_id,
|
302
318
|
scientific_name,
|
303
|
-
parentNameUsageId,
|
304
319
|
canonical_form,
|
305
|
-
path.join('|'),
|
306
320
|
url]
|
307
321
|
end
|
308
322
|
@extensions << { data: [[
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
]], file_name:
|
313
|
-
DwcaHunter
|
314
|
-
|
323
|
+
"http://rs.tdwg.org/dwc/terms/TaxonID",
|
324
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
325
|
+
"http://purl.org/dc/terms/language"
|
326
|
+
]], file_name: "vernacular_names.txt" }
|
327
|
+
DwcaHunter.logger_write(object_id,
|
328
|
+
"Creating verncaular name extension for DarwinCore Archive file")
|
315
329
|
count = 0
|
316
330
|
@data.each do |d|
|
317
331
|
count += 1
|
318
332
|
if count % BATCH_SIZE == 0
|
319
|
-
DwcaHunter
|
320
|
-
|
333
|
+
DwcaHunter.logger_write(object_id,
|
334
|
+
"Traversing %s extension data record" % count)
|
321
335
|
end
|
322
336
|
d[:vernacularNames].each do |vn|
|
323
|
-
taxon_id = @taxon_ids[d[:taxonId]]
|
324
|
-
if taxon_id
|
325
|
-
@extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
|
326
|
-
end
|
337
|
+
taxon_id = @taxon_ids[d[:taxonId]] || nil
|
338
|
+
@extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
|
327
339
|
end
|
328
340
|
end
|
329
341
|
@eml = {
|
330
342
|
id: @uuid,
|
331
343
|
title: @title,
|
332
|
-
license:
|
344
|
+
license: "http://creativecommons.org/licenses/by-sa/3.0/",
|
333
345
|
authors: [
|
334
|
-
{ first_name:
|
335
|
-
last_name:
|
336
|
-
email:
|
337
|
-
url:
|
338
|
-
|
346
|
+
{ first_name: "Stephen",
|
347
|
+
last_name: "Thorpe",
|
348
|
+
email: "stephen_thorpe@yahoo.co.nz",
|
349
|
+
url: "http://species.wikimedia.org/wiki/Main_Page" }
|
350
|
+
],
|
351
|
+
abstract: "The free species directory that anyone can edit.",
|
339
352
|
metadata_providers: [
|
340
|
-
{ first_name:
|
341
|
-
last_name:
|
342
|
-
email:
|
343
|
-
|
353
|
+
{ first_name: "Dmitry",
|
354
|
+
last_name: "Mozzherin",
|
355
|
+
email: "dmozzherin@mbl.edu" }
|
356
|
+
],
|
357
|
+
url: "http://species.wikimedia.org/wiki/Main_Page"
|
344
358
|
}
|
345
359
|
super
|
346
360
|
end
|
347
|
-
|
348
361
|
end
|
349
362
|
end
|