taxpub 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +29 -0
- data/lib/taxpub.rb +242 -0
- data/lib/taxpub/exceptions.rb +5 -0
- data/lib/taxpub/reference.rb +68 -0
- data/lib/taxpub/utils.rb +20 -0
- data/lib/taxpub/validator.rb +31 -0
- data/lib/taxpub/version.rb +7 -0
- metadata +123 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8483945f3e3ce760f89271e2f09b6cdb75887766
|
4
|
+
data.tar.gz: 2b697ef9a5bb87bd9bf522be5c0acb0cf4a62e77
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a339a2fe4cb2eb53c4e37a73ca239363c27618622cf9ab693cdf0664d6a85d6c1985f5441b58fd058f1d304b386419670e4f2ecdcc298df2f24d5f5c7550d3f2
|
7
|
+
data.tar.gz: '05096aab5b7b4db19eafe9c4d8cd7a780b77586a06142d4d25e3667ad985df8eaee0c97f12f5f074c0d4ef8b34caeae76d2fca260c2908f3155ef5241746f14c'
|
data/README.rdoc
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
= taxpub
|
2
|
+
|
3
|
+
Ruby 2.4.1 gem to parse TaxPub documents like those produced by Pensoft Publishers, https://pensoft.net/.
|
4
|
+
|
5
|
+
TaxPub Background: https://www.ncbi.nlm.nih.gov/books/NBK47081/
|
6
|
+
|
7
|
+
== Usage
|
8
|
+
|
9
|
+
> require "taxpub"
|
10
|
+
> tp = TaxPub.new
|
11
|
+
> tp.url = "https://tdwgproceedings.pensoft.net/article/19829/download/xml/"
|
12
|
+
> tp.parse
|
13
|
+
> tp.doi
|
14
|
+
|
15
|
+
== Contributing to taxpub
|
16
|
+
|
17
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
18
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
|
19
|
+
* Fork the project.
|
20
|
+
* Start a feature/bugfix branch.
|
21
|
+
* Commit and push until you are happy with your contribution.
|
22
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
23
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
|
24
|
+
|
25
|
+
== Copyright
|
26
|
+
|
27
|
+
Copyright (c) 2017 David Shorthouse. See LICENSE.txt for
|
28
|
+
further details.
|
29
|
+
|
data/lib/taxpub.rb
ADDED
@@ -0,0 +1,242 @@
|
|
1
|
+
require "taxpub/exceptions"
|
2
|
+
require "taxpub/validator"
|
3
|
+
require "taxpub/utils"
|
4
|
+
require "taxpub/reference"
|
5
|
+
require "taxpub/version"
|
6
|
+
require "nokogiri"
|
7
|
+
require "open-uri"
|
8
|
+
require "set"
|
9
|
+
|
10
|
+
class TaxPub
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@parameters = {}
|
14
|
+
@doc = {}
|
15
|
+
end
|
16
|
+
|
17
|
+
##
|
18
|
+
# View the built parameters
|
19
|
+
#
|
20
|
+
def params
|
21
|
+
@parameters
|
22
|
+
end
|
23
|
+
|
24
|
+
##
|
25
|
+
# Specify a remote TaxPub URL
|
26
|
+
# Source must be an xml file
|
27
|
+
#
|
28
|
+
# == Example
|
29
|
+
#
|
30
|
+
# instance.url = "https://tdwgproceedings.pensoft.net/article/15141/download/xml/"
|
31
|
+
#
|
32
|
+
def url=(url)
|
33
|
+
Validator.validate_url(url)
|
34
|
+
@parameters[:url] = url
|
35
|
+
end
|
36
|
+
|
37
|
+
def url
|
38
|
+
@parameters[:url] || nil
|
39
|
+
end
|
40
|
+
|
41
|
+
##
|
42
|
+
# Set a file path for a TaxPub XML file
|
43
|
+
#
|
44
|
+
# == Example
|
45
|
+
#
|
46
|
+
# instance.file_path = "/Users/jane/Desktop/taxpub.xml"
|
47
|
+
#
|
48
|
+
def file_path=(file_path)
|
49
|
+
Validator.validate_type(file_path, 'File')
|
50
|
+
@parameters[:file] = File.new(file_path, "r")
|
51
|
+
end
|
52
|
+
|
53
|
+
def file_path
|
54
|
+
@parameters[:file].path rescue nil
|
55
|
+
end
|
56
|
+
|
57
|
+
##
|
58
|
+
# Build the Nokogiri document
|
59
|
+
#
|
60
|
+
def parse
|
61
|
+
if url
|
62
|
+
@doc = Nokogiri::XML(open(url))
|
63
|
+
elsif file_path
|
64
|
+
@doc = File.open(file_path) { |f| Nokogiri::XML(f) }
|
65
|
+
end
|
66
|
+
Validator.validate_nokogiri(@doc)
|
67
|
+
end
|
68
|
+
|
69
|
+
##
|
70
|
+
# View the parsed Nokogiri document
|
71
|
+
#
|
72
|
+
def doc
|
73
|
+
@doc
|
74
|
+
end
|
75
|
+
|
76
|
+
##
|
77
|
+
# Get the raw text content of the Nokogiri document
|
78
|
+
#
|
79
|
+
def content
|
80
|
+
Utils.clean_text(@doc.text)
|
81
|
+
end
|
82
|
+
|
83
|
+
##
|
84
|
+
# Get the DOI
|
85
|
+
#
|
86
|
+
def doi
|
87
|
+
Validator.validate_nokogiri(@doc)
|
88
|
+
Utils.expand_doi(@doc.xpath("//*/article-meta/article-id[@pub-id-type='doi']").text)
|
89
|
+
end
|
90
|
+
|
91
|
+
##
|
92
|
+
# Get the title
|
93
|
+
#
|
94
|
+
def title
|
95
|
+
Validator.validate_nokogiri(@doc)
|
96
|
+
t = @doc.xpath("//*/article-meta/title-group/article-title").text
|
97
|
+
Utils.clean_text(t)
|
98
|
+
end
|
99
|
+
|
100
|
+
##
|
101
|
+
# Get the abstract
|
102
|
+
#
|
103
|
+
def abstract
|
104
|
+
Validator.validate_nokogiri(@doc)
|
105
|
+
a = @doc.xpath("//*/article-meta/abstract").text
|
106
|
+
Utils.clean_text(a)
|
107
|
+
end
|
108
|
+
|
109
|
+
##
|
110
|
+
# Get the keywords
|
111
|
+
#
|
112
|
+
def keywords
|
113
|
+
Validator.validate_nokogiri(@doc)
|
114
|
+
@doc.xpath("//*/article-meta/kwd-group/kwd")
|
115
|
+
.map{|a| Utils.clean_text(a.text)}
|
116
|
+
end
|
117
|
+
|
118
|
+
##
|
119
|
+
# Get the authors
|
120
|
+
#
|
121
|
+
def authors
|
122
|
+
Validator.validate_nokogiri(@doc)
|
123
|
+
data = []
|
124
|
+
@doc.xpath("//*/contrib[@contrib-type='author']").each do |author|
|
125
|
+
affiliations = []
|
126
|
+
author.xpath("xref/@rid").each do |rid|
|
127
|
+
xpath = "//*/aff[@id='#{rid}']/addr-line"
|
128
|
+
affiliations << Utils.clean_text(@doc.xpath(xpath).text)
|
129
|
+
end
|
130
|
+
orcid = author.xpath("uri[@content-type='orcid']").text
|
131
|
+
given = Utils.clean_text(author.xpath("name/given-names").text)
|
132
|
+
surname = Utils.clean_text(author.xpath("name/surname").text)
|
133
|
+
data << {
|
134
|
+
given: given,
|
135
|
+
surname: surname,
|
136
|
+
fullname: [given, surname].join(" "),
|
137
|
+
email: author.xpath("email").text,
|
138
|
+
affiliations: affiliations,
|
139
|
+
orcid: orcid
|
140
|
+
}
|
141
|
+
end
|
142
|
+
data
|
143
|
+
end
|
144
|
+
|
145
|
+
##
|
146
|
+
# Get the conference part of a proceeding
|
147
|
+
#
|
148
|
+
def conference_part
|
149
|
+
Validator.validate_nokogiri(@doc)
|
150
|
+
xpath = "//*/subj-group[@subj-group-type='conference-part']/subject"
|
151
|
+
coll = @doc.xpath(xpath).text
|
152
|
+
Utils.clean_text(coll)
|
153
|
+
end
|
154
|
+
|
155
|
+
##
|
156
|
+
# Get the presenting author of a proceeding
|
157
|
+
#
|
158
|
+
def presenting_author
|
159
|
+
Validator.validate_nokogiri(@doc)
|
160
|
+
xpath = "//*/sec[@sec-type='Presenting author']/p"
|
161
|
+
author = @doc.xpath(xpath).text
|
162
|
+
Utils.clean_text(author)
|
163
|
+
end
|
164
|
+
|
165
|
+
##
|
166
|
+
# Get the corresponding author
|
167
|
+
#
|
168
|
+
def corresponding_author
|
169
|
+
Validator.validate_nokogiri(@doc)
|
170
|
+
xpath = "//*/author-notes/fn[@fn-type='corresp']/p"
|
171
|
+
author_string = Utils.clean_text(@doc.xpath(xpath).text)
|
172
|
+
author_string.gsub("Corresponding author: ", "").chomp(".")
|
173
|
+
end
|
174
|
+
|
175
|
+
##
|
176
|
+
# Get the ranked taxa
|
177
|
+
#
|
178
|
+
def ranked_taxa
|
179
|
+
Validator.validate_nokogiri(@doc)
|
180
|
+
names = Set.new
|
181
|
+
@doc.xpath("//*//tp:taxon-name").each do |taxon|
|
182
|
+
tp = {}
|
183
|
+
taxon.children.each do |child|
|
184
|
+
next if !child.has_attribute?("taxon-name-part-type")
|
185
|
+
rank = child.attributes["taxon-name-part-type"].value.to_sym
|
186
|
+
if child.has_attribute?("reg")
|
187
|
+
tp[rank] = child.attributes["reg"].value
|
188
|
+
else
|
189
|
+
tp[rank] = child.text
|
190
|
+
end
|
191
|
+
end
|
192
|
+
names.add(tp)
|
193
|
+
end
|
194
|
+
names.to_a
|
195
|
+
end
|
196
|
+
|
197
|
+
##
|
198
|
+
# Get occurrences with dwc keys
|
199
|
+
#
|
200
|
+
def occurrences
|
201
|
+
Validator.validate_nokogiri(@doc)
|
202
|
+
data = []
|
203
|
+
@doc.xpath("//*/list[@list-content='occurrences']/list-item").each do |occ|
|
204
|
+
obj = {}
|
205
|
+
occ.xpath("*/named-content").each do |dwc|
|
206
|
+
prefix = dwc.attributes["content-type"].text.gsub(/dwc\:/, "")
|
207
|
+
obj[prefix.to_sym] = dwc.text
|
208
|
+
end
|
209
|
+
data << obj
|
210
|
+
end
|
211
|
+
data
|
212
|
+
end
|
213
|
+
|
214
|
+
##
|
215
|
+
# Get the figures
|
216
|
+
#
|
217
|
+
def figures
|
218
|
+
Validator.validate_nokogiri(@doc)
|
219
|
+
data = []
|
220
|
+
@doc.xpath("//*/fig").each do |fig|
|
221
|
+
data << {
|
222
|
+
label: Utils.clean_text(fig.xpath("label").text),
|
223
|
+
caption: Utils.clean_text(fig.xpath("caption").text),
|
224
|
+
graphic: {
|
225
|
+
href: fig.xpath("graphic").attribute("href").text,
|
226
|
+
id: fig.xpath("graphic").attribute("id").text
|
227
|
+
}
|
228
|
+
}
|
229
|
+
end
|
230
|
+
data
|
231
|
+
end
|
232
|
+
|
233
|
+
##
|
234
|
+
# Get the cited references
|
235
|
+
#
|
236
|
+
def references
|
237
|
+
Validator.validate_nokogiri(@doc)
|
238
|
+
xpath = "//*/ref-list/ref"
|
239
|
+
@doc.xpath(xpath).map{ |r| Reference.parse(r) }
|
240
|
+
end
|
241
|
+
|
242
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
class TaxPub
|
2
|
+
class Reference
|
3
|
+
|
4
|
+
def self.parse(ref)
|
5
|
+
ele = ref.at_xpath("element-citation") || ref.at_xpath("mixed-citation")
|
6
|
+
|
7
|
+
auths = []
|
8
|
+
ele.xpath("person-group/name").each do |name|
|
9
|
+
auths << {
|
10
|
+
surname: name.xpath("surname").text,
|
11
|
+
given_names: name.xpath("given-names").text
|
12
|
+
}
|
13
|
+
end
|
14
|
+
|
15
|
+
institution = ele.xpath("institution").text
|
16
|
+
year = ele.xpath("year").text
|
17
|
+
title = ele.xpath("article-title").text.chomp(".")
|
18
|
+
source = ele.xpath("source").text.chomp(".")
|
19
|
+
volume = ele.xpath("volume").text
|
20
|
+
pages = [ele.xpath("fpage"), ele.xpath("lpage")].reject(&:empty?).join("–")
|
21
|
+
|
22
|
+
if ref.at_xpath("element-citation")
|
23
|
+
doi = Utils.expand_doi(ele.xpath("pub-id[@pub-id-type='doi']").text)
|
24
|
+
uri = ele.xpath("uri").text
|
25
|
+
end
|
26
|
+
|
27
|
+
if ref.at_xpath("mixed-citation")
|
28
|
+
doi = Utils.expand_doi(ele.xpath("ext-link[@ext-link-type='doi']").text)
|
29
|
+
uri = ele.xpath("ext-link[@ext-link-type='uri']").text
|
30
|
+
end
|
31
|
+
|
32
|
+
link = !doi.empty? ? doi : uri
|
33
|
+
|
34
|
+
{
|
35
|
+
title: title,
|
36
|
+
institution: institution,
|
37
|
+
authors: auths,
|
38
|
+
year: year,
|
39
|
+
source: source,
|
40
|
+
volume: volume,
|
41
|
+
pages: pages,
|
42
|
+
doi: doi,
|
43
|
+
uri: uri,
|
44
|
+
full_citation: [
|
45
|
+
institution,
|
46
|
+
self.authors_to_string(auths),
|
47
|
+
year,
|
48
|
+
title,
|
49
|
+
[
|
50
|
+
source,
|
51
|
+
[volume, pages].reject(&:empty?).join(": ")
|
52
|
+
].reject(&:empty?).join(" "),
|
53
|
+
link
|
54
|
+
].reject(&:empty?).join(". ")
|
55
|
+
}
|
56
|
+
end
|
57
|
+
|
58
|
+
def self.authors_to_string(auths)
|
59
|
+
authors = auths.dup
|
60
|
+
return "" if authors.empty?
|
61
|
+
first = authors.first.values.join(", ")
|
62
|
+
authors.shift
|
63
|
+
remaining = authors.map{|a| a.values.reverse.join(" ")}.join(", ")
|
64
|
+
[first, remaining].reject(&:empty?).join(", ")
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
data/lib/taxpub/utils.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
class TaxPub
|
2
|
+
class Utils
|
3
|
+
|
4
|
+
def self.clean_text(text)
|
5
|
+
text.encode("UTF-8", :undef => :replace, :invalid => :replace, :replace => " ")
|
6
|
+
.gsub(/[[:space:]]/, " ")
|
7
|
+
.chomp(",")
|
8
|
+
.split
|
9
|
+
.join(" ")
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.expand_doi(doi)
|
13
|
+
if doi[0..2] == "10."
|
14
|
+
doi.prepend("https://doi.org/")
|
15
|
+
end
|
16
|
+
doi
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require "uri"
|
2
|
+
|
3
|
+
class TaxPub
|
4
|
+
class Validator
|
5
|
+
|
6
|
+
def self.validate_url(data)
|
7
|
+
validate_type(data, 'String')
|
8
|
+
if data !~ /\A#{URI::regexp(['http', 'https'])}\z/
|
9
|
+
raise InvalidParameterValueError, "URL must be in the form http:// or https://"
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.validate_nokogiri(data)
|
14
|
+
if !data.is_a?(Nokogiri::XML::Document)
|
15
|
+
raise InvalidTypeError, "Must be a Nokogiri XML document or the parse method has not been executed"
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.validate_type(data, type)
|
20
|
+
case type
|
21
|
+
when 'String', 'Array', 'Integer', 'Hash'
|
22
|
+
raise InvalidParameterValueError, "Must be a #{type}" unless data.is_a?(Object.const_get(type))
|
23
|
+
when 'Boolean'
|
24
|
+
raise InvalidParameterValueError, "Must be a Boolean" unless [true, false].include?(data)
|
25
|
+
when 'File'
|
26
|
+
raise InvalidParameterValueError, "Must be a file path & file must exist" unless File.file?(data)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: taxpub
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David P. Shorthouse
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-07-25 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '11.1'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '11.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.4'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.4'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bundler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.10'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: byebug
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '9.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '9.0'
|
83
|
+
description: Parses TaxPub XML documents and adds methods to pull out conference data,
|
84
|
+
ranked taxa, occurrences, references, etc.
|
85
|
+
email: davidpshorthouse@gmail.com
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- README.rdoc
|
91
|
+
- lib/taxpub.rb
|
92
|
+
- lib/taxpub/exceptions.rb
|
93
|
+
- lib/taxpub/reference.rb
|
94
|
+
- lib/taxpub/utils.rb
|
95
|
+
- lib/taxpub/validator.rb
|
96
|
+
- lib/taxpub/version.rb
|
97
|
+
homepage: https://github.com/dshorthouse/taxpub
|
98
|
+
licenses:
|
99
|
+
- MIT
|
100
|
+
metadata: {}
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options:
|
103
|
+
- "--encoding"
|
104
|
+
- UTF-8
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
requirements: []
|
118
|
+
rubyforge_project:
|
119
|
+
rubygems_version: 2.6.12
|
120
|
+
signing_key:
|
121
|
+
specification_version: 4
|
122
|
+
summary: Parse TaxPub XML documents
|
123
|
+
test_files: []
|