taxpub 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8483945f3e3ce760f89271e2f09b6cdb75887766
4
+ data.tar.gz: 2b697ef9a5bb87bd9bf522be5c0acb0cf4a62e77
5
+ SHA512:
6
+ metadata.gz: a339a2fe4cb2eb53c4e37a73ca239363c27618622cf9ab693cdf0664d6a85d6c1985f5441b58fd058f1d304b386419670e4f2ecdcc298df2f24d5f5c7550d3f2
7
+ data.tar.gz: '05096aab5b7b4db19eafe9c4d8cd7a780b77586a06142d4d25e3667ad985df8eaee0c97f12f5f074c0d4ef8b34caeae76d2fca260c2908f3155ef5241746f14c'
@@ -0,0 +1,29 @@
1
+ = taxpub
2
+
3
+ Ruby 2.4.1 gem to parse TaxPub documents like those produced by Pensoft Publishers, https://pensoft.net/.
4
+
5
+ TaxPub Background: https://www.ncbi.nlm.nih.gov/books/NBK47081/
6
+
7
+ == Usage
8
+
9
+ > require "taxpub"
10
+ > tp = TaxPub.new
11
+ > tp.url = "https://tdwgproceedings.pensoft.net/article/19829/download/xml/"
12
+ > tp.parse
13
+ > tp.doi
14
+
15
+ == Contributing to taxpub
16
+
17
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
18
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
19
+ * Fork the project.
20
+ * Start a feature/bugfix branch.
21
+ * Commit and push until you are happy with your contribution.
22
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
23
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or a change is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
24
+
25
+ == Copyright
26
+
27
+ Copyright (c) 2017 David Shorthouse. See LICENSE.txt for
28
+ further details.
29
+
@@ -0,0 +1,242 @@
1
+ require "taxpub/exceptions"
2
+ require "taxpub/validator"
3
+ require "taxpub/utils"
4
+ require "taxpub/reference"
5
+ require "taxpub/version"
6
+ require "nokogiri"
7
+ require "open-uri"
8
+ require "set"
9
+
10
##
# Reads a TaxPub XML article from a URL or a local file and exposes
# accessors that pull individual pieces (DOI, title, authors, taxa,
# occurrences, figures, references, ...) out of the parsed document.
#
# Every accessor except #params, #url, #file_path and #doc requires #parse
# to have been called first; otherwise Validator.validate_nokogiri raises.
#
class TaxPub

  def initialize
    @parameters = {}
    # Placeholder until #parse replaces it with a Nokogiri document.
    @doc = {}
  end

  ##
  # View the built parameters
  #
  def params
    @parameters
  end

  ##
  # Specify a remote TaxPub URL
  # Source must be an xml file
  #
  # == Example
  #
  #   instance.url = "https://tdwgproceedings.pensoft.net/article/15141/download/xml/"
  #
  def url=(url)
    Validator.validate_url(url)
    @parameters[:url] = url
  end

  ##
  # The URL set through #url=, or nil when none was set
  #
  def url
    @parameters[:url]
  end

  ##
  # Set a file path for a TaxPub XML file
  #
  # == Example
  #
  #   instance.file_path = "/Users/jane/Desktop/taxpub.xml"
  #
  def file_path=(file_path)
    Validator.validate_type(file_path, 'File')
    # Only the path is read back later (#parse reopens the file itself), so
    # the handle is closed right away instead of staying open for the
    # lifetime of this instance.
    handle = File.new(file_path, "r")
    handle.close
    @parameters[:file] = handle
  end

  ##
  # The path set through #file_path=, or nil when none was set
  #
  def file_path
    @parameters[:file]&.path
  end

  ##
  # Build the Nokogiri document
  #
  # The URL takes precedence when both a URL and a file path were set.
  # Raises through Validator.validate_nokogiri when neither source
  # produced a document.
  #
  def parse
    if url
      # URI#open comes from open-uri. Kernel#open no longer opens URLs on
      # Ruby 3.0+, and the block form closes the stream once it is parsed.
      @doc = URI.parse(url).open { |io| Nokogiri::XML(io) }
    elsif file_path
      @doc = File.open(file_path) { |f| Nokogiri::XML(f) }
    end
    Validator.validate_nokogiri(@doc)
  end

  ##
  # View the parsed Nokogiri document
  #
  def doc
    @doc
  end

  ##
  # Get the raw text content of the Nokogiri document
  #
  def content
    # Validated like every other accessor so that calling it before #parse
    # fails with the same error instead of a NoMethodError.
    Validator.validate_nokogiri(@doc)
    Utils.clean_text(@doc.text)
  end

  ##
  # Get the DOI
  #
  def doi
    Validator.validate_nokogiri(@doc)
    Utils.expand_doi(@doc.xpath("//*/article-meta/article-id[@pub-id-type='doi']").text)
  end

  ##
  # Get the title
  #
  def title
    Validator.validate_nokogiri(@doc)
    Utils.clean_text(@doc.xpath("//*/article-meta/title-group/article-title").text)
  end

  ##
  # Get the abstract
  #
  def abstract
    Validator.validate_nokogiri(@doc)
    Utils.clean_text(@doc.xpath("//*/article-meta/abstract").text)
  end

  ##
  # Get the keywords
  #
  # Returns an Array of cleaned keyword strings.
  #
  def keywords
    Validator.validate_nokogiri(@doc)
    @doc.xpath("//*/article-meta/kwd-group/kwd").map { |kwd| Utils.clean_text(kwd.text) }
  end

  ##
  # Get the authors
  #
  # Returns an Array of Hashes with the keys :given, :surname, :fullname,
  # :email, :affiliations (Array of Strings) and :orcid.
  #
  def authors
    Validator.validate_nokogiri(@doc)
    @doc.xpath("//*/contrib[@contrib-type='author']").map do |author|
      # Each xref/@rid is used to look up an <aff> with that id elsewhere
      # in the document.
      affiliations = author.xpath("xref/@rid").map do |rid|
        Utils.clean_text(@doc.xpath("//*/aff[@id='#{rid}']/addr-line").text)
      end
      given = Utils.clean_text(author.xpath("name/given-names").text)
      surname = Utils.clean_text(author.xpath("name/surname").text)
      {
        given: given,
        surname: surname,
        fullname: [given, surname].join(" "),
        email: author.xpath("email").text,
        affiliations: affiliations,
        orcid: author.xpath("uri[@content-type='orcid']").text
      }
    end
  end

  ##
  # Get the conference part of a proceeding
  #
  def conference_part
    Validator.validate_nokogiri(@doc)
    xpath = "//*/subj-group[@subj-group-type='conference-part']/subject"
    Utils.clean_text(@doc.xpath(xpath).text)
  end

  ##
  # Get the presenting author of a proceeding
  #
  def presenting_author
    Validator.validate_nokogiri(@doc)
    xpath = "//*/sec[@sec-type='Presenting author']/p"
    Utils.clean_text(@doc.xpath(xpath).text)
  end

  ##
  # Get the corresponding author
  #
  # The "Corresponding author: " label and one trailing period are removed.
  #
  def corresponding_author
    Validator.validate_nokogiri(@doc)
    xpath = "//*/author-notes/fn[@fn-type='corresp']/p"
    author_string = Utils.clean_text(@doc.xpath(xpath).text)
    author_string.gsub("Corresponding author: ", "").chomp(".")
  end

  ##
  # Get the ranked taxa
  #
  # Returns an Array of unique Hashes keyed by the taxon-name-part-type of
  # each name part (as a Symbol).
  #
  def ranked_taxa
    Validator.validate_nokogiri(@doc)
    names = Set.new
    @doc.xpath("//*//tp:taxon-name").each do |taxon|
      parts = {}
      taxon.children.each do |child|
        next unless child.has_attribute?("taxon-name-part-type")
        rank = child.attributes["taxon-name-part-type"].value.to_sym
        # The "reg" attribute, when present, wins over the element text.
        parts[rank] = if child.has_attribute?("reg")
                        child.attributes["reg"].value
                      else
                        child.text
                      end
      end
      names.add(parts)
    end
    names.to_a
  end

  ##
  # Get occurrences with dwc keys
  #
  # Returns one Hash per occurrence list item; keys are the content-type
  # of each named-content element with any "dwc:" removed, as Symbols.
  #
  def occurrences
    Validator.validate_nokogiri(@doc)
    @doc.xpath("//*/list[@list-content='occurrences']/list-item").map do |occ|
      occ.xpath("*/named-content").each_with_object({}) do |dwc, obj|
        content_type = dwc.attributes["content-type"]
        # Without a content-type there is no key to store the value under.
        next if content_type.nil?
        obj[content_type.text.gsub(/dwc\:/, "").to_sym] = dwc.text
      end
    end
  end

  ##
  # Get the figures
  #
  # Returns an Array of Hashes with :label, :caption and :graphic
  # (:href and :id, each nil when the figure lacks that attribute).
  #
  def figures
    Validator.validate_nokogiri(@doc)
    @doc.xpath("//*/fig").map do |fig|
      graphic = fig.xpath("graphic")
      {
        label: Utils.clean_text(fig.xpath("label").text),
        caption: Utils.clean_text(fig.xpath("caption").text),
        graphic: {
          # Safe navigation: a figure without a graphic (or without the
          # attribute) yields nil rather than raising NoMethodError.
          href: graphic.attribute("href")&.text,
          id: graphic.attribute("id")&.text
        }
      }
    end
  end

  ##
  # Get the cited references
  #
  # Returns an Array with the result of Reference.parse for each ref.
  #
  def references
    Validator.validate_nokogiri(@doc)
    @doc.xpath("//*/ref-list/ref").map { |r| Reference.parse(r) }
  end

end
@@ -0,0 +1,5 @@
1
class TaxPub
  ##
  # Base class for errors raised by this gem.
  #
  class Error < RuntimeError
  end

  ##
  # Raised when a supplied value fails validation.
  #
  class InvalidParameterValueError < Error
  end

  ##
  # Raised when an object is not of the required type. It descends from
  # TypeError, not from TaxPub::Error, so rescuing TaxPub::Error does
  # not catch it.
  #
  class InvalidTypeError < TypeError
  end
end
@@ -0,0 +1,68 @@
1
class TaxPub
  ##
  # Turns a single <ref> node into a Hash of citation fields.
  #
  class Reference

    ##
    # Parse one reference node.
    #
    # +ref+ must respond to at_xpath and contain an element-citation or a
    # mixed-citation child; element-citation is preferred when both exist.
    # A ref with neither raises NoMethodError.
    #
    # Returns a Hash with the keys :title, :institution, :authors, :year,
    # :source, :volume, :pages, :doi, :uri and :full_citation.
    #
    def self.parse(ref)
      # The two citation kinds keep their DOI and URI in different child
      # elements, so the selectors are chosen together with the element.
      ele = ref.at_xpath("element-citation")
      if ele
        doi_path = "pub-id[@pub-id-type='doi']"
        uri_path = "uri"
      else
        ele = ref.at_xpath("mixed-citation")
        doi_path = "ext-link[@ext-link-type='doi']"
        uri_path = "ext-link[@ext-link-type='uri']"
      end

      auths = ele.xpath("person-group/name").map do |name|
        {
          surname: name.xpath("surname").text,
          given_names: name.xpath("given-names").text
        }
      end

      institution = ele.xpath("institution").text
      year = ele.xpath("year").text
      title = ele.xpath("article-title").text.chomp(".")
      source = ele.xpath("source").text.chomp(".")
      volume = ele.xpath("volume").text
      # Join the page *text*; joining the node sets themselves would put
      # serialized markup into the result instead of the page numbers.
      pages = [ele.xpath("fpage").text, ele.xpath("lpage").text]
              .reject(&:empty?)
              .join("–")

      doi = Utils.expand_doi(ele.xpath(doi_path).text)
      uri = ele.xpath(uri_path).text

      # The DOI is the preferred link; the plain URI is the fallback.
      link = doi.empty? ? uri : doi
      locator = [source, [volume, pages].reject(&:empty?).join(": ")]
                .reject(&:empty?)
                .join(" ")

      {
        title: title,
        institution: institution,
        authors: auths,
        year: year,
        source: source,
        volume: volume,
        pages: pages,
        doi: doi,
        uri: uri,
        full_citation: [
          institution,
          authors_to_string(auths),
          year,
          title,
          locator,
          link
        ].reject(&:empty?).join(". ")
      }
    end

    ##
    # Format a list of author Hashes as a single string.
    #
    # The first author's values are joined with ", " in Hash order; every
    # following author's values are reversed and joined with a space.
    # Returns "" for an empty list. The argument is not modified.
    #
    def self.authors_to_string(auths)
      return "" if auths.empty?
      lead, *others = auths
      first = lead.values.join(", ")
      remaining = others.map { |a| a.values.reverse.join(" ") }.join(", ")
      [first, remaining].reject(&:empty?).join(", ")
    end

  end
end
@@ -0,0 +1,20 @@
1
class TaxPub
  ##
  # String helpers used by the parser classes.
  #
  class Utils

    ##
    # Normalize a piece of extracted text.
    #
    # Every whitespace character becomes a plain space, runs of whitespace
    # collapse to one space, leading/trailing whitespace is dropped and a
    # single trailing comma is removed. Returns a new String.
    #
    def self.clean_text(text)
      text.encode("UTF-8", :undef => :replace, :invalid => :replace, :replace => " ")
          .gsub(/[[:space:]]/, " ")
          .split
          .join(" ")
          # The comma is removed after collapsing whitespace so that input
          # such as "Ottawa,\n" loses it too; rstrip covers "Ottawa ,".
          .chomp(",")
          .rstrip
    end

    ##
    # Turn a bare DOI ("10.xxxx/...") into an https://doi.org/ URL.
    #
    # Anything that does not start with "10." is returned unchanged. The
    # argument is never modified, so frozen strings are accepted.
    #
    def self.expand_doi(doi)
      return doi unless doi.start_with?("10.")
      "https://doi.org/#{doi}"
    end

  end
end
@@ -0,0 +1,31 @@
1
+ require "uri"
2
+
3
class TaxPub
  ##
  # Argument checks used by TaxPub. Each method returns nil when the value
  # is acceptable and raises otherwise.
  #
  class Validator

    ##
    # Require +data+ to be a String holding an http or https URL with a host.
    #
    # Raises InvalidParameterValueError otherwise.
    #
    def self.validate_url(data)
      validate_type(data, 'String')
      # URI.parse replaces the obsolete URI.regexp; text that cannot be
      # parsed at all is treated like any other invalid URL.
      uri = begin
        URI.parse(data)
      rescue URI::InvalidURIError
        nil
      end
      # URI::HTTPS is a subclass of URI::HTTP, so this accepts both schemes.
      return if uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
      raise InvalidParameterValueError, "URL must be in the form http:// or https://"
    end

    ##
    # Require +data+ to be a Nokogiri::XML::Document.
    #
    # Raises InvalidTypeError otherwise, which is also what happens when an
    # accessor is called before the document has been parsed.
    #
    def self.validate_nokogiri(data)
      return if data.is_a?(Nokogiri::XML::Document)
      raise InvalidTypeError, "Must be a Nokogiri XML document or the parse method has not been executed"
    end

    ##
    # Require +data+ to match the type named by +type+.
    #
    # Supported names: 'String', 'Array', 'Integer', 'Hash', 'Boolean'
    # (exactly true or false) and 'File' (path of an existing regular file).
    # Any other name performs no check. Raises InvalidParameterValueError
    # on a mismatch.
    #
    def self.validate_type(data, type)
      case type
      when 'String', 'Array', 'Integer', 'Hash'
        raise InvalidParameterValueError, "Must be a #{type}" unless data.is_a?(Object.const_get(type))
      when 'Boolean'
        raise InvalidParameterValueError, "Must be a Boolean" unless [true, false].include?(data)
      when 'File'
        raise InvalidParameterValueError, "Must be a file path & file must exist" unless File.file?(data)
      end
    end

  end
end
@@ -0,0 +1,7 @@
1
class TaxPub
  # Release version of the gem. Frozen so the shared constant cannot be
  # modified in place by a caller.
  VERSION = "0.0.1".freeze

  ##
  # Returns the gem version string (VERSION).
  #
  def self.version
    VERSION
  end
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: taxpub
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - David P. Shorthouse
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-07-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '11.1'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '11.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.4'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.4'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.10'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.10'
69
+ - !ruby/object:Gem::Dependency
70
+ name: byebug
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '9.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '9.0'
83
+ description: Parses TaxPub XML documents and adds methods to pull out conference data,
84
+ ranked taxa, occurrences, references, etc.
85
+ email: davidpshorthouse@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - README.rdoc
91
+ - lib/taxpub.rb
92
+ - lib/taxpub/exceptions.rb
93
+ - lib/taxpub/reference.rb
94
+ - lib/taxpub/utils.rb
95
+ - lib/taxpub/validator.rb
96
+ - lib/taxpub/version.rb
97
+ homepage: https://github.com/dshorthouse/taxpub
98
+ licenses:
99
+ - MIT
100
+ metadata: {}
101
+ post_install_message:
102
+ rdoc_options:
103
+ - "--encoding"
104
+ - UTF-8
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.6.12
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Parse TaxPub XML documents
123
+ test_files: []