stanford-mods-normalizer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0a0d58dd1d4b6d6fcfa74b8338e9af8d0d95b62aa8f7967ead17018fb2d616fe
4
+ data.tar.gz: 39381c4aefad607ed2602296e13904b606e102d3a6e5e7d18de3ca37380845dc
5
+ SHA512:
6
+ metadata.gz: e5226a45ff7ccf1d1f27b54082f35fe5c95ea12730961c35444fb79ded554ed943cabf8d9e30ac54b76006650779ec86fc33be294092bf1a07ffac859fb3ccbc
7
+ data.tar.gz: '09d6b33969eb6a38c24fa4696e9754cdcb105046cfa7ddec2990fba5d2cfef21423509718a0da68ea541d8f68d29c1660cb1896b3e71a255d667213814c6e0b8'
@@ -0,0 +1,5 @@
1
+ # rspec failure tracking
2
+ .rspec_status
3
+
4
+ Gemfile.lock
5
+ pkg/
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,8 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ Metrics/LineLength:
4
+ Max: 140
5
+
6
+ Metrics/BlockLength:
7
+ Exclude:
8
+ - 'spec/**/*_spec.rb'
@@ -0,0 +1,48 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2018-03-14 09:10:19 -0500 using RuboCop version 0.53.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 3
10
+ Metrics/AbcSize:
11
+ Max: 30
12
+
13
+ # Offense count: 2
14
+ # Configuration parameters: CountComments, ExcludedMethods.
15
+ Metrics/BlockLength:
16
+ Max: 116
17
+
18
+ # Offense count: 1
19
+ # Configuration parameters: CountComments.
20
+ Metrics/ClassLength:
21
+ Max: 110
22
+
23
+ # Offense count: 1
24
+ Metrics/CyclomaticComplexity:
25
+ Max: 8
26
+
27
+ # Offense count: 4
28
+ # Configuration parameters: CountComments.
29
+ Metrics/MethodLength:
30
+ Max: 14
31
+
32
+ # Offense count: 2
33
+ Metrics/PerceivedComplexity:
34
+ Max: 8
35
+
36
+ # Offense count: 1
37
+ # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
38
+ # AllowedNames: io, id
39
+ Naming/UncommunicativeMethodParamName:
40
+ Exclude:
41
+ - 'lib/stanford/mods/normalizer.rb'
42
+
43
+ # Offense count: 1
44
+ Style/Documentation:
45
+ Exclude:
46
+ - 'spec/**/*'
47
+ - 'test/**/*'
48
+ - 'lib/stanford/mods/normalizer.rb'
@@ -0,0 +1,4 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.6
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in stanford-mods-normalizer.gemspec
6
+ gemspec
@@ -0,0 +1,5 @@
1
+ [![Build Status](https://travis-ci.org/sul-dlss/mods_normalizer.svg?branch=master)](https://travis-ci.org/sul-dlss/mods_normalizer)
2
+
3
+ # Stanford::Mods::Normalizer
4
+
5
+ Provides methods to normalize MODS XML according to the Stanford guidelines
@@ -0,0 +1,14 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+ require 'rubocop/rake_task'
4
+
5
+ RSpec::Core::RakeTask.new(:spec)
6
+
7
+ desc 'Run style checker'
8
+ RuboCop::RakeTask.new(:rubocop) do |task|
9
+ task.fail_on_error = true
10
+ end
11
+
12
+ task default: :ci
13
+
14
+ task ci: %i[rubocop spec]
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stanford/mods/normalizer/version'
4
+
5
+ module Stanford
6
+ module Mods
7
+ class Normalizer
8
+ # Your code goes here...
9
+ require 'nokogiri'
10
+
11
+ # Linefeed character entity reference
12
+ LINEFEED = '&#10;'.freeze
13
+
14
+ # Select all single <dateCreated> and <dateIssued> fields
15
+ LONE_DATE_XPATH = '//mods:originInfo/mods:dateCreated[1][not(following-sibling::*[1][self::mods:dateCreated])]' \
16
+ ' | //mods:originInfo/mods:dateIssued[1][not(following-sibling::*[1][self::mods:dateIssued])]'.freeze
17
+
18
+ # Select all <dateCreated> and <dateIssued> fields
19
+ DATE_CREATED_ISSUED_XPATH = '//mods:dateCreated | //mods:dateIssued'.freeze
20
+
21
+ # The official MODS namespace, courtesy of the Library of Congress
22
+ MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'.freeze
23
+
24
+ # Selects <abstract>, <tableOfContents> and <note> when no namespace is present
25
+ LINEFEED_XPATH = '//abstract | //tableOfContents | //note'.freeze
26
+
27
+ # Selects <abstract>, <tableOfContents> and <note> when a namespace is present
28
+ LINEFEED_XPATH_NAMESPACED = '//ns:abstract | //ns:tableOfContents | //ns:note'.freeze
29
+
30
+ # Checks if a node has attributes that we make exceptions for. There are two such exceptions.
31
+ #
32
+ # * A "collection" attribute with the value "yes" <em>on a typeOfResource tag</em>.
33
+ # * A "manuscript" attribute with the value "yes" <em>on a typeOfResource tag</em>.
34
+ #
35
+ # Nodes that fall under any of these exceptions should not be deleted, even if they have no content.
36
+ #
37
+ # @param [Nokogiri::XML::Element] node An XML node.
38
+ # @return [Boolean] true if the node contains any of the exceptional attributes, false otherwise.
39
+ def exceptional?(node)
40
+ return false if node.nil?
41
+
42
+ tag = node.name
43
+ attributes = node.attributes
44
+
45
+ return false if attributes.empty?
46
+
47
+ attributes.each do |key, value|
48
+ next unless tag == 'typeOfResource'
49
+ # Note that according to the MODS schema, any other value than 'yes' for these attributes is invalid
50
+ if (key == 'collection' && value.to_s.casecmp('yes').zero?) ||
51
+ (key == 'manuscript' && value.to_s.casecmp('yes').zero?)
52
+ return true
53
+ end
54
+ end
55
+ false
56
+ end
57
+
58
+ # Recursive helper method for {Normalizer#clean_linefeeds} to do string substitution.
59
+ #
60
+ # @param [Nokogiri::XML::Element] node An XML node
61
+ # @return [String] A string composed of the entire contents of the given node,
62
+ # with substitutions made as described for {#clean_linefeeds}.
63
+ def substitute_linefeeds(node)
64
+ new_text = ''
65
+
66
+ # If we substitute in '&#10;' by itself, Nokogiri interprets that and then prints '&amp;#10;' when printing the document later. This
67
+ # is an ugly way to add linefeed characters in a way that we at least get well-formatted output in the end.
68
+ if node.text?
69
+ new_text = node.content.gsub(/(\r\n|\n|\r|\\n)/, Nokogiri::HTML(LINEFEED).text)
70
+ else
71
+ if node.node_name == 'br'
72
+ new_text += Nokogiri::HTML(LINEFEED).text
73
+ elsif node.node_name == 'p'
74
+ new_text += Nokogiri::HTML(LINEFEED).text + Nokogiri::HTML(LINEFEED).text
75
+ end
76
+
77
+ node.children.each do |c|
78
+ new_text += substitute_linefeeds(c)
79
+ end
80
+ end
81
+ new_text
82
+ end
83
+
84
+ # Given the root of an XML document, replaces linefeed characters inside <tableOfContents>, <abstract> and <note> XML nodes with &#10;
85
+ # \n, \r, <br> and <br/> are all replaced by a single &#10;
86
+ # <p> is replaced by two &#10;
87
+ # </p> is removed
88
+ # \r\n is replaced by &#10;
89
+ # Any tags not listed above are removed. MODS 3.5 does not allow for anything other than text inside these three nodes.
90
+ #
91
+ # @param [Nokogiri::XML::NodeSet] node_list All <tableOfContents>, <abstract> and <note> elements.
92
+ # @return [Void] This method doesn't return anything, but introduces
93
+ # UTF-8 linefeed characters in place, as described above.
94
+ def clean_linefeeds(node_list)
95
+ node_list.each do |current_node|
96
+ new_text = substitute_linefeeds(current_node)
97
+ current_node.children.remove
98
+ current_node.content = new_text
99
+ end
100
+ end
101
+
102
+ # Cleans up the text of a node:
103
+ #
104
+ # * Removes extra whitespace at the beginning and end.
105
+ # * Collapses any run of consecutive whitespace within the string into a single space.
106
+ #
107
+ # @param [String] s The text of an XML node.
108
+ # @return [String] The cleaned string, as described. Returns nil if the input is nil, or if the input is an empty string.
109
+ def clean_text(s)
110
+ return nil unless !s.nil? && s != ''
111
+ s.gsub(/\s+/, ' ').strip
112
+ end
113
+
114
+ # Removes empty attributes from a given node.
115
+ #
116
+ # @param [Nokogiri::XML::Element] node An XML node.
117
+ # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
118
+ def remove_empty_attributes(node)
119
+ children = node.children
120
+ attributes = node.attributes
121
+
122
+ attributes.each do |key, value|
123
+ node.remove_attribute(key) if value.to_s.strip.empty?
124
+ end
125
+
126
+ children.each do |c|
127
+ remove_empty_attributes(c)
128
+ end
129
+ end
130
+
131
+ # Removes empty nodes from an XML tree. See {#exceptional?} for nodes that are kept even if empty.
132
+ #
133
+ # @param [Nokogiri::XML::Element] node An XML node.
134
+ # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
135
+ def remove_empty_nodes(node)
136
+ children = node.children
137
+
138
+ if node.text?
139
+ return node.remove if node.to_s.strip.empty?
140
+ return
141
+ elsif !children.empty?
142
+ children.each do |c|
143
+ remove_empty_nodes(c)
144
+ end
145
+ end
146
+
147
+ node.remove if !exceptional?(node) && node.children.empty?
148
+ end
149
+
150
+ # Removes leading and trailing spaces from a node.
151
+ #
152
+ # @param [Nokogiri::XML::Element] node An XML node.
153
+ # @return [Void] This method doesn't return anything, but modifies the entire XML tree starting at the
154
+ # given node, removing leading and trailing spaces from all text. If the input is nil,
155
+ # an exception will be raised.
156
+ def trim_text(node)
157
+ children = node.children
158
+
159
+ if node.text?
160
+ node.parent.content = node.text.strip
161
+ else
162
+ children.each do |c|
163
+ trim_text(c)
164
+ end
165
+ end
166
+ end
167
+
168
+ # Sometimes there are spurious decimal digits within the date fields. This method removes a trailing decimal point and the digits after it within
169
+ # <dateCreated> and <dateIssued>.
170
+ #
171
+ # @param [Nokogiri::XML::NodeSet] nodes A set of all affected <dateCreated> and <dateIssued> elements.
172
+ # @return [Void] The given document is modified in place.
173
+ def clean_date_values(nodes)
174
+ nodes.each do |current_node|
175
+ current_node.content = current_node.content.sub(/(.*)\.\d+$/, '\1')
176
+ end
177
+ end
178
+
179
+ # Normalizes the given MODS XML document according to the Stanford guidelines.
180
+ #
181
+ # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
182
+ # @return [Void] The given document is modified in place.
183
+ def normalize_mods_document(root)
184
+ node_list = if root.namespace.nil?
185
+ root.xpath(LINEFEED_XPATH)
186
+ else
187
+ root.xpath(LINEFEED_XPATH_NAMESPACED, 'ns' => root.namespace.href)
188
+ end
189
+ clean_linefeeds(node_list) # Do this before deleting <br> and <p> with remove_empty_nodes()
190
+
191
+ remove_empty_attributes(root)
192
+ remove_empty_nodes(root)
193
+ trim_text(root)
194
+ clean_date_values(root.xpath(DATE_CREATED_ISSUED_XPATH, 'mods' => MODS_NAMESPACE))
195
+ end
196
+
197
+ # Normalizes the given MODS XML document according to the Stanford guidelines.
198
+ #
199
+ # @deprecated Use normalize_mods_document instead.
200
+ # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
201
+ # @return [Void] The given document is modified in place.
202
+ def normalize_document(root)
203
+ normalize_mods_document(root)
204
+ end
205
+
206
+ # Normalizes the given XML document string according to the Stanford guidelines.
207
+ #
208
+ # @param [String] xml_string An XML document
209
+ # @return [String] The XML string, with normalizations applied.
210
+ def normalize_xml_string(xml_string)
211
+ doc = Nokogiri::XML(xml_string)
212
+ normalize_document(doc.root)
213
+ doc.to_s
214
+ end
215
+ end
216
+ end
217
+ end
@@ -0,0 +1,7 @@
1
+ module Stanford
2
+ module Mods
3
+ class Normalizer
4
+ VERSION = '0.1.0'.freeze
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,30 @@
1
+
2
+ lib = File.expand_path('lib', __dir__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'stanford/mods/normalizer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'stanford-mods-normalizer'
8
+ spec.version = Stanford::Mods::Normalizer::VERSION
9
+ spec.authors = ['Justin Coyne']
10
+ spec.email = ['jcoyne@justincoyne.com']
11
+
12
+ spec.summary = 'Provides methods to normalize MODS XML according to the Stanford guidelines '
13
+ spec.homepage = 'https://github.com/sul-dlss/mods_normalizer'
14
+
15
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
16
+ f.match(%r{^(test|spec|features)/})
17
+ end
18
+ spec.bindir = 'exe'
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ['lib']
21
+
22
+ spec.add_dependency 'nokogiri', '~> 1.8'
23
+ spec.add_development_dependency 'rubocop', '~> 0.53'
24
+ spec.add_development_dependency 'rubocop-rspec', '~> 0.18'
25
+
26
+ spec.add_development_dependency 'bundler', '~> 1.16'
27
+ spec.add_development_dependency 'equivalent-xml', '>= 0.6.0'
28
+ spec.add_development_dependency 'rake', '~> 10.0'
29
+ spec.add_development_dependency 'rspec', '~> 3.0'
30
+ end
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: stanford-mods-normalizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Justin Coyne
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-03-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubocop
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.53'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.53'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rubocop-rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.18'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.18'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.16'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.16'
69
+ - !ruby/object:Gem::Dependency
70
+ name: equivalent-xml
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 0.6.0
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 0.6.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ description:
112
+ email:
113
+ - jcoyne@justincoyne.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".gitignore"
119
+ - ".rspec"
120
+ - ".rubocop.yml"
121
+ - ".rubocop_todo.yml"
122
+ - ".travis.yml"
123
+ - Gemfile
124
+ - README.md
125
+ - Rakefile
126
+ - lib/stanford/mods/normalizer.rb
127
+ - lib/stanford/mods/normalizer/version.rb
128
+ - stanford-mods-normalizer.gemspec
129
+ homepage: https://github.com/sul-dlss/mods_normalizer
130
+ licenses: []
131
+ metadata: {}
132
+ post_install_message:
133
+ rdoc_options: []
134
+ require_paths:
135
+ - lib
136
+ required_ruby_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ required_rubygems_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ requirements: []
147
+ rubyforge_project:
148
+ rubygems_version: 2.7.1
149
+ signing_key:
150
+ specification_version: 4
151
+ summary: Provides methods to normalize MODS XML according to the Stanford guidelines
152
+ test_files: []