RubyGems - stanford-mods-normalizer - Versions diffs - 0.1.0 - Mend

stanford-mods-normalizer 0.1.0

Files changed (13) hide show

checksums.yaml +7 -0
data/.gitignore +5 -0
data/.rspec +3 -0
data/.rubocop.yml +8 -0
data/.rubocop_todo.yml +48 -0
data/.travis.yml +4 -0
data/Gemfile +6 -0
data/README.md +5 -0
data/Rakefile +14 -0
data/lib/stanford/mods/normalizer.rb +217 -0
data/lib/stanford/mods/normalizer/version.rb +7 -0
data/stanford-mods-normalizer.gemspec +30 -0
metadata +152 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 0a0d58dd1d4b6d6fcfa74b8338e9af8d0d95b62aa8f7967ead17018fb2d616fe
+  data.tar.gz: 39381c4aefad607ed2602296e13904b606e102d3a6e5e7d18de3ca37380845dc
+SHA512:
+  metadata.gz: e5226a45ff7ccf1d1f27b54082f35fe5c95ea12730961c35444fb79ded554ed943cabf8d9e30ac54b76006650779ec86fc33be294092bf1a07ffac859fb3ccbc
+  data.tar.gz: '09d6b33969eb6a38c24fa4696e9754cdcb105046cfa7ddec2990fba5d2cfef21423509718a0da68ea541d8f68d29c1660cb1896b3e71a255d667213814c6e0b8'

data/.gitignore ADDED

@@ -0,0 +1,5 @@
+# rspec failure tracking
+.rspec_status
+Gemfile.lock
+pkg/

data/.rspec ADDED

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.rubocop.yml ADDED

@@ -0,0 +1,8 @@
+inherit_from: .rubocop_todo.yml
+Metrics/LineLength:
+  Max: 140
+Metrics/BlockLength:
+  Exclude:
+    - 'spec/**/*_spec.rb'

data/.rubocop_todo.yml ADDED

@@ -0,0 +1,48 @@
+# This configuration was generated by
+# `rubocop --auto-gen-config`
+# on 2018-03-14 09:10:19 -0500 using RuboCop version 0.53.0.
+# The point is for the user to remove these configuration records
+# one by one as the offenses are removed from the code base.
+# Note that changes in the inspected code, or installation of new
+# versions of RuboCop, may require this file to be generated again.
+# Offense count: 3
+Metrics/AbcSize:
+  Max: 30
+# Offense count: 2
+# Configuration parameters: CountComments, ExcludedMethods.
+Metrics/BlockLength:
+  Max: 116
+# Offense count: 1
+# Configuration parameters: CountComments.
+Metrics/ClassLength:
+  Max: 110
+# Offense count: 1
+Metrics/CyclomaticComplexity:
+  Max: 8
+# Offense count: 4
+# Configuration parameters: CountComments.
+Metrics/MethodLength:
+  Max: 14
+# Offense count: 2
+Metrics/PerceivedComplexity:
+  Max: 8
+# Offense count: 1
+# Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
+# AllowedNames: io, id
+Naming/UncommunicativeMethodParamName:
+  Exclude:
+    - 'lib/stanford/mods/normalizer.rb'
+# Offense count: 1
+Style/Documentation:
+  Exclude:
+    - 'spec/**/*'
+    - 'test/**/*'
+    - 'lib/stanford/mods/normalizer.rb'

data/.travis.yml ADDED

@@ -0,0 +1,4 @@
+sudo: false
+language: ruby
+rvm:
+  - 2.3.6

data/Gemfile ADDED

@@ -0,0 +1,6 @@
+source 'https://rubygems.org'
+git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
+# Specify your gem's dependencies in stanford-mods-normalizer.gemspec
+gemspec

data/README.md ADDED

@@ -0,0 +1,5 @@
+[![Build Status](https://travis-ci.org/sul-dlss/mods_normalizer.svg?branch=master)](https://travis-ci.org/sul-dlss/mods_normalizer)
+# Stanford::Mods::Normalizer
+Provides methods to normalize MODS XML according to the Stanford guidelines

data/Rakefile ADDED

@@ -0,0 +1,14 @@
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+require 'rubocop/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+desc 'Run style checker'
+RuboCop::RakeTask.new(:rubocop) do |task|
+  task.fail_on_error = true
+end
+task default: :ci
+task ci: %i[rubocop spec]

data/lib/stanford/mods/normalizer.rb ADDED

@@ -0,0 +1,217 @@
+# frozen_string_literal: true
+require 'stanford/mods/normalizer/version'
+module Stanford
+  module Mods
+    class Normalizer
+      # Nokogiri provides the XML parsing, XPath queries and node manipulation used by the methods of this class.
+      require 'nokogiri'
+      # Numeric character reference for a linefeed; used by substitute_linefeeds to produce linefeed characters
+      LINEFEED = '&#10;'.freeze
+      # Selects the first <dateCreated>/<dateIssued> of an <originInfo> when its next sibling element is not of the same name
+      LONE_DATE_XPATH = '//mods:originInfo/mods:dateCreated[1][not(following-sibling::*[1][self::mods:dateCreated])]' \
+                        ' | //mods:originInfo/mods:dateIssued[1][not(following-sibling::*[1][self::mods:dateIssued])]'.freeze
+      # Selects every <dateCreated> and <dateIssued> element under the 'mods' prefix, anywhere in the document
+      DATE_CREATED_ISSUED_XPATH = '//mods:dateCreated | //mods:dateIssued'.freeze
+      # The official MODS namespace, courtesy of the Library of Congress; bound to the 'mods' prefix in XPath queries
+      MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'.freeze
+      # Selects <abstract>, <tableOfContents> and <note> elements when the document root has no namespace
+      LINEFEED_XPATH = '//abstract | //tableOfContents | //note'.freeze
+      # Selects the same three elements when the root has a namespace; the caller binds 'ns' to that namespace's URI
+      LINEFEED_XPATH_NAMESPACED = '//ns:abstract | //ns:tableOfContents | //ns:note'.freeze
+      # Checks if a node has attributes that we make exceptions for. There are two such exceptions.
+      #
+      # * A "collection" attribute with the value "yes" <em>on a typeOfResource tag</em>.
+      # * A "manuscript" attribute with the value "yes" <em>on a typeOfResource tag</em>.
+      #
+      # Nodes that fall under any of these exceptions should not be deleted, even if they have no content.
+      #
+      # @param  [Nokogiri::XML::Element]   node    An XML node.
+      # @return [Boolean]                  true if the node contains any of the exceptional attributes, false otherwise.
+      def exceptional?(node)
+        return false if node.nil?
+        tag = node.name
+        attributes = node.attributes
+        return false if attributes.empty?
+        attributes.each do |key, value|
+          next unless tag == 'typeOfResource'
+          # Note that according to the MODS schema, any other value than 'yes' for these attributes is invalid
+          if (key == 'collection' && value.to_s.casecmp('yes').zero?) ||
+             (key == 'manuscript' && value.to_s.casecmp('yes').zero?)
+            return true
+          end
+        end
+        false
+      end
+      # Recursive helper method for {Normalizer#clean_linefeeds} to do string substitution.
+      #
+      # @param [Nokogiri::XML::Element]   node   An XML node
+      # @return [String]                  A string composed of the entire contents of the given node,
+      #                                   with substitutions made as described for {#clean_linefeeds}.
+      def substitute_linefeeds(node)
+        new_text = ''
+        # If we substitute in '&#10;' by itself, Nokogiri interprets that and then prints '&amp;#10;' when printing the document later. This
+        # is an ugly way to add linefeed characters in a way that we at least get well-formatted output in the end.
+        if node.text?
+          new_text = node.content.gsub(/(\r\n|\n|\r|\\n)/, Nokogiri::HTML(LINEFEED).text)
+        else
+          if node.node_name == 'br'
+            new_text += Nokogiri::HTML(LINEFEED).text
+          elsif node.node_name == 'p'
+            new_text += Nokogiri::HTML(LINEFEED).text + Nokogiri::HTML(LINEFEED).text
+          end
+          node.children.each do |c|
+            new_text += substitute_linefeeds(c)
+          end
+        end
+        new_text
+      end
+      # Given a set of <tableOfContents>, <abstract> and <note> nodes, replaces the linefeed characters inside each node with &#10;
+      # \n, \r, <br> and <br/> are all replaced by a single &#10;
+      # <p> is replaced by two &#10;
+      # </p> is removed
+      # \r\n is replaced by &#10;
+      # Any tags not listed above are removed. MODS 3.5 does not allow for anything other than text inside these three nodes.
+      #
+      # @param   [Nokogiri::XML::NodeSet]    node_list  All <tableOfContents>, <abstract> and <note> elements.
+      # @return  [Void]                      This method doesn't return anything, but introduces
+      #                                      UTF-8 linefeed characters in place, as described above.
+      def clean_linefeeds(node_list)
+        node_list.each do |current_node|
+          new_text = substitute_linefeeds(current_node)
+          current_node.children.remove
+          current_node.content = new_text
+        end
+      end
+      # Cleans up the text of a node:
+      #
+      # * Removes extra whitespace at the beginning and end.
+      # * Removes any consecutive whitespace within the string.
+      #
+      # @param [String]   s   The text of an XML node.
+      # @return [String]  The cleaned string, as described. Returns nil if the input is nil, or if the input is an empty string.
+      def clean_text(s)
+        return nil unless !s.nil? && s != ''
+        s.gsub(/\s+/, ' ').strip
+      end
+      # Removes empty attributes from a given node.
+      #
+      # @param [Nokogiri::XML::Element]   node An XML node.
+      # @return [Void]                    This method doesn't return anything, but modifies the XML tree starting at the given node.
+      def remove_empty_attributes(node)
+        children = node.children
+        attributes = node.attributes
+        attributes.each do |key, value|
+          node.remove_attribute(key) if value.to_s.strip.empty?
+        end
+        children.each do |c|
+          remove_empty_attributes(c)
+        end
+      end
+      # Removes empty nodes from an XML tree. See {#exceptional?} for nodes that are kept even if empty.
+      #
+      # @param  [Nokogiri::XML::Element]   node An XML node.
+      # @return [Void]                     This method doesn't return anything, but modifies the XML tree starting at the given node.
+      def remove_empty_nodes(node)
+        children = node.children
+        if node.text?
+          return node.remove if node.to_s.strip.empty?
+          return
+        elsif !children.empty?
+          children.each do |c|
+            remove_empty_nodes(c)
+          end
+        end
+        node.remove if !exceptional?(node) && node.children.empty?
+      end
+      # Removes leading and trailing spaces from a node.
+      #
+      # @param  [Nokogiri::XML::Element]  node An XML node.
+      # @return [Void]                    This method doesn't return anything, but modifies the entire XML tree starting at
+      #                                   the given node, removing leading and trailing spaces from all text. If the input is nil,
+      #                                   an exception will be raised.
+      def trim_text(node)
+        children = node.children
+        if node.text?
+          node.parent.content = node.text.strip
+        else
+          children.each do |c|
+            trim_text(c)
+          end
+        end
+      end
+      # Sometimes there are spurious decimal digits within the date fields. This method removes a trailing decimal point,
+      # together with the digits that follow it, from <dateCreated> and <dateIssued>.
+      #
+      # @param [Nokogiri::XML::NodeSet]   nodes  A set of all affected <dateCreated> and <dateIssued> elements.
+      # @return [Void]                    The given document is modified in place.
+      def clean_date_values(nodes)
+        nodes.each do |current_node|
+          current_node.content = current_node.content.sub(/(.*)\.\d+$/, '\1')
+        end
+      end
+      # Normalizes the given MODS XML document according to the Stanford guidelines.
+      #
+      # @param  [Nokogiri::XML::Element]  root  The root of a MODS XML document.
+      # @return [Void]                    The given document is modified in place.
+      def normalize_mods_document(root)
+        node_list = if root.namespace.nil?
+                      root.xpath(LINEFEED_XPATH)
+                    else
+                      root.xpath(LINEFEED_XPATH_NAMESPACED, 'ns' => root.namespace.href)
+                    end
+        clean_linefeeds(node_list) # Do this before deleting <br> and <p> with remove_empty_nodes()
+        remove_empty_attributes(root)
+        remove_empty_nodes(root)
+        trim_text(root)
+        clean_date_values(root.xpath(DATE_CREATED_ISSUED_XPATH, 'mods' => MODS_NAMESPACE))
+      end
+      # Deprecated wrapper that passes the given root unchanged to {#normalize_mods_document}.
+      #
+      # @deprecated Use {#normalize_mods_document} instead.
+      # @param  [Nokogiri::XML::Element]  root  The root of a MODS XML document.
+      # @return [Void]                    The given document is modified in place.
+      def normalize_document(root)
+        normalize_mods_document(root)
+      end
+      # Normalizes the given XML document string according to the Stanford guidelines.
+      #
+      # @param  [String]   xml_string    An XML document
+      # @return [String]                 The XML string, with normalizations applied.
+      def normalize_xml_string(xml_string)
+        doc = Nokogiri::XML(xml_string)
+        normalize_document(doc.root)
+        doc.to_s
+      end
+    end
+  end
+end

data/lib/stanford/mods/normalizer/version.rb ADDED

@@ -0,0 +1,7 @@
+module Stanford
+  module Mods
+    class Normalizer
+      VERSION = '0.1.0'.freeze
+    end
+  end
+end

data/stanford-mods-normalizer.gemspec ADDED

@@ -0,0 +1,30 @@
+lib = File.expand_path('lib', __dir__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'stanford/mods/normalizer/version'
+Gem::Specification.new do |spec|
+  spec.name          = 'stanford-mods-normalizer'
+  spec.version       = Stanford::Mods::Normalizer::VERSION
+  spec.authors       = ['Justin Coyne']
+  spec.email         = ['jcoyne@justincoyne.com']
+  spec.summary       = 'Provides methods to normalize MODS XML according to the Stanford guidelines '
+  spec.homepage      = 'https://github.com/sul-dlss/mods_normalizer'
+  spec.files = `git ls-files -z`.split("\x0").reject do |f|
+    f.match(%r{^(test|spec|features)/})
+  end
+  spec.bindir        = 'exe'
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+  spec.add_dependency 'nokogiri', '~> 1.8'
+  spec.add_development_dependency 'rubocop', '~> 0.53'
+  spec.add_development_dependency 'rubocop-rspec', '~> 0.18'
+  spec.add_development_dependency 'bundler', '~> 1.16'
+  spec.add_development_dependency 'equivalent-xml', '>= 0.6.0'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
+end

metadata ADDED

@@ -0,0 +1,152 @@
+--- !ruby/object:Gem::Specification
+name: stanford-mods-normalizer
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Justin Coyne
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2018-03-14 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+- !ruby/object:Gem::Dependency
+  name: rubocop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.53'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.53'
+- !ruby/object:Gem::Dependency
+  name: rubocop-rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.18'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.18'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+- !ruby/object:Gem::Dependency
+  name: equivalent-xml
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.6.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.6.0
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+description:
+email:
+- jcoyne@justincoyne.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".rubocop.yml"
+- ".rubocop_todo.yml"
+- ".travis.yml"
+- Gemfile
+- README.md
+- Rakefile
+- lib/stanford/mods/normalizer.rb
+- lib/stanford/mods/normalizer/version.rb
+- stanford-mods-normalizer.gemspec
+homepage: https://github.com/sul-dlss/mods_normalizer
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.7.1
+signing_key:
+specification_version: 4
+summary: Provides methods to normalize MODS XML according to the Stanford guidelines
+test_files: []