RubyGems - modsulator - Versions diffs - 0.0.6 - Mend

modsulator 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +7 -0
data/LICENSE +202 -0
data/README.md +7 -0
data/Rakefile +43 -0
data/bin/modsulator +16 -0
data/lib/doc/Modsulator.html +725 -0
data/lib/doc/_index.html +101 -0
data/lib/doc/class_list.html +58 -0
data/lib/doc/css/common.css +1 -0
data/lib/doc/css/full_list.css +57 -0
data/lib/doc/css/style.css +339 -0
data/lib/doc/file_list.html +57 -0
data/lib/doc/frames.html +26 -0
data/lib/doc/index.html +101 -0
data/lib/doc/js/app.js +219 -0
data/lib/doc/js/full_list.js +181 -0
data/lib/doc/js/jquery.js +4 -0
data/lib/doc/method_list.html +87 -0
data/lib/doc/top-level-namespace.html +112 -0
data/lib/modsulator.rb +193 -0
data/lib/modsulator/mods-3-5.xsd +1463 -0
data/lib/modsulator/modsulator.xsd +35 -0
data/lib/modsulator/modsulator_sheet.rb +62 -0
data/lib/modsulator/modsulator_template.xlsx +0 -0
data/lib/modsulator/modsulator_template.xml +198 -0
data/lib/modsulator/normalizer.rb +218 -0
data/lib/modsulator/validator.rb +37 -0
data/spec/features/modsulator_sheet_unit_spec.rb +18 -0
data/spec/features/normalizer_unit_spec.rb +152 -0
data/spec/features/process_template_spec.rb +29 -0
data/spec/features/validator_unit_spec.rb +43 -0
data/spec/fixtures/Fitch_Chavez.xlsx +0 -0
data/spec/fixtures/Fitch_Chavez.xml +2806 -0
data/spec/fixtures/Fitch_King.xlsx +0 -0
data/spec/fixtures/Fitch_King.xml +2560 -0
data/spec/fixtures/M1463_AV_manifest.xlsx +0 -0
data/spec/fixtures/M1463_AV_manifest.xml +2373 -0
data/spec/fixtures/Matter_manifest.csv +295 -0
data/spec/fixtures/Matter_manifest.xml +3309 -0
data/spec/fixtures/PosadaSpreadsheet.xlsx +0 -0
data/spec/fixtures/PosadaSpreadsheet.xml +1259 -0
data/spec/fixtures/ars0056_manifest.csv +1 -0
data/spec/fixtures/ars0056_manifest.xml +9728 -0
data/spec/fixtures/crowdsourcing_bridget_1.xlsx +0 -0
data/spec/fixtures/crowdsourcing_bridget_1.xml +606 -0
data/spec/fixtures/crowdsourcing_bridget_2.xlsx +0 -0
data/spec/fixtures/crowdsourcing_bridget_2.xml +3433 -0
data/spec/fixtures/filled_template.xlsx +0 -0
data/spec/fixtures/invalid_crowdsourcing_bridget_1.xml +606 -0
data/spec/fixtures/manifest_v0174.csv +34 -0
data/spec/fixtures/manifest_v0174.xml +1026 -0
data/spec/fixtures/roman_coins_mods_manifest.csv +176 -0
data/spec/fixtures/roman_coins_mods_manifest.xml +3025 -0
data/spec/fixtures/test:002.xml +63 -0
data/spec/fixtures/test_002.csv +4 -0
data/spec/fixtures/test_002.xlsx +0 -0
data/spec/integration_tests/integration_spec.rb +40 -0
data/spec/lib/modsulator_spec.rb +21 -0
data/spec/spec_helper.rb +97 -0
metadata +261 -0

data/lib/modsulator/modsulator.xsd ADDED Viewed

@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" targetNamespace="http://library.stanford.edu/xmlDocs" xmlns="http://library.stanford.edu/xmlDocs" elementFormDefault="qualified" attributeFormDefault="unqualified">
+  <!-- This XML Schema Definition defines the "xmlDocs" vocabulary that is produced by the MODSulator. -->
+  <!-- The main content of xmlDocs is full MODS 3.5 XML, as defined by the Library of Congress. Import their schema definition. -->
+  <xs:import namespace="http://www.loc.gov/mods/v3" schemaLocation="mods-3-5.xsd"/>
+  <!-- Document element: any number of xmlDoc children (including none), plus the attributes of docsAttributeGroup. -->
+  <xs:element name="xmlDocs">
+    <xs:complexType>
+      <xs:sequence minOccurs="0" maxOccurs="unbounded">
+        <!-- NOTE(review): declared with name= and no type, i.e. a local element declaration rather than a
+             ref= to the global xmlDoc element declared below. Confirm whether ref="xmlDoc" was intended. -->
+        <xs:element name="xmlDoc"/>
+      </xs:sequence>
+      <xs:attributeGroup ref="docsAttributeGroup"/>
+    </xs:complexType>
+  </xs:element>
+  <!-- Global declaration of a single document wrapper: exactly one mods child, plus the attributes of docAttributeGroup. -->
+  <xs:element name="xmlDoc">
+    <xs:complexType>
+      <xs:sequence>
+        <!-- NOTE(review): also a local declaration with no type and no ref= to the imported MODS schema;
+             with elementFormDefault="qualified" it is declared in this schema's target namespace. Confirm intent. -->
+        <xs:element name="mods"/>
+      </xs:sequence>
+      <xs:attributeGroup ref="docAttributeGroup"/>
+    </xs:complexType>
+  </xs:element>
+  <!-- Optional string attributes carried by xmlDocs. -->
+  <xs:attributeGroup name="docsAttributeGroup">
+    <xs:attribute name="datetime" type="xs:string"/>
+    <xs:attribute name="sourceFile" type="xs:string"/>
+  </xs:attributeGroup>
+  <!-- Optional string attributes carried by xmlDoc. -->
+  <xs:attributeGroup name="docAttributeGroup">
+      <xs:attribute name="id" type="xs:string"/>
+      <xs:attribute name="objectId" type="xs:string"/>
+  </xs:attributeGroup>
+</xs:schema>

data/lib/modsulator/modsulator_sheet.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# File "modsulator_sheet.rb" - a class to load and validate metadata spreadsheets (.xlsx or .csv) for input
+# to Modsulator.
+require 'json'
+require 'roo'
+# This class provides methods to parse Stanford's MODS spreadsheets into either an array of hashes, or a JSON string.
+class ModsulatorSheet
+  attr_reader :file, :filename
+  # Creates a new ModsulatorSheet. When called with temporary files, the filename must be specified separately, hence the
+  # second argument.
+  # @param [File]    file        The input spreadsheet
+  # @param [String]  filename    The filename of the input spreadsheet.
+  def initialize(file, filename)
+    @file = file
+    @filename = filename
+  end
+  # Loads the input spreadsheet into an array of hashes. This spreadsheet should conform to the Stanford MODS template format,
+  # which has three header rows. The first row is a kind of "super header", the second row is an intermediate header and the
+  # third row is the header row that names the fields. The data rows are in the fourth row onwards.
+  #
+  # @return [Array<Hash>]      An array with one entry per data row in the spreadsheet. Each entry is a hash, indexed by
+  #                            the spreadsheet headers.
+  def rows
+    # Parse the spreadsheet, automatically finding the header row by looking for "druid" and "sourceId" and leave the
+    # header row itself out of the resulting array. Everything preceding the header row is discarded. Would like to use
+    # clean: true here, but the latest release of Roo 1.13.2 crashes. 2.0.0beta1 seems to work though.
+    @rows ||= spreadsheet.parse(header_search: ['druid', 'sourceId']).drop(1)
+  end
+  # Opens a spreadsheet based on its filename extension.
+  #
+  # @return [Roo::CSV, Roo::Excel, Roo::Excelx]   A Roo object, whose type depends on the extension of the given filename.
+  def spreadsheet
+    @spreadsheet ||= case File.extname(@filename)
+                     when '.csv' then Roo::Spreadsheet.open(@file, extension: :csv)
+                     when '.xls' then Roo::Spreadsheet.open(@file, extension: :xls)
+                     when '.xlsx' then Roo::Spreadsheet.open(@file, extension: :xlsx)
+                     else fail "Unknown file type: #{@filename}"
+    end
+  end
+  # Get the headers used in the spreadsheet
+  def headers
+    rows.first.keys
+  end
+  # Convert the loaded spreadsheet to a JSON string.
+  # @return [String]  A JSON string.
+  def to_json
+    json_hash = {}
+    json_hash['filename'] = File.basename(filename)
+    json_hash['rows'] = rows
+    json_hash.to_json
+  end
+end

data/lib/modsulator/modsulator_template.xlsx ADDED Viewed

Binary file

data/lib/modsulator/modsulator_template.xml ADDED Viewed

@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mods xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" version="3.5" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-5.xsd">
+  <titleInfo authority="[[ti1:authority]]" valueURI="[[ti1:valueURI]]">
+    <nonSort>[[ti1:nonSort]]</nonSort>
+    <title>[[ti1:title]]</title>
+    <subTitle>[[ti1:subTitle]]</subTitle>
+    <partNumber>[[ti1:partNumber]]</partNumber>
+    <partName>[[ti1:partName]]</partName>
+  </titleInfo>
+  <titleInfo type="[[ti2:type]]" authority="[[ti2:authority]]" valueURI="[[ti2:valueURI]]">
+    <nonSort>[[ti2:nonSort]]</nonSort>
+    <title>[[ti2:title]]</title>
+    <subTitle>[[ti2:subTitle]]</subTitle>
+    <partNumber>[[ti2:partNumber]]</partNumber>
+    <partName>[[ti2:partName]]</partName>
+  </titleInfo>
+  <titleInfo type="[[ti3:type]]" authority="[[ti3:authority]]" valueURI="[[ti3:valueURI]]">
+    <nonSort>[[ti3:nonSort]]</nonSort>
+    <title>[[ti3:title]]</title>
+    <subTitle>[[ti3:subTitle]]</subTitle>
+    <partNumber>[[ti3:partNumber]]</partNumber>
+    <partName>[[ti3:partName]]</partName>
+  </titleInfo>
+  <name type="[[na1:type]]" authority="[[na1:authority]]" valueURI="[[na1:valueURI]]">
+    <namePart>[[na1:namePart]]</namePart>
+    <role>
+      <roleTerm type="code" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleCode]]</roleTerm>
+      <roleTerm type="text" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleText]]</roleTerm>
+    </role>
+  </name>
+  <name type="[[na2:type]]" authority="[[na2:authority]]" valueURI="[[na2:valueURI]]">
+    <namePart>[[na2:namePart]]</namePart>
+    <role>
+      <roleTerm type="code" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleCode]]</roleTerm>
+      <roleTerm type="text" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleText]]</roleTerm>
+    </role>
+  </name>
+  <name type="[[na3:type]]" authority="[[na3:authority]]" valueURI="[[na3:valueURI]]">
+    <namePart>[[na3:namePart]]</namePart>
+    <role>
+      <roleTerm type="code" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleCode]]</roleTerm>
+      <roleTerm type="text" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleText]]</roleTerm>
+    </role>
+  </name>
+  <name type="[[na4:type]]" authority="[[na4:authority]]" valueURI="[[na4:valueURI]]">
+    <namePart>[[na4:namePart]]</namePart>
+    <role>
+      <roleTerm type="code" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleCode]]</roleTerm>
+      <roleTerm type="text" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleText]]</roleTerm>
+    </role>
+  </name>
+  <name type="[[na5:type]]" authority="[[na5:authority]]" valueURI="[[na5:valueURI]]">
+    <namePart>[[na5:namePart]]</namePart>
+    <role>
+      <roleTerm type="code" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleCode]]</roleTerm>
+      <roleTerm type="text" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleText]]</roleTerm>
+    </role>
+  </name>
+  <typeOfResource manuscript="[[ty1:manuscript]]">[[ty1:typeOfResource]]</typeOfResource>
+  <typeOfResource>[[ty2:typeOfResource]]</typeOfResource>
+  <typeOfResource>[[ty3:typeOfResource]]</typeOfResource>
+  <genre type="[[ge1:type]]" authority="[[ge1:authority]]" valueURI="[[ge1:valueURI]]">[[ge1:genre]]</genre>
+  <genre type="[[ge2:type]]" authority="[[ge2:authority]]" valueURI="[[ge2:valueURI]]">[[ge2:genre]]</genre>
+  <genre type="[[ge3:type]]" authority="[[ge3:authority]]" valueURI="[[ge3:valueURI]]">[[ge3:genre]]</genre>
+  <originInfo>
+    <place>
+      <placeTerm type="code" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeCode]]</placeTerm>
+      <placeTerm type="text" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeText]]</placeTerm>
+    </place>
+    <publisher>[[or:publisher]]</publisher>
+    <dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateCreated]]</dateCreated>
+    <dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateCreated2]]</dateCreated>
+    <dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateIssued]]</dateIssued>
+    <dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateIssued2]]</dateIssued>
+  </originInfo>
+  <language>
+    <languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="code">[[la1:code]]</languageTerm>
+    <languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="text">[[la1:text]]</languageTerm>
+  </language>
+  <language>
+    <languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="code">[[la2:code]]</languageTerm>
+    <languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="text">[[la2:text]]</languageTerm>
+  </language>
+  <language>
+    <languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="code">[[la3:code]]</languageTerm>
+    <languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="text">[[la3:text]]</languageTerm>
+  </language>
+  <physicalDescription>
+    <form>[[ph1:form]]</form>
+    <extent>[[ph1:extent]]</extent>
+    <digitalOrigin>[[ph1:digitalOrigin]]</digitalOrigin>
+    <note displayLabel="[[ph1:displayLabel]]">[[ph1:note]]</note>
+  </physicalDescription>
+  <abstract displayLabel="[[ab:displayLabel]]">[[ab:abstract]]</abstract>
+  <tableOfContents displayLabel="[[tc:displayLabel]]">[[tc:tableOfContents]]</tableOfContents>
+  <note type="[[no1:type]]" displayLabel="[[no1:displayLabel]]">[[no1:note]]</note>
+  <note type="[[no2:type]]" displayLabel="[[no2:displayLabel]]">[[no2:note]]</note>
+  <note type="[[no3:type]]" displayLabel="[[no3:displayLabel]]">[[no3:note]]</note>
+  <note type="[[no4:type]]" displayLabel="[[no4:displayLabel]]">[[no4:note]]</note>
+  <note type="[[no5:type]]" displayLabel="[[no5:displayLabel]]">[[no5:note]]</note>
+  <subject authority="[[sn1:authority]]">
+    <name type="[[sn1:p1:nameType]]" authority="[[sn1:p1:authority]]" valueURI="[[sn1:p1:valueURI]]">
+      <namePart>[[sn1:p1:name]]</namePart>
+    </name>
+    <titleInfo>
+       <title>[[sn1:p1:title]]</title>
+    </titleInfo>
+    <[[sn1:p2:type]] authority="[[sn1:p2:authority]]" valueURI="[[sn1:p2:valueURI]]">[[sn1:p2:value]]</[[sn1:p2:type]]>
+    <[[sn1:p3:type]] authority="[[sn1:p3:authority]]" valueURI="[[sn1:p3:valueURI]]">[[sn1:p3:value]]</[[sn1:p3:type]]>
+  </subject>
+  <subject authority="[[sn2:authority]]">
+    <name type="[[sn2:p1:nameType]]" authority="[[sn2:p1:authority]]" valueURI="[[sn2:p1:valueURI]]">
+      <namePart>[[sn2:p1:name]]</namePart>
+    </name>
+    <titleInfo>
+       <title>[[sn2:p1:title]]</title>
+    </titleInfo>
+    <[[sn2:p2:type]] authority="[[sn2:p2:authority]]" valueURI="[[sn2:p2:valueURI]]">[[sn2:p2:value]]</[[sn2:p2:type]]>
+    <[[sn2:p3:type]] authority="[[sn2:p3:authority]]" valueURI="[[sn2:p3:valueURI]]">[[sn2:p3:value]]</[[sn2:p3:type]]>
+  </subject>
+  <subject authority="[[sn3:authority]]">
+    <name type="[[sn3:p1:nameType]]" authority="[[sn3:p1:authority]]" valueURI="[[sn3:p1:valueURI]]">
+      <namePart>[[sn3:p1:name]]</namePart>
+    </name>
+    <titleInfo>
+       <title>[[sn3:p1:title]]</title>
+    </titleInfo>
+    <[[sn3:p2:type]] authority="[[sn3:p2:authority]]" valueURI="[[sn3:p2:valueURI]]">[[sn3:p2:value]]</[[sn3:p2:type]]>
+    <[[sn3:p3:type]] authority="[[sn3:p3:authority]]" valueURI="[[sn3:p3:valueURI]]">[[sn3:p3:value]]</[[sn3:p3:type]]>
+  </subject>
+  <subject authority="[[sn4:authority]]">
+    <name type="[[sn4:p1:nameType]]" authority="[[sn4:p1:authority]]" valueURI="[[sn4:p1:valueURI]]">
+      <namePart>[[sn4:p1:name]]</namePart>
+    </name>
+    <titleInfo>
+       <title>[[sn4:p1:title]]</title>
+    </titleInfo>
+    <[[sn4:p2:type]] authority="[[sn4:p2:authority]]" valueURI="[[sn4:p2:valueURI]]">[[sn4:p2:value]]</[[sn4:p2:type]]>
+    <[[sn4:p3:type]] authority="[[sn4:p3:authority]]" valueURI="[[sn4:p3:valueURI]]">[[sn4:p3:value]]</[[sn4:p3:type]]>
+  </subject>
+  <subject authority="[[sn5:authority]]">
+    <name type="[[sn5:p1:nameType]]" authority="[[sn5:p1:authority]]" valueURI="[[sn5:p1:valueURI]]">
+      <namePart>[[sn5:p1:name]]</namePart>
+    </name>
+    <titleInfo>
+       <title>[[sn5:p1:title]]</title>
+    </titleInfo>
+    <[[sn5:p2:type]] authority="[[sn5:p2:authority]]" valueURI="[[sn5:p2:valueURI]]">[[sn5:p2:value]]</[[sn5:p2:type]]>
+    <[[sn5:p3:type]] authority="[[sn5:p3:authority]]" valueURI="[[sn5:p3:valueURI]]">[[sn5:p3:value]]</[[sn5:p3:type]]>
+  </subject>
+  <subject authority="[[su1:authority]]">
+    <[[su1:p1:type]] authority="[[su1:p1:authority]]" valueURI="[[su1:p1:valueURI]]">[[su1:p1:value]]</[[su1:p1:type]]>
+    <[[su1:p2:type]] authority="[[su1:p2:authority]]" valueURI="[[su1:p2:valueURI]]">[[su1:p2:value]]</[[su1:p2:type]]>
+    <[[su1:p3:type]] authority="[[su1:p3:authority]]" valueURI="[[su1:p3:valueURI]]">[[su1:p3:value]]</[[su1:p3:type]]>
+  </subject>
+  <subject authority="[[su2:authority]]">
+    <[[su2:p1:type]] authority="[[su2:p1:authority]]" valueURI="[[su2:p1:valueURI]]">[[su2:p1:value]]</[[su2:p1:type]]>
+    <[[su2:p2:type]] authority="[[su2:p2:authority]]" valueURI="[[su2:p2:valueURI]]">[[su2:p2:value]]</[[su2:p2:type]]>
+    <[[su2:p3:type]] authority="[[su2:p3:authority]]" valueURI="[[su2:p3:valueURI]]">[[su2:p3:value]]</[[su2:p3:type]]>
+  </subject>
+  <subject authority="[[su3:authority]]">
+    <[[su3:p1:type]] authority="[[su3:p1:authority]]" valueURI="[[su3:p1:valueURI]]">[[su3:p1:value]]</[[su3:p1:type]]>
+    <[[su3:p2:type]] authority="[[su3:p2:authority]]" valueURI="[[su3:p2:valueURI]]">[[su3:p2:value]]</[[su3:p2:type]]>
+    <[[su3:p3:type]] authority="[[su3:p3:authority]]" valueURI="[[su3:p3:valueURI]]">[[su3:p3:value]]</[[su3:p3:type]]>
+  </subject>
+  <subject authority="[[su4:authority]]">
+    <[[su4:p1:type]] authority="[[su4:p1:authority]]" valueURI="[[su4:p1:valueURI]]">[[su4:p1:value]]</[[su4:p1:type]]>
+    <[[su4:p2:type]] authority="[[su4:p2:authority]]" valueURI="[[su4:p2:valueURI]]">[[su4:p2:value]]</[[su4:p2:type]]>
+    <[[su4:p3:type]] authority="[[su4:p3:authority]]" valueURI="[[su4:p3:valueURI]]">[[su4:p3:value]]</[[su4:p3:type]]>
+  </subject>
+  <subject authority="[[su5:authority]]">
+    <[[su5:p1:type]] authority="[[su5:p1:authority]]" valueURI="[[su5:p1:valueURI]]">[[su5:p1:value]]</[[su5:p1:type]]>
+    <[[su5:p2:type]] authority="[[su5:p2:authority]]" valueURI="[[su5:p2:valueURI]]">[[su5:p2:value]]</[[su5:p2:type]]>
+    <[[su5:p3:type]] authority="[[su5:p3:authority]]" valueURI="[[su5:p3:valueURI]]">[[su5:p3:value]]</[[su5:p3:type]]>
+  </subject>
+  <subject>
+    <cartographics>
+      <scale>[[sc1:scale]]</scale>
+      <projection>[[sc1:projection]]</projection>
+      <coordinates>[[sc1:coordinates]]</coordinates>
+    </cartographics>
+  </subject>
+  <identifier type="[[id1:type]]" displayLabel="[[id1:displayLabel]]">[[id1:identifier]]</identifier>
+  <identifier type="[[id2:type]]" displayLabel="[[id2:displayLabel]]">[[id2:identifier]]</identifier>
+  <identifier type="[[id3:type]]" displayLabel="[[id3:displayLabel]]">[[id3:identifier]]</identifier>
+  <identifier type="[[id4:type]]" displayLabel="[[id4:displayLabel]]">[[id4:identifier]]</identifier>
+  <identifier type="[[id5:type]]" displayLabel="[[id5:displayLabel]]">[[id5:identifier]]</identifier>
+  <location>
+    <physicalLocation type="repository" authority="[[lo:authority]]" valueURI="[[lo:valueURI]]">[[lo:repository]]</physicalLocation>
+    <shelfLocator>[[lo:callNumber]]</shelfLocator>
+  </location>
+  <relatedItem type="[[ri1:type]]">
+    <titleInfo>
+      <title>[[ri1:title]]</title>
+    </titleInfo>
+    <url>[[ri1:url]]</url>
+  </relatedItem>
+</mods>

data/lib/modsulator/normalizer.rb ADDED Viewed

@@ -0,0 +1,218 @@
+# File "normalizer.rb" - defines a class for normalizing MODS XML according to the Stanford guidelines.
+require 'nokogiri'
+# This class provides methods to normalize MODS XML according to the Stanford guidelines.
+# @see https://consul.stanford.edu/display/chimera/MODS+validation+and+normalization Requirements (Stanford Consul page - requires login)
+class Normalizer
+  # Linefeed character entity reference
+  LINEFEED = '&#10;'
+  # Checks if a node has attributes that we make exeptions for. There are two such exceptions.
+  #
+  # * A "collection" attribute with the value "yes" <em>on a typeOfResource tag</em>.
+  # * A "manuscript" attribute with the value "yes" <em>on a typeOfResource tag</em>.
+  #
+  # Nodes that fall under any of these exceptions should not be deleted, even if they have no content.
+  #
+  # @param  [Nokogiri::XML::Element]   node    An XML node.
+  # @return [Boolean]                  true if the node contains any of the exceptional attributes, false otherwise.
+  def exceptional?(node)
+    return false unless node != nil
+    tag = node.name
+    attributes = node.attributes
+    return false if(attributes.empty?)
+    attributes.each do |key, value|
+      if(tag == 'typeOfResource')  # Note that according to the MODS schema, any other value than 'yes' for these attributes is invalid
+        if((key == 'collection' && value.to_s.downcase == 'yes') ||
+           (key == 'manuscript' && value.to_s.downcase == 'yes'))
+          return true
+        end
+      end
+    end
+    return false
+  end
+  # Recursive helper method for {Normalizer#clean_linefeeds} to do string substitution.
+  #
+  # @param [Nokogiri::XML::Element]   node   An XML node
+  # @return [String]                  A string composed of the entire contents of the given node, with substitutions made as described for {#clean_linefeeds}.
+  def substitute_linefeeds(node)
+    new_text = String.new
+    # If we substitute in '&#10;' by itself, Nokogiri interprets that and then prints '&amp;#10;' when printing the document later. This
+    # is an ugly way to add linefeed characters in a way that we at least get well-formatted output in the end.
+    if(node.text?)
+      new_text = node.content.gsub(/\r\n/, Nokogiri::HTML(LINEFEED).text).gsub(/\n/, Nokogiri::HTML(LINEFEED).text).gsub(/\r/, Nokogiri::HTML(LINEFEED).text).gsub('\\n', Nokogiri::HTML(LINEFEED).text)
+    else
+      if(node.node_name == 'br')
+        new_text += Nokogiri::HTML(LINEFEED).text
+      elsif(node.node_name == 'p')
+        new_text += Nokogiri::HTML(LINEFEED).text + Nokogiri::HTML(LINEFEED).text
+      end
+      node.children.each do |c|
+        new_text += substitute_linefeeds(c)
+      end
+    end
+    return new_text
+  end
+  # Given the root of an XML document, replaces linefeed characters inside <tableOfContents>, <abstract> and <note> XML node by &#10;
+  # \n, \r, <br> and <br/> are all replaced by a single &#10;
+  # <p> is replaced by two &#10;
+  # </p> is removed
+  # \r\n is replaced by &#10;
+  # Any tags not listed above are removed. MODS 3.5 does not allow for anything other than text inside these three nodes.
+  #
+  # @param   [Nokogiri::XML::Element]    node  The root node of an XML document
+  # @return  [Void]                      This method doesn't return anything, but introduces UTF-8 linefeed characters in place, as described above.
+  def clean_linefeeds(node)
+    node_list = []
+    if(node.namespace.nil?)
+      node_list = node.xpath('//abstract | //tableOfContents | //note')
+    else
+      node_list = node.xpath('//ns:abstract | //ns:tableOfContents | //ns:note', 'ns' => node.namespace.href)
+    end
+    node_list.each do |current_node|
+      new_text = substitute_linefeeds(current_node)
+      current_node.children.remove
+      current_node.content = new_text
+    end
+  end
+  # Cleans up the text of a node:
+  #
+  # * Removes extra whitespace at the beginning and end.
+  # * Removes any consecutive whitespace within the string.
+  #
+  # @param [String]   s   The text of an XML node.
+  # @return [String]  The cleaned string, as described. Returns nil if the input is nil, or if the input is an empty string.
+  def clean_text(s)
+    return nil unless s != nil && s != ''
+    return s.gsub!(/\s+/, ' ').strip!
+  end
+  # Removes empty attributes from a given node.
+  #
+  # @param [Nokogiri::XML::Element]   node An XML node.
+  # @return [Void]                    This method doesn't return anything, but modifies the XML tree starting at the given node.
+  def remove_empty_attributes(node)
+    children = node.children
+    attributes = node.attributes
+    attributes.each do |key, value|
+      node.remove_attribute(key) if(value.to_s.strip.empty?)
+    end
+    children.each do |c|
+      remove_empty_attributes(c)
+    end
+  end
+  # Removes empty nodes from an XML tree. See {#exceptional?} for nodes that are kept even if empty.
+  #
+  # @param  [Nokogiri::XML::Element]   node An XML node.
+  # @return [Void]                     This method doesn't return anything, but modifies the XML tree starting at the given node.
+  def remove_empty_nodes(node)
+    children = node.children
+    if(node.text?)
+      if(node.to_s.strip.empty?)
+        node.remove
+      else
+        return
+      end
+    elsif(children.length > 0)
+      children.each do |c|
+        remove_empty_nodes(c)
+      end
+    end
+    if(!exceptional?(node) && (node.children.length == 0))
+      node.remove
+    end
+  end
+  # Removes leading and trailing spaces from a node.
+  #
+  # @param  [Nokogiri::XML::Element]  node An XML node.
+  # @return [Void]                    This method doesn't return anything, but modifies the entire XML tree starting at the
+  #                                   the given node, removing leading and trailing spaces from all text. If the input is nil,
+  #                                   an exception will be raised.
+  def trim_text(node)
+    children = node.children
+    if(node.text?)
+      node.parent.content = node.text.strip
+    else
+      children.each do |c|
+        trim_text(c)
+      end
+    end
+  end
+  # Removes the point attribute from single <dateCreated> and <dateIssued> elements.
+  #
+  # @param [Nokogiri::XML::Element]   root  The root of a MODS XML document.
+  # @return [Void]                    The given document is modified in place.
+  def clean_date_attributes(root)
+    # Find all the <dateCreated> and <dateIssued> elements that are NOT immediately followed by another element with the same name
+    root.xpath('//mods:originInfo/mods:dateCreated[1][not(following-sibling::*[1][self::mods:dateCreated])] | //mods:originInfo/mods:dateIssued[1][not(following-sibling::*[1][self::mods:dateIssued])]', 'mods' => 'http://www.loc.gov/mods/v3').each do |current_element|
+      attributes = current_element.attributes
+      if(attributes.key?('point'))
+        current_element.remove_attribute('point')
+      end
+    end
+  end
+  # Sometimes there are spurious decimal digits within the date fields. This method removes any trailing decimal points within
+  # <dateCreated> and <dateIssued>.
+  #
+  # @param [Nokogiri::XML::Element]   root  The root of a MODS XML document.
+  # @return [Void]                    The given document is modified in place.
+  def clean_date_values(root)
+    root.xpath('//mods:dateCreated | //mods:dateIssued', 'mods' => 'http://www.loc.gov/mods/v3').each do |current_node|
+      current_node.content = current_node.content.sub(/(.*)\.\d+$/, '\1')
+    end
+  end
+  # Normalizes the given XML document according to the Stanford guidelines.
+  #
+  # @param  [Nokogiri::XML::Element]  root  The root of a MODS XML document.
+  # @return [Void]                    The given document is modified in place.
+  def normalize_document(root)
+    clean_linefeeds(root)   # Do this before deleting <br> and <p> with remove_empty_nodes()
+    remove_empty_attributes(root)
+    remove_empty_nodes(root)
+    trim_text(root)
+    clean_date_attributes(root)
+    clean_date_values(root)
+  end
+  # Normalizes the given XML document string according to the Stanford guidelines.
+  #
+  # @param  [String]   xml_string    An XML document
+  # @return [String]                 The XML string, with normalizations applied.
+  def normalize_xml_string(xml_string)
+    doc = Nokogiri::XML(xml_string)
+    normalize_document(doc.root)
+    doc.to_s
+  end
+end