modsulator 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +202 -0
- data/README.md +7 -0
- data/Rakefile +43 -0
- data/bin/modsulator +16 -0
- data/lib/doc/Modsulator.html +725 -0
- data/lib/doc/_index.html +101 -0
- data/lib/doc/class_list.html +58 -0
- data/lib/doc/css/common.css +1 -0
- data/lib/doc/css/full_list.css +57 -0
- data/lib/doc/css/style.css +339 -0
- data/lib/doc/file_list.html +57 -0
- data/lib/doc/frames.html +26 -0
- data/lib/doc/index.html +101 -0
- data/lib/doc/js/app.js +219 -0
- data/lib/doc/js/full_list.js +181 -0
- data/lib/doc/js/jquery.js +4 -0
- data/lib/doc/method_list.html +87 -0
- data/lib/doc/top-level-namespace.html +112 -0
- data/lib/modsulator.rb +193 -0
- data/lib/modsulator/mods-3-5.xsd +1463 -0
- data/lib/modsulator/modsulator.xsd +35 -0
- data/lib/modsulator/modsulator_sheet.rb +62 -0
- data/lib/modsulator/modsulator_template.xlsx +0 -0
- data/lib/modsulator/modsulator_template.xml +198 -0
- data/lib/modsulator/normalizer.rb +218 -0
- data/lib/modsulator/validator.rb +37 -0
- data/spec/features/modsulator_sheet_unit_spec.rb +18 -0
- data/spec/features/normalizer_unit_spec.rb +152 -0
- data/spec/features/process_template_spec.rb +29 -0
- data/spec/features/validator_unit_spec.rb +43 -0
- data/spec/fixtures/Fitch_Chavez.xlsx +0 -0
- data/spec/fixtures/Fitch_Chavez.xml +2806 -0
- data/spec/fixtures/Fitch_King.xlsx +0 -0
- data/spec/fixtures/Fitch_King.xml +2560 -0
- data/spec/fixtures/M1463_AV_manifest.xlsx +0 -0
- data/spec/fixtures/M1463_AV_manifest.xml +2373 -0
- data/spec/fixtures/Matter_manifest.csv +295 -0
- data/spec/fixtures/Matter_manifest.xml +3309 -0
- data/spec/fixtures/PosadaSpreadsheet.xlsx +0 -0
- data/spec/fixtures/PosadaSpreadsheet.xml +1259 -0
- data/spec/fixtures/ars0056_manifest.csv +1 -0
- data/spec/fixtures/ars0056_manifest.xml +9728 -0
- data/spec/fixtures/crowdsourcing_bridget_1.xlsx +0 -0
- data/spec/fixtures/crowdsourcing_bridget_1.xml +606 -0
- data/spec/fixtures/crowdsourcing_bridget_2.xlsx +0 -0
- data/spec/fixtures/crowdsourcing_bridget_2.xml +3433 -0
- data/spec/fixtures/filled_template.xlsx +0 -0
- data/spec/fixtures/invalid_crowdsourcing_bridget_1.xml +606 -0
- data/spec/fixtures/manifest_v0174.csv +34 -0
- data/spec/fixtures/manifest_v0174.xml +1026 -0
- data/spec/fixtures/roman_coins_mods_manifest.csv +176 -0
- data/spec/fixtures/roman_coins_mods_manifest.xml +3025 -0
- data/spec/fixtures/test:002.xml +63 -0
- data/spec/fixtures/test_002.csv +4 -0
- data/spec/fixtures/test_002.xlsx +0 -0
- data/spec/integration_tests/integration_spec.rb +40 -0
- data/spec/lib/modsulator_spec.rb +21 -0
- data/spec/spec_helper.rb +97 -0
- metadata +261 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" targetNamespace="http://library.stanford.edu/xmlDocs" xmlns="http://library.stanford.edu/xmlDocs" elementFormDefault="qualified" attributeFormDefault="unqualified">
|
3
|
+
<!-- This XML Schema Definition defines the "xmlDocs" vocabulary that is produced by the MODSulator. -->
|
4
|
+
<!-- The main content of xmlDocs is full MODS 3.5 XML, as defined by the Library of Congress. Import their schema definition. -->
|
5
|
+
<xs:import namespace="http://www.loc.gov/mods/v3" schemaLocation="mods-3-5.xsd"/>
|
6
|
+
|
7
|
+
<xs:element name="xmlDocs">
|
8
|
+
<xs:complexType>
|
9
|
+
<xs:sequence minOccurs="0" maxOccurs="unbounded">
|
10
|
+
<xs:element name="xmlDoc"/>
|
11
|
+
</xs:sequence>
|
12
|
+
<xs:attributeGroup ref="docsAttributeGroup"/>
|
13
|
+
</xs:complexType>
|
14
|
+
</xs:element>
|
15
|
+
|
16
|
+
<xs:element name="xmlDoc">
|
17
|
+
<xs:complexType>
|
18
|
+
<xs:sequence>
|
19
|
+
<xs:element name="mods"/>
|
20
|
+
</xs:sequence>
|
21
|
+
<xs:attributeGroup ref="docAttributeGroup"/>
|
22
|
+
</xs:complexType>
|
23
|
+
</xs:element>
|
24
|
+
|
25
|
+
<xs:attributeGroup name="docsAttributeGroup">
|
26
|
+
<xs:attribute name="datetime" type="xs:string"/>
|
27
|
+
<xs:attribute name="sourceFile" type="xs:string"/>
|
28
|
+
</xs:attributeGroup>
|
29
|
+
|
30
|
+
<xs:attributeGroup name="docAttributeGroup">
|
31
|
+
<xs:attribute name="id" type="xs:string"/>
|
32
|
+
<xs:attribute name="objectId" type="xs:string"/>
|
33
|
+
</xs:attributeGroup>
|
34
|
+
|
35
|
+
</xs:schema>
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# File "modsulator_sheet.rb" - a class to load and validate metadata spreadsheets (.xlsx or .csv) for input
|
2
|
+
# to Modsulator.
|
3
|
+
|
4
|
+
require 'json'
|
5
|
+
require 'roo'
|
6
|
+
|
7
|
+
# This class provides methods to parse Stanford's MODS spreadsheets into either an array of hashes, or a JSON string.
|
8
|
+
class ModsulatorSheet
  # Maps a (lower-cased) filename extension to the Roo extension symbol used to open the file.
  SUPPORTED_EXTENSIONS = { '.csv' => :csv, '.xls' => :xls, '.xlsx' => :xlsx }.freeze

  attr_reader :file, :filename

  # Creates a new ModsulatorSheet. When called with temporary files, the filename must be specified separately, hence the
  # second argument.
  #
  # @param [File] file       The input spreadsheet
  # @param [String] filename The filename of the input spreadsheet. Only its extension and basename are used by this class.
  def initialize(file, filename)
    @file = file
    @filename = filename
  end

  # Loads the input spreadsheet into an array of hashes. This spreadsheet should conform to the Stanford MODS template format,
  # which has three header rows. The first row is a kind of "super header", the second row is an intermediate header and the
  # third row is the header row that names the fields. The data rows are in the fourth row onwards.
  #
  # The result is memoized, so the spreadsheet is parsed at most once per instance.
  #
  # @return [Array<Hash>] An array with one entry per data row in the spreadsheet. Each entry is a hash, indexed by
  #                       the spreadsheet headers.
  def rows
    # Parse the spreadsheet, automatically finding the header row by looking for "druid" and "sourceId" and leave the
    # header row itself out of the resulting array. Everything preceding the header row is discarded. Would like to use
    # clean: true here, but the latest release of Roo 1.13.2 crashes. 2.0.0beta1 seems to work though.
    @rows ||= spreadsheet.parse(header_search: ['druid', 'sourceId']).drop(1)
  end

  # Opens a spreadsheet based on its filename extension. The extension is matched case-insensitively, so
  # "manifest.XLSX" is handled the same way as "manifest.xlsx". The opened object is memoized.
  #
  # @return [Roo::CSV, Roo::Excel, Roo::Excelx] A Roo object, whose type depends on the extension of the given filename.
  # @raise [RuntimeError] if the filename extension is not one of .csv, .xls or .xlsx.
  def spreadsheet
    @spreadsheet ||= begin
      extension = SUPPORTED_EXTENSIONS[File.extname(@filename).downcase]
      raise "Unknown file type: #{@filename}" unless extension

      Roo::Spreadsheet.open(@file, extension: extension)
    end
  end

  # Get the headers used in the spreadsheet, taken from the keys of the first data row.
  #
  # @return [Array] The header names, or an empty array when the spreadsheet has no data rows.
  def headers
    first_row = rows.first
    first_row ? first_row.keys : []
  end

  # Convert the loaded spreadsheet to a JSON string with the keys "filename" (basename only) and "rows".
  #
  # Any arguments are forwarded to Hash#to_json. The JSON generator passes its state object to #to_json when this
  # object is nested inside another structure (e.g. JSON.generate([sheet])), so the method must accept it.
  #
  # @return [String] A JSON string.
  def to_json(*args)
    {
      'filename' => File.basename(filename),
      'rows' => rows
    }.to_json(*args)
  end
end
|
Binary file
|
@@ -0,0 +1,198 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<mods xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" version="3.5" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-5.xsd">
|
3
|
+
<titleInfo authority="[[ti1:authority]]" valueURI="[[ti1:valueURI]]">
|
4
|
+
<nonSort>[[ti1:nonSort]]</nonSort>
|
5
|
+
<title>[[ti1:title]]</title>
|
6
|
+
<subTitle>[[ti1:subTitle]]</subTitle>
|
7
|
+
<partNumber>[[ti1:partNumber]]</partNumber>
|
8
|
+
<partName>[[ti1:partName]]</partName>
|
9
|
+
</titleInfo>
|
10
|
+
<titleInfo type="[[ti2:type]]" authority="[[ti2:authority]]" valueURI="[[ti2:valueURI]]">
|
11
|
+
<nonSort>[[ti2:nonSort]]</nonSort>
|
12
|
+
<title>[[ti2:title]]</title>
|
13
|
+
<subTitle>[[ti2:subTitle]]</subTitle>
|
14
|
+
<partNumber>[[ti2:partNumber]]</partNumber>
|
15
|
+
<partName>[[ti2:partName]]</partName>
|
16
|
+
</titleInfo>
|
17
|
+
<titleInfo type="[[ti3:type]]" authority="[[ti3:authority]]" valueURI="[[ti3:valueURI]]">
|
18
|
+
<nonSort>[[ti3:nonSort]]</nonSort>
|
19
|
+
<title>[[ti3:title]]</title>
|
20
|
+
<subTitle>[[ti3:subTitle]]</subTitle>
|
21
|
+
<partNumber>[[ti3:partNumber]]</partNumber>
|
22
|
+
<partName>[[ti3:partName]]</partName>
|
23
|
+
</titleInfo>
|
24
|
+
<name type="[[na1:type]]" authority="[[na1:authority]]" valueURI="[[na1:valueURI]]">
|
25
|
+
<namePart>[[na1:namePart]]</namePart>
|
26
|
+
<role>
|
27
|
+
<roleTerm type="code" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleCode]]</roleTerm>
|
28
|
+
<roleTerm type="text" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleText]]</roleTerm>
|
29
|
+
</role>
|
30
|
+
</name>
|
31
|
+
<name type="[[na2:type]]" authority="[[na2:authority]]" valueURI="[[na2:valueURI]]">
|
32
|
+
<namePart>[[na2:namePart]]</namePart>
|
33
|
+
<role>
|
34
|
+
<roleTerm type="code" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleCode]]</roleTerm>
|
35
|
+
<roleTerm type="text" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleText]]</roleTerm>
|
36
|
+
</role>
|
37
|
+
</name>
|
38
|
+
<name type="[[na3:type]]" authority="[[na3:authority]]" valueURI="[[na3:valueURI]]">
|
39
|
+
<namePart>[[na3:namePart]]</namePart>
|
40
|
+
<role>
|
41
|
+
<roleTerm type="code" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleCode]]</roleTerm>
|
42
|
+
<roleTerm type="text" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleText]]</roleTerm>
|
43
|
+
</role>
|
44
|
+
</name>
|
45
|
+
<name type="[[na4:type]]" authority="[[na4:authority]]" valueURI="[[na4:valueURI]]">
|
46
|
+
<namePart>[[na4:namePart]]</namePart>
|
47
|
+
<role>
|
48
|
+
<roleTerm type="code" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleCode]]</roleTerm>
|
49
|
+
<roleTerm type="text" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleText]]</roleTerm>
|
50
|
+
</role>
|
51
|
+
</name>
|
52
|
+
<name type="[[na5:type]]" authority="[[na5:authority]]" valueURI="[[na5:valueURI]]">
|
53
|
+
<namePart>[[na5:namePart]]</namePart>
|
54
|
+
<role>
|
55
|
+
<roleTerm type="code" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleCode]]</roleTerm>
|
56
|
+
<roleTerm type="text" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleText]]</roleTerm>
|
57
|
+
</role>
|
58
|
+
</name>
|
59
|
+
<typeOfResource manuscript="[[ty1:manuscript]]">[[ty1:typeOfResource]]</typeOfResource>
|
60
|
+
<typeOfResource>[[ty2:typeOfResource]]</typeOfResource>
|
61
|
+
<typeOfResource>[[ty3:typeOfResource]]</typeOfResource>
|
62
|
+
<genre type="[[ge1:type]]" authority="[[ge1:authority]]" valueURI="[[ge1:valueURI]]">[[ge1:genre]]</genre>
|
63
|
+
<genre type="[[ge2:type]]" authority="[[ge2:authority]]" valueURI="[[ge2:valueURI]]">[[ge2:genre]]</genre>
|
64
|
+
<genre type="[[ge3:type]]" authority="[[ge3:authority]]" valueURI="[[ge3:valueURI]]">[[ge3:genre]]</genre>
|
65
|
+
<originInfo>
|
66
|
+
<place>
|
67
|
+
<placeTerm type="code" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeCode]]</placeTerm>
|
68
|
+
<placeTerm type="text" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeText]]</placeTerm>
|
69
|
+
</place>
|
70
|
+
<publisher>[[or:publisher]]</publisher>
|
71
|
+
<dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateCreated]]</dateCreated>
|
72
|
+
<dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateCreated2]]</dateCreated>
|
73
|
+
<dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateIssued]]</dateIssued>
|
74
|
+
<dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateIssued2]]</dateIssued>
|
75
|
+
</originInfo>
|
76
|
+
<language>
|
77
|
+
<languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="code">[[la1:code]]</languageTerm>
|
78
|
+
<languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="text">[[la1:text]]</languageTerm>
|
79
|
+
</language>
|
80
|
+
<language>
|
81
|
+
<languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="code">[[la2:code]]</languageTerm>
|
82
|
+
<languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="text">[[la2:text]]</languageTerm>
|
83
|
+
</language>
|
84
|
+
<language>
|
85
|
+
<languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="code">[[la3:code]]</languageTerm>
|
86
|
+
<languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="text">[[la3:text]]</languageTerm>
|
87
|
+
</language>
|
88
|
+
<physicalDescription>
|
89
|
+
<form>[[ph1:form]]</form>
|
90
|
+
<extent>[[ph1:extent]]</extent>
|
91
|
+
<digitalOrigin>[[ph1:digitalOrigin]]</digitalOrigin>
|
92
|
+
<note displayLabel="[[ph1:displayLabel]]">[[ph1:note]]</note>
|
93
|
+
</physicalDescription>
|
94
|
+
<abstract displayLabel="[[ab:displayLabel]]">[[ab:abstract]]</abstract>
|
95
|
+
<tableOfContents displayLabel="[[tc:displayLabel]]">[[tc:tableOfContents]]</tableOfContents>
|
96
|
+
<note type="[[no1:type]]" displayLabel="[[no1:displayLabel]]">[[no1:note]]</note>
|
97
|
+
<note type="[[no2:type]]" displayLabel="[[no2:displayLabel]]">[[no2:note]]</note>
|
98
|
+
<note type="[[no3:type]]" displayLabel="[[no3:displayLabel]]">[[no3:note]]</note>
|
99
|
+
<note type="[[no4:type]]" displayLabel="[[no4:displayLabel]]">[[no4:note]]</note>
|
100
|
+
<note type="[[no5:type]]" displayLabel="[[no5:displayLabel]]">[[no5:note]]</note>
|
101
|
+
<subject authority="[[sn1:authority]]">
|
102
|
+
<name type="[[sn1:p1:nameType]]" authority="[[sn1:p1:authority]]" valueURI="[[sn1:p1:valueURI]]">
|
103
|
+
<namePart>[[sn1:p1:name]]</namePart>
|
104
|
+
</name>
|
105
|
+
<titleInfo>
|
106
|
+
<title>[[sn1:p1:title]]</title>
|
107
|
+
</titleInfo>
|
108
|
+
<[[sn1:p2:type]] authority="[[sn1:p2:authority]]" valueURI="[[sn1:p2:valueURI]]">[[sn1:p2:value]]</[[sn1:p2:type]]>
|
109
|
+
<[[sn1:p3:type]] authority="[[sn1:p3:authority]]" valueURI="[[sn1:p3:valueURI]]">[[sn1:p3:value]]</[[sn1:p3:type]]>
|
110
|
+
</subject>
|
111
|
+
<subject authority="[[sn2:authority]]">
|
112
|
+
<name type="[[sn2:p1:nameType]]" authority="[[sn2:p1:authority]]" valueURI="[[sn2:p1:valueURI]]">
|
113
|
+
<namePart>[[sn2:p1:name]]</namePart>
|
114
|
+
</name>
|
115
|
+
<titleInfo>
|
116
|
+
<title>[[sn2:p1:title]]</title>
|
117
|
+
</titleInfo>
|
118
|
+
<[[sn2:p2:type]] authority="[[sn2:p2:authority]]" valueURI="[[sn2:p2:valueURI]]">[[sn2:p2:value]]</[[sn2:p2:type]]>
|
119
|
+
<[[sn2:p3:type]] authority="[[sn2:p3:authority]]" valueURI="[[sn2:p3:valueURI]]">[[sn2:p3:value]]</[[sn2:p3:type]]>
|
120
|
+
</subject>
|
121
|
+
<subject authority="[[sn3:authority]]">
|
122
|
+
<name type="[[sn3:p1:nameType]]" authority="[[sn3:p1:authority]]" valueURI="[[sn3:p1:valueURI]]">
|
123
|
+
<namePart>[[sn3:p1:name]]</namePart>
|
124
|
+
</name>
|
125
|
+
<titleInfo>
|
126
|
+
<title>[[sn3:p1:title]]</title>
|
127
|
+
</titleInfo>
|
128
|
+
<[[sn3:p2:type]] authority="[[sn3:p2:authority]]" valueURI="[[sn3:p2:valueURI]]">[[sn3:p2:value]]</[[sn3:p2:type]]>
|
129
|
+
<[[sn3:p3:type]] authority="[[sn3:p3:authority]]" valueURI="[[sn3:p3:valueURI]]">[[sn3:p3:value]]</[[sn3:p3:type]]>
|
130
|
+
</subject>
|
131
|
+
<subject authority="[[sn4:authority]]">
|
132
|
+
<name type="[[sn4:p1:nameType]]" authority="[[sn4:p1:authority]]" valueURI="[[sn4:p1:valueURI]]">
|
133
|
+
<namePart>[[sn4:p1:name]]</namePart>
|
134
|
+
</name>
|
135
|
+
<titleInfo>
|
136
|
+
<title>[[sn4:p1:title]]</title>
|
137
|
+
</titleInfo>
|
138
|
+
<[[sn4:p2:type]] authority="[[sn4:p2:authority]]" valueURI="[[sn4:p2:valueURI]]">[[sn4:p2:value]]</[[sn4:p2:type]]>
|
139
|
+
<[[sn4:p3:type]] authority="[[sn4:p3:authority]]" valueURI="[[sn4:p3:valueURI]]">[[sn4:p3:value]]</[[sn4:p3:type]]>
|
140
|
+
</subject>
|
141
|
+
<subject authority="[[sn5:authority]]">
|
142
|
+
<name type="[[sn5:p1:nameType]]" authority="[[sn5:p1:authority]]" valueURI="[[sn5:p1:valueURI]]">
|
143
|
+
<namePart>[[sn5:p1:name]]</namePart>
|
144
|
+
</name>
|
145
|
+
<titleInfo>
|
146
|
+
<title>[[sn5:p1:title]]</title>
|
147
|
+
</titleInfo>
|
148
|
+
<[[sn5:p2:type]] authority="[[sn5:p2:authority]]" valueURI="[[sn5:p2:valueURI]]">[[sn5:p2:value]]</[[sn5:p2:type]]>
|
149
|
+
<[[sn5:p3:type]] authority="[[sn5:p3:authority]]" valueURI="[[sn5:p3:valueURI]]">[[sn5:p3:value]]</[[sn5:p3:type]]>
|
150
|
+
</subject>
|
151
|
+
<subject authority="[[su1:authority]]">
|
152
|
+
<[[su1:p1:type]] authority="[[su1:p1:authority]]" valueURI="[[su1:p1:valueURI]]">[[su1:p1:value]]</[[su1:p1:type]]>
|
153
|
+
<[[su1:p2:type]] authority="[[su1:p2:authority]]" valueURI="[[su1:p2:valueURI]]">[[su1:p2:value]]</[[su1:p2:type]]>
|
154
|
+
<[[su1:p3:type]] authority="[[su1:p3:authority]]" valueURI="[[su1:p3:valueURI]]">[[su1:p3:value]]</[[su1:p3:type]]>
|
155
|
+
</subject>
|
156
|
+
<subject authority="[[su2:authority]]">
|
157
|
+
<[[su2:p1:type]] authority="[[su2:p1:authority]]" valueURI="[[su2:p1:valueURI]]">[[su2:p1:value]]</[[su2:p1:type]]>
|
158
|
+
<[[su2:p2:type]] authority="[[su2:p2:authority]]" valueURI="[[su2:p2:valueURI]]">[[su2:p2:value]]</[[su2:p2:type]]>
|
159
|
+
<[[su2:p3:type]] authority="[[su2:p3:authority]]" valueURI="[[su2:p3:valueURI]]">[[su2:p3:value]]</[[su2:p3:type]]>
|
160
|
+
</subject>
|
161
|
+
<subject authority="[[su3:authority]]">
|
162
|
+
<[[su3:p1:type]] authority="[[su3:p1:authority]]" valueURI="[[su3:p1:valueURI]]">[[su3:p1:value]]</[[su3:p1:type]]>
|
163
|
+
<[[su3:p2:type]] authority="[[su3:p2:authority]]" valueURI="[[su3:p2:valueURI]]">[[su3:p2:value]]</[[su3:p2:type]]>
|
164
|
+
<[[su3:p3:type]] authority="[[su3:p3:authority]]" valueURI="[[su3:p3:valueURI]]">[[su3:p3:value]]</[[su3:p3:type]]>
|
165
|
+
</subject>
|
166
|
+
<subject authority="[[su4:authority]]">
|
167
|
+
<[[su4:p1:type]] authority="[[su4:p1:authority]]" valueURI="[[su4:p1:valueURI]]">[[su4:p1:value]]</[[su4:p1:type]]>
|
168
|
+
<[[su4:p2:type]] authority="[[su4:p2:authority]]" valueURI="[[su4:p2:valueURI]]">[[su4:p2:value]]</[[su4:p2:type]]>
|
169
|
+
<[[su4:p3:type]] authority="[[su4:p3:authority]]" valueURI="[[su4:p3:valueURI]]">[[su4:p3:value]]</[[su4:p3:type]]>
|
170
|
+
</subject>
|
171
|
+
<subject authority="[[su5:authority]]">
|
172
|
+
<[[su5:p1:type]] authority="[[su5:p1:authority]]" valueURI="[[su5:p1:valueURI]]">[[su5:p1:value]]</[[su5:p1:type]]>
|
173
|
+
<[[su5:p2:type]] authority="[[su5:p2:authority]]" valueURI="[[su5:p2:valueURI]]">[[su5:p2:value]]</[[su5:p2:type]]>
|
174
|
+
<[[su5:p3:type]] authority="[[su5:p3:authority]]" valueURI="[[su5:p3:valueURI]]">[[su5:p3:value]]</[[su5:p3:type]]>
|
175
|
+
</subject>
|
176
|
+
<subject>
|
177
|
+
<cartographics>
|
178
|
+
<scale>[[sc1:scale]]</scale>
|
179
|
+
<projection>[[sc1:projection]]</projection>
|
180
|
+
<coordinates>[[sc1:coordinates]]</coordinates>
|
181
|
+
</cartographics>
|
182
|
+
</subject>
|
183
|
+
<identifier type="[[id1:type]]" displayLabel="[[id1:displayLabel]]">[[id1:identifier]]</identifier>
|
184
|
+
<identifier type="[[id2:type]]" displayLabel="[[id2:displayLabel]]">[[id2:identifier]]</identifier>
|
185
|
+
<identifier type="[[id3:type]]" displayLabel="[[id3:displayLabel]]">[[id3:identifier]]</identifier>
|
186
|
+
<identifier type="[[id4:type]]" displayLabel="[[id4:displayLabel]]">[[id4:identifier]]</identifier>
|
187
|
+
<identifier type="[[id5:type]]" displayLabel="[[id5:displayLabel]]">[[id5:identifier]]</identifier>
|
188
|
+
<location>
|
189
|
+
<physicalLocation type="repository" authority="[[lo:authority]]" valueURI="[[lo:valueURI]]">[[lo:repository]]</physicalLocation>
|
190
|
+
<shelfLocator>[[lo:callNumber]]</shelfLocator>
|
191
|
+
</location>
|
192
|
+
<relatedItem type="[[ri1:type]]">
|
193
|
+
<titleInfo>
|
194
|
+
<title>[[ri1:title]]</title>
|
195
|
+
</titleInfo>
|
196
|
+
<url>[[ri1:url]]</url>
|
197
|
+
</relatedItem>
|
198
|
+
</mods>
|
@@ -0,0 +1,218 @@
|
|
1
|
+
# File "normalizer.rb" - defines a class for normalizing MODS XML according to the Stanford guidelines.
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
# This class provides methods to normalize MODS XML according to the Stanford guidelines.
|
6
|
+
# @see https://consul.stanford.edu/display/chimera/MODS+validation+and+normalization Requirements (Stanford Consul page - requires login)
|
7
|
+
class Normalizer
  # Linefeed character entity reference
  LINEFEED = '&#10;'

  # Attributes that, when set to "yes" on a <typeOfResource> element, protect the element from deletion.
  EXCEPTIONAL_ATTRIBUTES = %w[collection manuscript].freeze

  # Namespace URI used for the "mods" prefix in the date-related XPath queries.
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'

  # Checks if a node has attributes that we make exceptions for. There are two such exceptions.
  #
  # * A "collection" attribute with the value "yes" <em>on a typeOfResource tag</em>.
  # * A "manuscript" attribute with the value "yes" <em>on a typeOfResource tag</em>.
  #
  # Nodes that fall under any of these exceptions should not be deleted, even if they have no content.
  # The attribute value is compared case-insensitively.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Boolean] true if the node contains any of the exceptional attributes, false otherwise (including for nil).
  def exceptional?(node)
    return false if node.nil?
    # Note that according to the MODS schema, any other value than 'yes' for these attributes is invalid
    return false unless node.name == 'typeOfResource'

    node.attributes.any? do |key, value|
      EXCEPTIONAL_ATTRIBUTES.include?(key) && value.to_s.downcase == 'yes'
    end
  end

  # Recursive helper method for {Normalizer#clean_linefeeds} to do string substitution.
  #
  # In text nodes, \r\n, \n, \r and the literal two-character sequence backslash-n are each replaced by a single
  # linefeed character. A <br> element contributes one linefeed and a <p> element two, each followed by the
  # substituted text of its children; any other element contributes only the text of its children.
  #
  # @param [Nokogiri::XML::Element] node An XML node
  # @return [String] A string composed of the entire contents of the given node, with substitutions made as described for {#clean_linefeeds}.
  def substitute_linefeeds(node)
    # The block form keeps the replacement literal (no backreference interpretation).
    return node.content.gsub(/\r\n|\n|\r|\\n/) { linefeed } if node.text?

    prefix = case node.node_name
             when 'br' then linefeed
             when 'p' then linefeed * 2
             else ''
             end

    prefix + node.children.map { |child| substitute_linefeeds(child) }.join
  end

  # Given the root of an XML document, replaces linefeed characters inside <tableOfContents>, <abstract> and <note> XML node by &#10;
  # \n, \r, <br> and <br/> are all replaced by a single &#10;
  # <p> is replaced by two &#10;
  # </p> is removed
  # \r\n is replaced by &#10;
  # Any tags not listed above are removed. MODS 3.5 does not allow for anything other than text inside these three nodes.
  #
  # The search is document-wide ('//' XPath) and uses the given node's namespace, if it has one.
  #
  # @param [Nokogiri::XML::Element] node The root node of an XML document
  # @return [Void] This method doesn't return anything, but introduces UTF-8 linefeed characters in place, as described above.
  def clean_linefeeds(node)
    namespace = node.namespace
    targets = if namespace.nil?
                node.xpath('//abstract | //tableOfContents | //note')
              else
                node.xpath('//ns:abstract | //ns:tableOfContents | //ns:note', 'ns' => namespace.href)
              end

    targets.each do |current_node|
      # Compute the flattened text before detaching the children it is derived from.
      new_text = substitute_linefeeds(current_node)
      current_node.children.remove
      current_node.content = new_text
    end
  end

  # Cleans up the text of a node:
  #
  # * Removes extra whitespace at the beginning and end.
  # * Collapses any consecutive whitespace within the string into a single space.
  #
  # The input string is not modified; a new string is returned.
  #
  # @param [String] s The text of an XML node.
  # @return [String] The cleaned string, as described. Returns nil if the input is nil, or if the input is an empty string.
  def clean_text(s)
    return nil if s.nil? || s == ''

    # Non-bang methods on purpose: gsub! and strip! return nil when they change nothing, which previously
    # made this method raise or return nil for input that was already clean.
    s.gsub(/\s+/, ' ').strip
  end

  # Removes empty attributes (blank or whitespace-only values) from a given node and all of its descendants.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
  def remove_empty_attributes(node)
    node.attributes.each do |key, value|
      node.remove_attribute(key) if value.to_s.strip.empty?
    end

    node.children.each { |child| remove_empty_attributes(child) }
  end

  # Removes empty nodes from an XML tree. See {#exceptional?} for nodes that are kept even if empty.
  #
  # Children are processed first, so an element that only contained empty nodes is itself removed.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
  def remove_empty_nodes(node)
    if node.text?
      # Whitespace-only text is dropped; any other text node is kept untouched.
      node.remove if node.to_s.strip.empty?
      return
    end

    # node.children is a snapshot, so removing children while iterating is safe.
    node.children.each { |child| remove_empty_nodes(child) }

    node.remove if !exceptional?(node) && node.children.length == 0
  end

  # Removes leading and trailing spaces from a node.
  #
  # Each text node is trimmed individually, so sibling elements of a text node (mixed content) are preserved.
  # A text node that is empty after trimming is removed.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the entire XML tree starting at the
  #                the given node, removing leading and trailing spaces from all text. If the input is nil,
  #                an exception will be raised.
  def trim_text(node)
    if node.text?
      stripped = node.text.strip
      # Assigning to the text node itself (rather than to the parent's content) avoids replacing
      # every child of the parent with this single piece of text.
      if stripped.empty?
        node.remove
      else
        node.content = stripped
      end
    else
      node.children.each { |child| trim_text(child) }
    end
  end

  # Removes the point attribute from single <dateCreated> and <dateIssued> elements.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def clean_date_attributes(root)
    # Find all the <dateCreated> and <dateIssued> elements that are NOT immediately followed by another element with the same name
    single_dates = %w[dateCreated dateIssued].map do |tag|
      "//mods:originInfo/mods:#{tag}[1][not(following-sibling::*[1][self::mods:#{tag}])]"
    end.join(' | ')

    root.xpath(single_dates, 'mods' => MODS_NAMESPACE).each do |current_element|
      current_element.remove_attribute('point') if current_element.attributes.key?('point')
    end
  end

  # Sometimes there are spurious decimal digits within the date fields. This method removes any trailing decimal points within
  # <dateCreated> and <dateIssued>.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def clean_date_values(root)
    root.xpath('//mods:dateCreated | //mods:dateIssued', 'mods' => MODS_NAMESPACE).each do |current_node|
      current_node.content = current_node.content.sub(/(.*)\.\d+$/, '\1')
    end
  end

  # Normalizes the given XML document according to the Stanford guidelines.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def normalize_document(root)
    clean_linefeeds(root) # Do this before deleting <br> and <p> with remove_empty_nodes()
    remove_empty_attributes(root)
    remove_empty_nodes(root)
    trim_text(root)
    clean_date_attributes(root)
    clean_date_values(root)
  end

  # Normalizes the given XML document string according to the Stanford guidelines.
  #
  # @param [String] xml_string An XML document
  # @return [String] The XML string, with normalizations applied.
  def normalize_xml_string(xml_string)
    doc = Nokogiri::XML(xml_string)
    normalize_document(doc.root)
    doc.to_s
  end

  private

  # The linefeed text obtained by letting Nokogiri decode LINEFEED, computed once per instance.
  #
  # If we substitute in '&#10;' by itself, Nokogiri interprets that and then prints '&amp;#10;' when printing the
  # document later. This is an ugly way to add linefeed characters in a way that we at least get well-formatted
  # output in the end.
  def linefeed
    @linefeed ||= Nokogiri::HTML(LINEFEED).text
  end
end
|
218
|
+
|