modsulator 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +202 -0
  3. data/README.md +7 -0
  4. data/Rakefile +43 -0
  5. data/bin/modsulator +16 -0
  6. data/lib/doc/Modsulator.html +725 -0
  7. data/lib/doc/_index.html +101 -0
  8. data/lib/doc/class_list.html +58 -0
  9. data/lib/doc/css/common.css +1 -0
  10. data/lib/doc/css/full_list.css +57 -0
  11. data/lib/doc/css/style.css +339 -0
  12. data/lib/doc/file_list.html +57 -0
  13. data/lib/doc/frames.html +26 -0
  14. data/lib/doc/index.html +101 -0
  15. data/lib/doc/js/app.js +219 -0
  16. data/lib/doc/js/full_list.js +181 -0
  17. data/lib/doc/js/jquery.js +4 -0
  18. data/lib/doc/method_list.html +87 -0
  19. data/lib/doc/top-level-namespace.html +112 -0
  20. data/lib/modsulator.rb +193 -0
  21. data/lib/modsulator/mods-3-5.xsd +1463 -0
  22. data/lib/modsulator/modsulator.xsd +35 -0
  23. data/lib/modsulator/modsulator_sheet.rb +62 -0
  24. data/lib/modsulator/modsulator_template.xlsx +0 -0
  25. data/lib/modsulator/modsulator_template.xml +198 -0
  26. data/lib/modsulator/normalizer.rb +218 -0
  27. data/lib/modsulator/validator.rb +37 -0
  28. data/spec/features/modsulator_sheet_unit_spec.rb +18 -0
  29. data/spec/features/normalizer_unit_spec.rb +152 -0
  30. data/spec/features/process_template_spec.rb +29 -0
  31. data/spec/features/validator_unit_spec.rb +43 -0
  32. data/spec/fixtures/Fitch_Chavez.xlsx +0 -0
  33. data/spec/fixtures/Fitch_Chavez.xml +2806 -0
  34. data/spec/fixtures/Fitch_King.xlsx +0 -0
  35. data/spec/fixtures/Fitch_King.xml +2560 -0
  36. data/spec/fixtures/M1463_AV_manifest.xlsx +0 -0
  37. data/spec/fixtures/M1463_AV_manifest.xml +2373 -0
  38. data/spec/fixtures/Matter_manifest.csv +295 -0
  39. data/spec/fixtures/Matter_manifest.xml +3309 -0
  40. data/spec/fixtures/PosadaSpreadsheet.xlsx +0 -0
  41. data/spec/fixtures/PosadaSpreadsheet.xml +1259 -0
  42. data/spec/fixtures/ars0056_manifest.csv +1 -0
  43. data/spec/fixtures/ars0056_manifest.xml +9728 -0
  44. data/spec/fixtures/crowdsourcing_bridget_1.xlsx +0 -0
  45. data/spec/fixtures/crowdsourcing_bridget_1.xml +606 -0
  46. data/spec/fixtures/crowdsourcing_bridget_2.xlsx +0 -0
  47. data/spec/fixtures/crowdsourcing_bridget_2.xml +3433 -0
  48. data/spec/fixtures/filled_template.xlsx +0 -0
  49. data/spec/fixtures/invalid_crowdsourcing_bridget_1.xml +606 -0
  50. data/spec/fixtures/manifest_v0174.csv +34 -0
  51. data/spec/fixtures/manifest_v0174.xml +1026 -0
  52. data/spec/fixtures/roman_coins_mods_manifest.csv +176 -0
  53. data/spec/fixtures/roman_coins_mods_manifest.xml +3025 -0
  54. data/spec/fixtures/test:002.xml +63 -0
  55. data/spec/fixtures/test_002.csv +4 -0
  56. data/spec/fixtures/test_002.xlsx +0 -0
  57. data/spec/integration_tests/integration_spec.rb +40 -0
  58. data/spec/lib/modsulator_spec.rb +21 -0
  59. data/spec/spec_helper.rb +97 -0
  60. metadata +261 -0
@@ -0,0 +1,35 @@
1
+ <?xml version="1.0"?>
2
+ <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" targetNamespace="http://library.stanford.edu/xmlDocs" xmlns="http://library.stanford.edu/xmlDocs" elementFormDefault="qualified" attributeFormDefault="unqualified">
3
+ <!-- This XML Schema Definition defines the "xmlDocs" vocabulary that is produced by the MODSulator. -->
4
+ <!-- The main content of xmlDocs is full MODS 3.5 XML, as defined by the Library of Congress. Import their schema definition. -->
5
+ <xs:import namespace="http://www.loc.gov/mods/v3" schemaLocation="mods-3-5.xsd"/>
6
+
7
+ <xs:element name="xmlDocs">
8
+ <xs:complexType>
9
+ <xs:sequence minOccurs="0" maxOccurs="unbounded">
10
+ <xs:element name="xmlDoc"/>
11
+ </xs:sequence>
12
+ <xs:attributeGroup ref="docsAttributeGroup"/>
13
+ </xs:complexType>
14
+ </xs:element>
15
+
16
+ <xs:element name="xmlDoc">
17
+ <xs:complexType>
18
+ <xs:sequence>
19
+ <xs:element name="mods"/>
20
+ </xs:sequence>
21
+ <xs:attributeGroup ref="docAttributeGroup"/>
22
+ </xs:complexType>
23
+ </xs:element>
24
+
25
+ <xs:attributeGroup name="docsAttributeGroup">
26
+ <xs:attribute name="datetime" type="xs:string"/>
27
+ <xs:attribute name="sourceFile" type="xs:string"/>
28
+ </xs:attributeGroup>
29
+
30
+ <xs:attributeGroup name="docAttributeGroup">
31
+ <xs:attribute name="id" type="xs:string"/>
32
+ <xs:attribute name="objectId" type="xs:string"/>
33
+ </xs:attributeGroup>
34
+
35
+ </xs:schema>
@@ -0,0 +1,62 @@
1
+ # File "modsulator_sheet.rb" - a class to load and validate metadata spreadsheets (.xlsx or .csv) for input
2
+ # to Modsulator.
3
+
4
+ require 'json'
5
+ require 'roo'
6
+
7
+ # This class provides methods to parse Stanford's MODS spreadsheets into either an array of hashes, or a JSON string.
8
class ModsulatorSheet
  attr_reader :file, :filename

  # Maps a lower-cased filename extension to the format symbol handed to Roo.
  EXTENSIONS = { '.csv' => :csv, '.xls' => :xls, '.xlsx' => :xlsx }.freeze

  # Creates a new ModsulatorSheet. When called with temporary files, the filename must be specified separately,
  # hence the second argument.
  #
  # @param [File]   file     The input spreadsheet.
  # @param [String] filename The filename of the input spreadsheet; its extension selects the parser.
  def initialize(file, filename)
    @file = file
    @filename = filename
  end

  # Loads the input spreadsheet into an array of hashes. This spreadsheet should conform to the Stanford MODS
  # template format, which has three header rows. The first row is a kind of "super header", the second row is an
  # intermediate header and the third row is the header row that names the fields. The data rows are in the fourth
  # row onwards. The result is memoized.
  #
  # @return [Array<Hash>] An array with one entry per data row in the spreadsheet. Each entry is a hash, indexed by
  #                       the spreadsheet headers.
  def rows
    # Parse the spreadsheet, automatically finding the header row by looking for "druid" and "sourceId", and leave
    # the header row itself out of the resulting array (hence drop(1)). Everything preceding the header row is
    # discarded. Would like to use clean: true here, but the latest release of Roo 1.13.2 crashes. 2.0.0beta1 seems
    # to work though.
    @rows ||= spreadsheet.parse(header_search: ['druid', 'sourceId']).drop(1)
  end

  # Opens a spreadsheet based on its filename extension. The extension is matched case-insensitively, so
  # "manifest.XLSX" is handled like "manifest.xlsx". The result is memoized.
  #
  # @return [Roo::CSV, Roo::Excel, Roo::Excelx] A Roo object, whose type depends on the extension of the given filename.
  # @raise [RuntimeError] If the extension is not one of .csv, .xls or .xlsx.
  def spreadsheet
    @spreadsheet ||= begin
      roo_format = EXTENSIONS[File.extname(@filename).downcase]
      raise "Unknown file type: #{@filename}" unless roo_format

      Roo::Spreadsheet.open(@file, extension: roo_format)
    end
  end

  # Get the headers used in the spreadsheet.
  #
  # @return [Array] The keys of the first data row, or an empty array when the spreadsheet has no data rows.
  def headers
    first_row = rows.first
    first_row ? first_row.keys : []
  end

  # Convert the loaded spreadsheet to a JSON string.
  #
  # Accepts (and forwards) the optional generator arguments that the JSON library passes when this object is
  # serialized as part of a larger structure, e.g. <tt>[sheet].to_json</tt>.
  #
  # @return [String] A JSON string with the keys "filename" (base name only) and "rows".
  def to_json(*args)
    { 'filename' => File.basename(filename), 'rows' => rows }.to_json(*args)
  end
end
@@ -0,0 +1,198 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <mods xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" version="3.5" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-5.xsd">
3
+ <titleInfo authority="[[ti1:authority]]" valueURI="[[ti1:valueURI]]">
4
+ <nonSort>[[ti1:nonSort]]</nonSort>
5
+ <title>[[ti1:title]]</title>
6
+ <subTitle>[[ti1:subTitle]]</subTitle>
7
+ <partNumber>[[ti1:partNumber]]</partNumber>
8
+ <partName>[[ti1:partName]]</partName>
9
+ </titleInfo>
10
+ <titleInfo type="[[ti2:type]]" authority="[[ti2:authority]]" valueURI="[[ti2:valueURI]]">
11
+ <nonSort>[[ti2:nonSort]]</nonSort>
12
+ <title>[[ti2:title]]</title>
13
+ <subTitle>[[ti2:subTitle]]</subTitle>
14
+ <partNumber>[[ti2:partNumber]]</partNumber>
15
+ <partName>[[ti2:partName]]</partName>
16
+ </titleInfo>
17
+ <titleInfo type="[[ti3:type]]" authority="[[ti3:authority]]" valueURI="[[ti3:valueURI]]">
18
+ <nonSort>[[ti3:nonSort]]</nonSort>
19
+ <title>[[ti3:title]]</title>
20
+ <subTitle>[[ti3:subTitle]]</subTitle>
21
+ <partNumber>[[ti3:partNumber]]</partNumber>
22
+ <partName>[[ti3:partName]]</partName>
23
+ </titleInfo>
24
+ <name type="[[na1:type]]" authority="[[na1:authority]]" valueURI="[[na1:valueURI]]">
25
+ <namePart>[[na1:namePart]]</namePart>
26
+ <role>
27
+ <roleTerm type="code" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleCode]]</roleTerm>
28
+ <roleTerm type="text" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleText]]</roleTerm>
29
+ </role>
30
+ </name>
31
+ <name type="[[na2:type]]" authority="[[na2:authority]]" valueURI="[[na2:valueURI]]">
32
+ <namePart>[[na2:namePart]]</namePart>
33
+ <role>
34
+ <roleTerm type="code" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleCode]]</roleTerm>
35
+ <roleTerm type="text" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleText]]</roleTerm>
36
+ </role>
37
+ </name>
38
+ <name type="[[na3:type]]" authority="[[na3:authority]]" valueURI="[[na3:valueURI]]">
39
+ <namePart>[[na3:namePart]]</namePart>
40
+ <role>
41
+ <roleTerm type="code" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleCode]]</roleTerm>
42
+ <roleTerm type="text" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleText]]</roleTerm>
43
+ </role>
44
+ </name>
45
+ <name type="[[na4:type]]" authority="[[na4:authority]]" valueURI="[[na4:valueURI]]">
46
+ <namePart>[[na4:namePart]]</namePart>
47
+ <role>
48
+ <roleTerm type="code" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleCode]]</roleTerm>
49
+ <roleTerm type="text" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleText]]</roleTerm>
50
+ </role>
51
+ </name>
52
+ <name type="[[na5:type]]" authority="[[na5:authority]]" valueURI="[[na5:valueURI]]">
53
+ <namePart>[[na5:namePart]]</namePart>
54
+ <role>
55
+ <roleTerm type="code" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleCode]]</roleTerm>
56
+ <roleTerm type="text" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleText]]</roleTerm>
57
+ </role>
58
+ </name>
59
+ <typeOfResource manuscript="[[ty1:manuscript]]">[[ty1:typeOfResource]]</typeOfResource>
60
+ <typeOfResource>[[ty2:typeOfResource]]</typeOfResource>
61
+ <typeOfResource>[[ty3:typeOfResource]]</typeOfResource>
62
+ <genre type="[[ge1:type]]" authority="[[ge1:authority]]" valueURI="[[ge1:valueURI]]">[[ge1:genre]]</genre>
63
+ <genre type="[[ge2:type]]" authority="[[ge2:authority]]" valueURI="[[ge2:valueURI]]">[[ge2:genre]]</genre>
64
+ <genre type="[[ge3:type]]" authority="[[ge3:authority]]" valueURI="[[ge3:valueURI]]">[[ge3:genre]]</genre>
65
+ <originInfo>
66
+ <place>
67
+ <placeTerm type="code" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeCode]]</placeTerm>
68
+ <placeTerm type="text" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeText]]</placeTerm>
69
+ </place>
70
+ <publisher>[[or:publisher]]</publisher>
71
+ <dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateCreated]]</dateCreated>
72
+ <dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateCreated2]]</dateCreated>
73
+ <dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateIssued]]</dateIssued>
74
+ <dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateIssued2]]</dateIssued>
75
+ </originInfo>
76
+ <language>
77
+ <languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="code">[[la1:code]]</languageTerm>
78
+ <languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="text">[[la1:text]]</languageTerm>
79
+ </language>
80
+ <language>
81
+ <languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="code">[[la2:code]]</languageTerm>
82
+ <languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="text">[[la2:text]]</languageTerm>
83
+ </language>
84
+ <language>
85
+ <languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="code">[[la3:code]]</languageTerm>
86
+ <languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="text">[[la3:text]]</languageTerm>
87
+ </language>
88
+ <physicalDescription>
89
+ <form>[[ph1:form]]</form>
90
+ <extent>[[ph1:extent]]</extent>
91
+ <digitalOrigin>[[ph1:digitalOrigin]]</digitalOrigin>
92
+ <note displayLabel="[[ph1:displayLabel]]">[[ph1:note]]</note>
93
+ </physicalDescription>
94
+ <abstract displayLabel="[[ab:displayLabel]]">[[ab:abstract]]</abstract>
95
+ <tableOfContents displayLabel="[[tc:displayLabel]]">[[tc:tableOfContents]]</tableOfContents>
96
+ <note type="[[no1:type]]" displayLabel="[[no1:displayLabel]]">[[no1:note]]</note>
97
+ <note type="[[no2:type]]" displayLabel="[[no2:displayLabel]]">[[no2:note]]</note>
98
+ <note type="[[no3:type]]" displayLabel="[[no3:displayLabel]]">[[no3:note]]</note>
99
+ <note type="[[no4:type]]" displayLabel="[[no4:displayLabel]]">[[no4:note]]</note>
100
+ <note type="[[no5:type]]" displayLabel="[[no5:displayLabel]]">[[no5:note]]</note>
101
+ <subject authority="[[sn1:authority]]">
102
+ <name type="[[sn1:p1:nameType]]" authority="[[sn1:p1:authority]]" valueURI="[[sn1:p1:valueURI]]">
103
+ <namePart>[[sn1:p1:name]]</namePart>
104
+ </name>
105
+ <titleInfo>
106
+ <title>[[sn1:p1:title]]</title>
107
+ </titleInfo>
108
+ <[[sn1:p2:type]] authority="[[sn1:p2:authority]]" valueURI="[[sn1:p2:valueURI]]">[[sn1:p2:value]]</[[sn1:p2:type]]>
109
+ <[[sn1:p3:type]] authority="[[sn1:p3:authority]]" valueURI="[[sn1:p3:valueURI]]">[[sn1:p3:value]]</[[sn1:p3:type]]>
110
+ </subject>
111
+ <subject authority="[[sn2:authority]]">
112
+ <name type="[[sn2:p1:nameType]]" authority="[[sn2:p1:authority]]" valueURI="[[sn2:p1:valueURI]]">
113
+ <namePart>[[sn2:p1:name]]</namePart>
114
+ </name>
115
+ <titleInfo>
116
+ <title>[[sn2:p1:title]]</title>
117
+ </titleInfo>
118
+ <[[sn2:p2:type]] authority="[[sn2:p2:authority]]" valueURI="[[sn2:p2:valueURI]]">[[sn2:p2:value]]</[[sn2:p2:type]]>
119
+ <[[sn2:p3:type]] authority="[[sn2:p3:authority]]" valueURI="[[sn2:p3:valueURI]]">[[sn2:p3:value]]</[[sn2:p3:type]]>
120
+ </subject>
121
+ <subject authority="[[sn3:authority]]">
122
+ <name type="[[sn3:p1:nameType]]" authority="[[sn3:p1:authority]]" valueURI="[[sn3:p1:valueURI]]">
123
+ <namePart>[[sn3:p1:name]]</namePart>
124
+ </name>
125
+ <titleInfo>
126
+ <title>[[sn3:p1:title]]</title>
127
+ </titleInfo>
128
+ <[[sn3:p2:type]] authority="[[sn3:p2:authority]]" valueURI="[[sn3:p2:valueURI]]">[[sn3:p2:value]]</[[sn3:p2:type]]>
129
+ <[[sn3:p3:type]] authority="[[sn3:p3:authority]]" valueURI="[[sn3:p3:valueURI]]">[[sn3:p3:value]]</[[sn3:p3:type]]>
130
+ </subject>
131
+ <subject authority="[[sn4:authority]]">
132
+ <name type="[[sn4:p1:nameType]]" authority="[[sn4:p1:authority]]" valueURI="[[sn4:p1:valueURI]]">
133
+ <namePart>[[sn4:p1:name]]</namePart>
134
+ </name>
135
+ <titleInfo>
136
+ <title>[[sn4:p1:title]]</title>
137
+ </titleInfo>
138
+ <[[sn4:p2:type]] authority="[[sn4:p2:authority]]" valueURI="[[sn4:p2:valueURI]]">[[sn4:p2:value]]</[[sn4:p2:type]]>
139
+ <[[sn4:p3:type]] authority="[[sn4:p3:authority]]" valueURI="[[sn4:p3:valueURI]]">[[sn4:p3:value]]</[[sn4:p3:type]]>
140
+ </subject>
141
+ <subject authority="[[sn5:authority]]">
142
+ <name type="[[sn5:p1:nameType]]" authority="[[sn5:p1:authority]]" valueURI="[[sn5:p1:valueURI]]">
143
+ <namePart>[[sn5:p1:name]]</namePart>
144
+ </name>
145
+ <titleInfo>
146
+ <title>[[sn5:p1:title]]</title>
147
+ </titleInfo>
148
+ <[[sn5:p2:type]] authority="[[sn5:p2:authority]]" valueURI="[[sn5:p2:valueURI]]">[[sn5:p2:value]]</[[sn5:p2:type]]>
149
+ <[[sn5:p3:type]] authority="[[sn5:p3:authority]]" valueURI="[[sn5:p3:valueURI]]">[[sn5:p3:value]]</[[sn5:p3:type]]>
150
+ </subject>
151
+ <subject authority="[[su1:authority]]">
152
+ <[[su1:p1:type]] authority="[[su1:p1:authority]]" valueURI="[[su1:p1:valueURI]]">[[su1:p1:value]]</[[su1:p1:type]]>
153
+ <[[su1:p2:type]] authority="[[su1:p2:authority]]" valueURI="[[su1:p2:valueURI]]">[[su1:p2:value]]</[[su1:p2:type]]>
154
+ <[[su1:p3:type]] authority="[[su1:p3:authority]]" valueURI="[[su1:p3:valueURI]]">[[su1:p3:value]]</[[su1:p3:type]]>
155
+ </subject>
156
+ <subject authority="[[su2:authority]]">
157
+ <[[su2:p1:type]] authority="[[su2:p1:authority]]" valueURI="[[su2:p1:valueURI]]">[[su2:p1:value]]</[[su2:p1:type]]>
158
+ <[[su2:p2:type]] authority="[[su2:p2:authority]]" valueURI="[[su2:p2:valueURI]]">[[su2:p2:value]]</[[su2:p2:type]]>
159
+ <[[su2:p3:type]] authority="[[su2:p3:authority]]" valueURI="[[su2:p3:valueURI]]">[[su2:p3:value]]</[[su2:p3:type]]>
160
+ </subject>
161
+ <subject authority="[[su3:authority]]">
162
+ <[[su3:p1:type]] authority="[[su3:p1:authority]]" valueURI="[[su3:p1:valueURI]]">[[su3:p1:value]]</[[su3:p1:type]]>
163
+ <[[su3:p2:type]] authority="[[su3:p2:authority]]" valueURI="[[su3:p2:valueURI]]">[[su3:p2:value]]</[[su3:p2:type]]>
164
+ <[[su3:p3:type]] authority="[[su3:p3:authority]]" valueURI="[[su3:p3:valueURI]]">[[su3:p3:value]]</[[su3:p3:type]]>
165
+ </subject>
166
+ <subject authority="[[su4:authority]]">
167
+ <[[su4:p1:type]] authority="[[su4:p1:authority]]" valueURI="[[su4:p1:valueURI]]">[[su4:p1:value]]</[[su4:p1:type]]>
168
+ <[[su4:p2:type]] authority="[[su4:p2:authority]]" valueURI="[[su4:p2:valueURI]]">[[su4:p2:value]]</[[su4:p2:type]]>
169
+ <[[su4:p3:type]] authority="[[su4:p3:authority]]" valueURI="[[su4:p3:valueURI]]">[[su4:p3:value]]</[[su4:p3:type]]>
170
+ </subject>
171
+ <subject authority="[[su5:authority]]">
172
+ <[[su5:p1:type]] authority="[[su5:p1:authority]]" valueURI="[[su5:p1:valueURI]]">[[su5:p1:value]]</[[su5:p1:type]]>
173
+ <[[su5:p2:type]] authority="[[su5:p2:authority]]" valueURI="[[su5:p2:valueURI]]">[[su5:p2:value]]</[[su5:p2:type]]>
174
+ <[[su5:p3:type]] authority="[[su5:p3:authority]]" valueURI="[[su5:p3:valueURI]]">[[su5:p3:value]]</[[su5:p3:type]]>
175
+ </subject>
176
+ <subject>
177
+ <cartographics>
178
+ <scale>[[sc1:scale]]</scale>
179
+ <projection>[[sc1:projection]]</projection>
180
+ <coordinates>[[sc1:coordinates]]</coordinates>
181
+ </cartographics>
182
+ </subject>
183
+ <identifier type="[[id1:type]]" displayLabel="[[id1:displayLabel]]">[[id1:identifier]]</identifier>
184
+ <identifier type="[[id2:type]]" displayLabel="[[id2:displayLabel]]">[[id2:identifier]]</identifier>
185
+ <identifier type="[[id3:type]]" displayLabel="[[id3:displayLabel]]">[[id3:identifier]]</identifier>
186
+ <identifier type="[[id4:type]]" displayLabel="[[id4:displayLabel]]">[[id4:identifier]]</identifier>
187
+ <identifier type="[[id5:type]]" displayLabel="[[id5:displayLabel]]">[[id5:identifier]]</identifier>
188
+ <location>
189
+ <physicalLocation type="repository" authority="[[lo:authority]]" valueURI="[[lo:valueURI]]">[[lo:repository]]</physicalLocation>
190
+ <shelfLocator>[[lo:callNumber]]</shelfLocator>
191
+ </location>
192
+ <relatedItem type="[[ri1:type]]">
193
+ <titleInfo>
194
+ <title>[[ri1:title]]</title>
195
+ </titleInfo>
196
+ <url>[[ri1:url]]</url>
197
+ </relatedItem>
198
+ </mods>
@@ -0,0 +1,218 @@
1
+ # File "normalizer.rb" - defines a class for normalizing MODS XML according to the Stanford guidelines.
2
+
3
+ require 'nokogiri'
4
+
5
+ # This class provides methods to normalize MODS XML according to the Stanford guidelines.
6
+ # @see https://consul.stanford.edu/display/chimera/MODS+validation+and+normalization Requirements (Stanford Consul page - requires login)
7
class Normalizer
  # Linefeed character entity reference
  LINEFEED = '&#10;'

  # Namespace URI of MODS version 3, used when evaluating XPath expressions.
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'

  # Checks if a node has attributes that we make exceptions for. There are two such exceptions.
  #
  # * A "collection" attribute with the value "yes" <em>on a typeOfResource tag</em>.
  # * A "manuscript" attribute with the value "yes" <em>on a typeOfResource tag</em>.
  #
  # Nodes that fall under any of these exceptions should not be deleted, even if they have no content.
  # The value comparison is case-insensitive.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Boolean] true if the node contains any of the exceptional attributes, false otherwise (also for nil).
  def exceptional?(node)
    return false if node.nil?
    return false unless node.name == 'typeOfResource'

    # Note that according to the MODS schema, any other value than 'yes' for these attributes is invalid.
    node.attributes.any? do |key, value|
      %w[collection manuscript].include?(key) && value.to_s.downcase == 'yes'
    end
  end

  # Recursive helper method for {Normalizer#clean_linefeeds} to do string substitution.
  #
  # @param [Nokogiri::XML::Element] node An XML node
  # @return [String] A string composed of the entire contents of the given node, with substitutions made as
  #                  described for {#clean_linefeeds}.
  def substitute_linefeeds(node)
    # Text node: replace CRLF, CR, LF and the literal two-character sequence backslash-n by one linefeed each.
    # CRLF is listed first so that it yields a single linefeed rather than two.
    return node.content.gsub(/\r\n|\r|\n|\\n/, linefeed_character) if node.text?

    # Element node: <br> contributes one linefeed, <p> two; any other tag contributes nothing of its own.
    prefix = case node.node_name
             when 'br' then linefeed_character
             when 'p' then linefeed_character * 2
             else ''
             end

    node.children.reduce(prefix) { |text, child| text + substitute_linefeeds(child) }
  end

  # Given the root of an XML document, replaces linefeed characters inside <tableOfContents>, <abstract> and <note>
  # XML node by &#10;
  # \n, \r, <br> and <br/> are all replaced by a single &#10;
  # <p> is replaced by two &#10;
  # </p> is removed
  # \r\n is replaced by &#10;
  # Any tags not listed above are removed. MODS 3.5 does not allow for anything other than text inside these three nodes.
  #
  # @param [Nokogiri::XML::Element] node The root node of an XML document
  # @return [Void] This method doesn't return anything, but introduces UTF-8 linefeed characters in place, as
  #                described above.
  def clean_linefeeds(node)
    namespace = node.namespace
    targets = if namespace.nil?
                node.xpath('//abstract | //tableOfContents | //note')
              else
                node.xpath('//ns:abstract | //ns:tableOfContents | //ns:note', 'ns' => namespace.href)
              end

    targets.each do |target|
      flattened = substitute_linefeeds(target)
      target.children.remove
      target.content = flattened
    end
  end

  # Cleans up the text of a node:
  #
  # * Removes extra whitespace at the beginning and end.
  # * Collapses any run of consecutive whitespace within the string into a single space.
  #
  # The given string is not modified.
  #
  # @param [String] s The text of an XML node.
  # @return [String] The cleaned string, as described. Returns nil if the input is nil, or if the input is an empty string.
  def clean_text(s)
    return nil if s.nil? || s == ''

    # Use the non-destructive variants: gsub! and strip! return nil when they change nothing, which made the
    # previous chained call either raise NoMethodError or return nil for already-clean input.
    s.gsub(/\s+/, ' ').strip
  end

  # Removes empty attributes from a given node and, recursively, from all of its descendants. An attribute is
  # considered empty when its value is blank after stripping whitespace.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
  def remove_empty_attributes(node)
    node.attributes.each do |key, value|
      node.remove_attribute(key) if value.to_s.strip.empty?
    end

    node.children.each { |child| remove_empty_attributes(child) }
  end

  # Removes empty nodes from an XML tree. See {#exceptional?} for nodes that are kept even if empty.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
  def remove_empty_nodes(node)
    if node.text?
      # Text with actual content is kept as it is.
      return unless node.to_s.strip.empty?

      node.remove
    else
      # Clean the descendants first, so that a node whose children are all empty becomes empty itself.
      node.children.each { |child| remove_empty_nodes(child) }
    end

    node.remove if !exceptional?(node) && node.children.empty?
  end

  # Removes leading and trailing spaces from a node.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the entire XML tree starting at the
  #                the given node, removing leading and trailing spaces from all text. If the input is nil,
  #                an exception will be raised.
  def trim_text(node)
    if node.text?
      # NOTE(review): the content is set on the parent, not on the text node itself. This presumably replaces
      # every child of the parent, which would drop sibling elements in mixed content - confirm that is intended.
      node.parent.content = node.text.strip
    else
      node.children.each { |child| trim_text(child) }
    end
  end

  # Removes the point attribute from single <dateCreated> and <dateIssued> elements.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def clean_date_attributes(root)
    # Find all the <dateCreated> and <dateIssued> elements that are NOT immediately followed by another element
    # with the same name.
    single_dates = root.xpath('//mods:originInfo/mods:dateCreated[1][not(following-sibling::*[1][self::mods:dateCreated])] | //mods:originInfo/mods:dateIssued[1][not(following-sibling::*[1][self::mods:dateIssued])]', 'mods' => MODS_NAMESPACE)

    single_dates.each do |date_element|
      date_element.remove_attribute('point') if date_element.attributes.key?('point')
    end
  end

  # Sometimes there are spurious decimal digits within the date fields. This method removes a trailing decimal
  # point, together with the digits that follow it, within <dateCreated> and <dateIssued>.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def clean_date_values(root)
    root.xpath('//mods:dateCreated | //mods:dateIssued', 'mods' => MODS_NAMESPACE).each do |date_node|
      date_node.content = date_node.content.sub(/(.*)\.\d+$/, '\1')
    end
  end

  # Normalizes the given XML document according to the Stanford guidelines.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def normalize_document(root)
    clean_linefeeds(root) # Do this before deleting <br> and <p> with remove_empty_nodes()
    remove_empty_attributes(root)
    remove_empty_nodes(root)
    trim_text(root)
    clean_date_attributes(root)
    clean_date_values(root)
  end

  # Normalizes the given XML document string according to the Stanford guidelines.
  #
  # @param [String] xml_string An XML document
  # @return [String] The XML string, with normalizations applied.
  def normalize_xml_string(xml_string)
    doc = Nokogiri::XML(xml_string)
    normalize_document(doc.root)
    doc.to_s
  end

  private

  # The linefeed text obtained by letting Nokogiri decode {LINEFEED}. If we substitute in '&#10;' by itself,
  # Nokogiri interprets that and then prints '&amp;#10;' when printing the document later. This is an ugly way to
  # add linefeed characters in a way that we at least get well-formatted output in the end. The decoded value is
  # memoized so that the HTML parser is not invoked once per substitution.
  #
  # @return [String] The decoded linefeed text.
  def linefeed_character
    @linefeed_character ||= Nokogiri::HTML(LINEFEED).text
  end
end
218
+