modsulator 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +202 -0
  3. data/README.md +7 -0
  4. data/Rakefile +43 -0
  5. data/bin/modsulator +16 -0
  6. data/lib/doc/Modsulator.html +725 -0
  7. data/lib/doc/_index.html +101 -0
  8. data/lib/doc/class_list.html +58 -0
  9. data/lib/doc/css/common.css +1 -0
  10. data/lib/doc/css/full_list.css +57 -0
  11. data/lib/doc/css/style.css +339 -0
  12. data/lib/doc/file_list.html +57 -0
  13. data/lib/doc/frames.html +26 -0
  14. data/lib/doc/index.html +101 -0
  15. data/lib/doc/js/app.js +219 -0
  16. data/lib/doc/js/full_list.js +181 -0
  17. data/lib/doc/js/jquery.js +4 -0
  18. data/lib/doc/method_list.html +87 -0
  19. data/lib/doc/top-level-namespace.html +112 -0
  20. data/lib/modsulator.rb +193 -0
  21. data/lib/modsulator/mods-3-5.xsd +1463 -0
  22. data/lib/modsulator/modsulator.xsd +35 -0
  23. data/lib/modsulator/modsulator_sheet.rb +62 -0
  24. data/lib/modsulator/modsulator_template.xlsx +0 -0
  25. data/lib/modsulator/modsulator_template.xml +198 -0
  26. data/lib/modsulator/normalizer.rb +218 -0
  27. data/lib/modsulator/validator.rb +37 -0
  28. data/spec/features/modsulator_sheet_unit_spec.rb +18 -0
  29. data/spec/features/normalizer_unit_spec.rb +152 -0
  30. data/spec/features/process_template_spec.rb +29 -0
  31. data/spec/features/validator_unit_spec.rb +43 -0
  32. data/spec/fixtures/Fitch_Chavez.xlsx +0 -0
  33. data/spec/fixtures/Fitch_Chavez.xml +2806 -0
  34. data/spec/fixtures/Fitch_King.xlsx +0 -0
  35. data/spec/fixtures/Fitch_King.xml +2560 -0
  36. data/spec/fixtures/M1463_AV_manifest.xlsx +0 -0
  37. data/spec/fixtures/M1463_AV_manifest.xml +2373 -0
  38. data/spec/fixtures/Matter_manifest.csv +295 -0
  39. data/spec/fixtures/Matter_manifest.xml +3309 -0
  40. data/spec/fixtures/PosadaSpreadsheet.xlsx +0 -0
  41. data/spec/fixtures/PosadaSpreadsheet.xml +1259 -0
  42. data/spec/fixtures/ars0056_manifest.csv +1 -0
  43. data/spec/fixtures/ars0056_manifest.xml +9728 -0
  44. data/spec/fixtures/crowdsourcing_bridget_1.xlsx +0 -0
  45. data/spec/fixtures/crowdsourcing_bridget_1.xml +606 -0
  46. data/spec/fixtures/crowdsourcing_bridget_2.xlsx +0 -0
  47. data/spec/fixtures/crowdsourcing_bridget_2.xml +3433 -0
  48. data/spec/fixtures/filled_template.xlsx +0 -0
  49. data/spec/fixtures/invalid_crowdsourcing_bridget_1.xml +606 -0
  50. data/spec/fixtures/manifest_v0174.csv +34 -0
  51. data/spec/fixtures/manifest_v0174.xml +1026 -0
  52. data/spec/fixtures/roman_coins_mods_manifest.csv +176 -0
  53. data/spec/fixtures/roman_coins_mods_manifest.xml +3025 -0
  54. data/spec/fixtures/test:002.xml +63 -0
  55. data/spec/fixtures/test_002.csv +4 -0
  56. data/spec/fixtures/test_002.xlsx +0 -0
  57. data/spec/integration_tests/integration_spec.rb +40 -0
  58. data/spec/lib/modsulator_spec.rb +21 -0
  59. data/spec/spec_helper.rb +97 -0
  60. metadata +261 -0
@@ -0,0 +1,35 @@
1
+ <?xml version="1.0"?>
2
+ <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" targetNamespace="http://library.stanford.edu/xmlDocs" xmlns="http://library.stanford.edu/xmlDocs" elementFormDefault="qualified" attributeFormDefault="unqualified">
3
+ <!-- This XML Schema Definition defines the "xmlDocs" vocabulary that is produced by the MODSulator. -->
4
+ <!-- The main content of xmlDocs is full MODS 3.5 XML, as defined by the Library of Congress. Import their schema definition. -->
5
+ <xs:import namespace="http://www.loc.gov/mods/v3" schemaLocation="mods-3-5.xsd"/>
6
+
7
+ <xs:element name="xmlDocs">
8
+ <xs:complexType>
9
+ <xs:sequence minOccurs="0" maxOccurs="unbounded">
10
+ <xs:element name="xmlDoc"/>
11
+ </xs:sequence>
12
+ <xs:attributeGroup ref="docsAttributeGroup"/>
13
+ </xs:complexType>
14
+ </xs:element>
15
+
16
+ <xs:element name="xmlDoc">
17
+ <xs:complexType>
18
+ <xs:sequence>
19
+ <xs:element name="mods"/>
20
+ </xs:sequence>
21
+ <xs:attributeGroup ref="docAttributeGroup"/>
22
+ </xs:complexType>
23
+ </xs:element>
24
+
25
+ <xs:attributeGroup name="docsAttributeGroup">
26
+ <xs:attribute name="datetime" type="xs:string"/>
27
+ <xs:attribute name="sourceFile" type="xs:string"/>
28
+ </xs:attributeGroup>
29
+
30
+ <xs:attributeGroup name="docAttributeGroup">
31
+ <xs:attribute name="id" type="xs:string"/>
32
+ <xs:attribute name="objectId" type="xs:string"/>
33
+ </xs:attributeGroup>
34
+
35
+ </xs:schema>
@@ -0,0 +1,62 @@
1
+ # File "modsulator_sheet.rb" - a class to load and validate metadata spreadsheets (.xlsx or .csv) for input
2
+ # to Modsulator.
3
+
4
+ require 'json'
5
+ require 'roo'
6
+
7
# This class provides methods to parse Stanford's MODS spreadsheets into either an array of hashes, or a JSON string.
class ModsulatorSheet
  attr_reader :file, :filename

  # Creates a new ModsulatorSheet. When called with temporary files, the filename must be specified separately,
  # hence the second argument.
  #
  # @param [File] file       The input spreadsheet.
  # @param [String] filename The filename of the input spreadsheet. Only its extension and basename are used.
  def initialize(file, filename)
    @file = file
    @filename = filename
  end

  # Loads the input spreadsheet into an array of hashes. This spreadsheet should conform to the Stanford MODS
  # template format, which has three header rows. The first row is a kind of "super header", the second row is an
  # intermediate header and the third row is the header row that names the fields. The data rows are in the fourth
  # row onwards.
  #
  # @return [Array<Hash>] An array with one entry per data row in the spreadsheet. Each entry is a hash, indexed by
  #                       the spreadsheet headers. The result is memoized.
  def rows
    # Parse the spreadsheet, automatically finding the header row by looking for "druid" and "sourceId" and leave
    # the header row itself out of the resulting array (hence drop(1)). Everything preceding the header row is
    # discarded. Would like to use clean: true here, but the latest release of Roo 1.13.2 crashes. 2.0.0beta1 seems
    # to work though.
    @rows ||= spreadsheet.parse(header_search: ['druid', 'sourceId']).drop(1)
  end

  # Opens a spreadsheet based on its filename extension. The extension is matched case-insensitively, so
  # "MANIFEST.CSV" is handled the same way as "manifest.csv".
  #
  # @return [Roo::CSV, Roo::Excel, Roo::Excelx] A Roo object, whose type depends on the extension of the given
  #                                             filename. The result is memoized.
  # @raise [RuntimeError] If the filename extension is not .csv, .xls or .xlsx.
  def spreadsheet
    @spreadsheet ||= case File.extname(@filename).downcase
                     when '.csv'  then Roo::Spreadsheet.open(@file, extension: :csv)
                     when '.xls'  then Roo::Spreadsheet.open(@file, extension: :xls)
                     when '.xlsx' then Roo::Spreadsheet.open(@file, extension: :xlsx)
                     else fail "Unknown file type: #{@filename}"
                     end
  end

  # Get the headers used in the spreadsheet.
  #
  # @return [Array] The keys of the first data row, or an empty array when the spreadsheet has no data rows.
  def headers
    first_row = rows.first
    first_row ? first_row.keys : []
  end

  # Convert the loaded spreadsheet to a JSON string.
  #
  # Accepts (and forwards) the optional generator arguments that the JSON library passes when this object is
  # serialized as part of a larger structure, e.g. <tt>[sheet].to_json</tt>.
  #
  # @return [String] A JSON string with the keys "filename" (basename only) and "rows".
  def to_json(*args)
    { 'filename' => File.basename(filename), 'rows' => rows }.to_json(*args)
  end
end
@@ -0,0 +1,198 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <mods xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" version="3.5" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-5.xsd">
3
+ <titleInfo authority="[[ti1:authority]]" valueURI="[[ti1:valueURI]]">
4
+ <nonSort>[[ti1:nonSort]]</nonSort>
5
+ <title>[[ti1:title]]</title>
6
+ <subTitle>[[ti1:subTitle]]</subTitle>
7
+ <partNumber>[[ti1:partNumber]]</partNumber>
8
+ <partName>[[ti1:partName]]</partName>
9
+ </titleInfo>
10
+ <titleInfo type="[[ti2:type]]" authority="[[ti2:authority]]" valueURI="[[ti2:valueURI]]">
11
+ <nonSort>[[ti2:nonSort]]</nonSort>
12
+ <title>[[ti2:title]]</title>
13
+ <subTitle>[[ti2:subTitle]]</subTitle>
14
+ <partNumber>[[ti2:partNumber]]</partNumber>
15
+ <partName>[[ti2:partName]]</partName>
16
+ </titleInfo>
17
+ <titleInfo type="[[ti3:type]]" authority="[[ti3:authority]]" valueURI="[[ti3:valueURI]]">
18
+ <nonSort>[[ti3:nonSort]]</nonSort>
19
+ <title>[[ti3:title]]</title>
20
+ <subTitle>[[ti3:subTitle]]</subTitle>
21
+ <partNumber>[[ti3:partNumber]]</partNumber>
22
+ <partName>[[ti3:partName]]</partName>
23
+ </titleInfo>
24
+ <name type="[[na1:type]]" authority="[[na1:authority]]" valueURI="[[na1:valueURI]]">
25
+ <namePart>[[na1:namePart]]</namePart>
26
+ <role>
27
+ <roleTerm type="code" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleCode]]</roleTerm>
28
+ <roleTerm type="text" authority="[[ro1:authority]]" valueURI="[[ro1:valueURI]]">[[ro1:roleText]]</roleTerm>
29
+ </role>
30
+ </name>
31
+ <name type="[[na2:type]]" authority="[[na2:authority]]" valueURI="[[na2:valueURI]]">
32
+ <namePart>[[na2:namePart]]</namePart>
33
+ <role>
34
+ <roleTerm type="code" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleCode]]</roleTerm>
35
+ <roleTerm type="text" authority="[[ro2:authority]]" valueURI="[[ro2:valueURI]]">[[ro2:roleText]]</roleTerm>
36
+ </role>
37
+ </name>
38
+ <name type="[[na3:type]]" authority="[[na3:authority]]" valueURI="[[na3:valueURI]]">
39
+ <namePart>[[na3:namePart]]</namePart>
40
+ <role>
41
+ <roleTerm type="code" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleCode]]</roleTerm>
42
+ <roleTerm type="text" authority="[[ro3:authority]]" valueURI="[[ro3:valueURI]]">[[ro3:roleText]]</roleTerm>
43
+ </role>
44
+ </name>
45
+ <name type="[[na4:type]]" authority="[[na4:authority]]" valueURI="[[na4:valueURI]]">
46
+ <namePart>[[na4:namePart]]</namePart>
47
+ <role>
48
+ <roleTerm type="code" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleCode]]</roleTerm>
49
+ <roleTerm type="text" authority="[[ro4:authority]]" valueURI="[[ro4:valueURI]]">[[ro4:roleText]]</roleTerm>
50
+ </role>
51
+ </name>
52
+ <name type="[[na5:type]]" authority="[[na5:authority]]" valueURI="[[na5:valueURI]]">
53
+ <namePart>[[na5:namePart]]</namePart>
54
+ <role>
55
+ <roleTerm type="code" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleCode]]</roleTerm>
56
+ <roleTerm type="text" authority="[[ro5:authority]]" valueURI="[[ro5:valueURI]]">[[ro5:roleText]]</roleTerm>
57
+ </role>
58
+ </name>
59
+ <typeOfResource manuscript="[[ty1:manuscript]]">[[ty1:typeOfResource]]</typeOfResource>
60
+ <typeOfResource>[[ty2:typeOfResource]]</typeOfResource>
61
+ <typeOfResource>[[ty3:typeOfResource]]</typeOfResource>
62
+ <genre type="[[ge1:type]]" authority="[[ge1:authority]]" valueURI="[[ge1:valueURI]]">[[ge1:genre]]</genre>
63
+ <genre type="[[ge2:type]]" authority="[[ge2:authority]]" valueURI="[[ge2:valueURI]]">[[ge2:genre]]</genre>
64
+ <genre type="[[ge3:type]]" authority="[[ge3:authority]]" valueURI="[[ge3:valueURI]]">[[ge3:genre]]</genre>
65
+ <originInfo>
66
+ <place>
67
+ <placeTerm type="code" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeCode]]</placeTerm>
68
+ <placeTerm type="text" authority="[[pl:authority]]" valueURI="[[pl:valueURI]]">[[pl:placeText]]</placeTerm>
69
+ </place>
70
+ <publisher>[[or:publisher]]</publisher>
71
+ <dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateCreated]]</dateCreated>
72
+ <dateCreated keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateCreated2]]</dateCreated>
73
+ <dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="start">[[dt:dateIssued]]</dateIssued>
74
+ <dateIssued keyDate="yes" encoding="[[dt:encoding]]" qualifier="[[dt:qualifier]]" point="end">[[dt:dateIssued2]]</dateIssued>
75
+ </originInfo>
76
+ <language>
77
+ <languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="code">[[la1:code]]</languageTerm>
78
+ <languageTerm authority="[[la1:authority]]" valueURI="[[la1:valueURI]]" type="text">[[la1:text]]</languageTerm>
79
+ </language>
80
+ <language>
81
+ <languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="code">[[la2:code]]</languageTerm>
82
+ <languageTerm authority="[[la2:authority]]" valueURI="[[la2:valueURI]]" type="text">[[la2:text]]</languageTerm>
83
+ </language>
84
+ <language>
85
+ <languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="code">[[la3:code]]</languageTerm>
86
+ <languageTerm authority="[[la3:authority]]" valueURI="[[la3:valueURI]]" type="text">[[la3:text]]</languageTerm>
87
+ </language>
88
+ <physicalDescription>
89
+ <form>[[ph1:form]]</form>
90
+ <extent>[[ph1:extent]]</extent>
91
+ <digitalOrigin>[[ph1:digitalOrigin]]</digitalOrigin>
92
+ <note displayLabel="[[ph1:displayLabel]]">[[ph1:note]]</note>
93
+ </physicalDescription>
94
+ <abstract displayLabel="[[ab:displayLabel]]">[[ab:abstract]]</abstract>
95
+ <tableOfContents displayLabel="[[tc:displayLabel]]">[[tc:tableOfContents]]</tableOfContents>
96
+ <note type="[[no1:type]]" displayLabel="[[no1:displayLabel]]">[[no1:note]]</note>
97
+ <note type="[[no2:type]]" displayLabel="[[no2:displayLabel]]">[[no2:note]]</note>
98
+ <note type="[[no3:type]]" displayLabel="[[no3:displayLabel]]">[[no3:note]]</note>
99
+ <note type="[[no4:type]]" displayLabel="[[no4:displayLabel]]">[[no4:note]]</note>
100
+ <note type="[[no5:type]]" displayLabel="[[no5:displayLabel]]">[[no5:note]]</note>
101
+ <subject authority="[[sn1:authority]]">
102
+ <name type="[[sn1:p1:nameType]]" authority="[[sn1:p1:authority]]" valueURI="[[sn1:p1:valueURI]]">
103
+ <namePart>[[sn1:p1:name]]</namePart>
104
+ </name>
105
+ <titleInfo>
106
+ <title>[[sn1:p1:title]]</title>
107
+ </titleInfo>
108
+ <[[sn1:p2:type]] authority="[[sn1:p2:authority]]" valueURI="[[sn1:p2:valueURI]]">[[sn1:p2:value]]</[[sn1:p2:type]]>
109
+ <[[sn1:p3:type]] authority="[[sn1:p3:authority]]" valueURI="[[sn1:p3:valueURI]]">[[sn1:p3:value]]</[[sn1:p3:type]]>
110
+ </subject>
111
+ <subject authority="[[sn2:authority]]">
112
+ <name type="[[sn2:p1:nameType]]" authority="[[sn2:p1:authority]]" valueURI="[[sn2:p1:valueURI]]">
113
+ <namePart>[[sn2:p1:name]]</namePart>
114
+ </name>
115
+ <titleInfo>
116
+ <title>[[sn2:p1:title]]</title>
117
+ </titleInfo>
118
+ <[[sn2:p2:type]] authority="[[sn2:p2:authority]]" valueURI="[[sn2:p2:valueURI]]">[[sn2:p2:value]]</[[sn2:p2:type]]>
119
+ <[[sn2:p3:type]] authority="[[sn2:p3:authority]]" valueURI="[[sn2:p3:valueURI]]">[[sn2:p3:value]]</[[sn2:p3:type]]>
120
+ </subject>
121
+ <subject authority="[[sn3:authority]]">
122
+ <name type="[[sn3:p1:nameType]]" authority="[[sn3:p1:authority]]" valueURI="[[sn3:p1:valueURI]]">
123
+ <namePart>[[sn3:p1:name]]</namePart>
124
+ </name>
125
+ <titleInfo>
126
+ <title>[[sn3:p1:title]]</title>
127
+ </titleInfo>
128
+ <[[sn3:p2:type]] authority="[[sn3:p2:authority]]" valueURI="[[sn3:p2:valueURI]]">[[sn3:p2:value]]</[[sn3:p2:type]]>
129
+ <[[sn3:p3:type]] authority="[[sn3:p3:authority]]" valueURI="[[sn3:p3:valueURI]]">[[sn3:p3:value]]</[[sn3:p3:type]]>
130
+ </subject>
131
+ <subject authority="[[sn4:authority]]">
132
+ <name type="[[sn4:p1:nameType]]" authority="[[sn4:p1:authority]]" valueURI="[[sn4:p1:valueURI]]">
133
+ <namePart>[[sn4:p1:name]]</namePart>
134
+ </name>
135
+ <titleInfo>
136
+ <title>[[sn4:p1:title]]</title>
137
+ </titleInfo>
138
+ <[[sn4:p2:type]] authority="[[sn4:p2:authority]]" valueURI="[[sn4:p2:valueURI]]">[[sn4:p2:value]]</[[sn4:p2:type]]>
139
+ <[[sn4:p3:type]] authority="[[sn4:p3:authority]]" valueURI="[[sn4:p3:valueURI]]">[[sn4:p3:value]]</[[sn4:p3:type]]>
140
+ </subject>
141
+ <subject authority="[[sn5:authority]]">
142
+ <name type="[[sn5:p1:nameType]]" authority="[[sn5:p1:authority]]" valueURI="[[sn5:p1:valueURI]]">
143
+ <namePart>[[sn5:p1:name]]</namePart>
144
+ </name>
145
+ <titleInfo>
146
+ <title>[[sn5:p1:title]]</title>
147
+ </titleInfo>
148
+ <[[sn5:p2:type]] authority="[[sn5:p2:authority]]" valueURI="[[sn5:p2:valueURI]]">[[sn5:p2:value]]</[[sn5:p2:type]]>
149
+ <[[sn5:p3:type]] authority="[[sn5:p3:authority]]" valueURI="[[sn5:p3:valueURI]]">[[sn5:p3:value]]</[[sn5:p3:type]]>
150
+ </subject>
151
+ <subject authority="[[su1:authority]]">
152
+ <[[su1:p1:type]] authority="[[su1:p1:authority]]" valueURI="[[su1:p1:valueURI]]">[[su1:p1:value]]</[[su1:p1:type]]>
153
+ <[[su1:p2:type]] authority="[[su1:p2:authority]]" valueURI="[[su1:p2:valueURI]]">[[su1:p2:value]]</[[su1:p2:type]]>
154
+ <[[su1:p3:type]] authority="[[su1:p3:authority]]" valueURI="[[su1:p3:valueURI]]">[[su1:p3:value]]</[[su1:p3:type]]>
155
+ </subject>
156
+ <subject authority="[[su2:authority]]">
157
+ <[[su2:p1:type]] authority="[[su2:p1:authority]]" valueURI="[[su2:p1:valueURI]]">[[su2:p1:value]]</[[su2:p1:type]]>
158
+ <[[su2:p2:type]] authority="[[su2:p2:authority]]" valueURI="[[su2:p2:valueURI]]">[[su2:p2:value]]</[[su2:p2:type]]>
159
+ <[[su2:p3:type]] authority="[[su2:p3:authority]]" valueURI="[[su2:p3:valueURI]]">[[su2:p3:value]]</[[su2:p3:type]]>
160
+ </subject>
161
+ <subject authority="[[su3:authority]]">
162
+ <[[su3:p1:type]] authority="[[su3:p1:authority]]" valueURI="[[su3:p1:valueURI]]">[[su3:p1:value]]</[[su3:p1:type]]>
163
+ <[[su3:p2:type]] authority="[[su3:p2:authority]]" valueURI="[[su3:p2:valueURI]]">[[su3:p2:value]]</[[su3:p2:type]]>
164
+ <[[su3:p3:type]] authority="[[su3:p3:authority]]" valueURI="[[su3:p3:valueURI]]">[[su3:p3:value]]</[[su3:p3:type]]>
165
+ </subject>
166
+ <subject authority="[[su4:authority]]">
167
+ <[[su4:p1:type]] authority="[[su4:p1:authority]]" valueURI="[[su4:p1:valueURI]]">[[su4:p1:value]]</[[su4:p1:type]]>
168
+ <[[su4:p2:type]] authority="[[su4:p2:authority]]" valueURI="[[su4:p2:valueURI]]">[[su4:p2:value]]</[[su4:p2:type]]>
169
+ <[[su4:p3:type]] authority="[[su4:p3:authority]]" valueURI="[[su4:p3:valueURI]]">[[su4:p3:value]]</[[su4:p3:type]]>
170
+ </subject>
171
+ <subject authority="[[su5:authority]]">
172
+ <[[su5:p1:type]] authority="[[su5:p1:authority]]" valueURI="[[su5:p1:valueURI]]">[[su5:p1:value]]</[[su5:p1:type]]>
173
+ <[[su5:p2:type]] authority="[[su5:p2:authority]]" valueURI="[[su5:p2:valueURI]]">[[su5:p2:value]]</[[su5:p2:type]]>
174
+ <[[su5:p3:type]] authority="[[su5:p3:authority]]" valueURI="[[su5:p3:valueURI]]">[[su5:p3:value]]</[[su5:p3:type]]>
175
+ </subject>
176
+ <subject>
177
+ <cartographics>
178
+ <scale>[[sc1:scale]]</scale>
179
+ <projection>[[sc1:projection]]</projection>
180
+ <coordinates>[[sc1:coordinates]]</coordinates>
181
+ </cartographics>
182
+ </subject>
183
+ <identifier type="[[id1:type]]" displayLabel="[[id1:displayLabel]]">[[id1:identifier]]</identifier>
184
+ <identifier type="[[id2:type]]" displayLabel="[[id2:displayLabel]]">[[id2:identifier]]</identifier>
185
+ <identifier type="[[id3:type]]" displayLabel="[[id3:displayLabel]]">[[id3:identifier]]</identifier>
186
+ <identifier type="[[id4:type]]" displayLabel="[[id4:displayLabel]]">[[id4:identifier]]</identifier>
187
+ <identifier type="[[id5:type]]" displayLabel="[[id5:displayLabel]]">[[id5:identifier]]</identifier>
188
+ <location>
189
+ <physicalLocation type="repository" authority="[[lo:authority]]" valueURI="[[lo:valueURI]]">[[lo:repository]]</physicalLocation>
190
+ <shelfLocator>[[lo:callNumber]]</shelfLocator>
191
+ </location>
192
+ <relatedItem type="[[ri1:type]]">
193
+ <titleInfo>
194
+ <title>[[ri1:title]]</title>
195
+ </titleInfo>
196
+ <url>[[ri1:url]]</url>
197
+ </relatedItem>
198
+ </mods>
@@ -0,0 +1,218 @@
1
+ # File "normalizer.rb" - defines a class for normalizing MODS XML according to the Stanford guidelines.
2
+
3
+ require 'nokogiri'
4
+
5
# This class provides methods to normalize MODS XML according to the Stanford guidelines.
# @see https://consul.stanford.edu/display/chimera/MODS+validation+and+normalization Requirements (Stanford Consul page - requires login)
class Normalizer
  # Linefeed character entity reference
  LINEFEED = '&#10;'

  # Checks if a node has attributes that we make exceptions for. There are two such exceptions.
  #
  # * A "collection" attribute with the value "yes" <em>on a typeOfResource tag</em>.
  # * A "manuscript" attribute with the value "yes" <em>on a typeOfResource tag</em>.
  #
  # Nodes that fall under any of these exceptions should not be deleted, even if they have no content.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Boolean] true if the node contains any of the exceptional attributes, false otherwise
  #                   (including when the node is nil).
  def exceptional?(node)
    return false if node.nil?
    return false unless node.name == 'typeOfResource'

    # Note that according to the MODS schema, any other value than 'yes' for these attributes is invalid.
    node.attributes.any? do |key, value|
      %w[collection manuscript].include?(key) && value.to_s.downcase == 'yes'
    end
  end

  # Recursive helper method for {Normalizer#clean_linefeeds} to do string substitution.
  #
  # @param [Nokogiri::XML::Element] node An XML node
  # @return [String] A string composed of the entire contents of the given node, with substitutions made as
  #                  described for {#clean_linefeeds}.
  def substitute_linefeeds(node)
    linefeed = linefeed_text

    if node.text?
      # Normalize CRLF, LF, CR and the literal two-character sequence backslash-n, in that order.
      return node.content.gsub(/\r\n/, linefeed).gsub(/\n/, linefeed).gsub(/\r/, linefeed).gsub('\\n', linefeed)
    end

    # <br> contributes one linefeed and <p> two; every other tag contributes only the text of its descendants.
    prefix = case node.node_name
             when 'br' then linefeed
             when 'p'  then linefeed + linefeed
             else ''
             end

    prefix + node.children.map { |child| substitute_linefeeds(child) }.join
  end

  # Given the root of an XML document, replaces linefeed characters inside <tableOfContents>, <abstract> and <note> XML node by &#10;
  # \n, \r, <br> and <br/> are all replaced by a single &#10;
  # <p> is replaced by two &#10;
  # </p> is removed
  # \r\n is replaced by &#10;
  # Any tags not listed above are removed. MODS 3.5 does not allow for anything other than text inside these three nodes.
  #
  # @param [Nokogiri::XML::Element] node The root node of an XML document
  # @return [Void] This method doesn't return anything, but introduces UTF-8 linefeed characters in place, as described above.
  def clean_linefeeds(node)
    # The query has to be namespace-qualified when the document declares a namespace on its root.
    node_list = if node.namespace.nil?
                  node.xpath('//abstract | //tableOfContents | //note')
                else
                  node.xpath('//ns:abstract | //ns:tableOfContents | //ns:note', 'ns' => node.namespace.href)
                end

    node_list.each do |current_node|
      new_text = substitute_linefeeds(current_node)
      current_node.children.remove
      current_node.content = new_text
    end
  end

  # Cleans up the text of a node:
  #
  # * Removes extra whitespace at the beginning and end.
  # * Collapses any run of consecutive whitespace within the string into a single space.
  #
  # The input string is not modified; a new string is returned. (The in-place String#gsub! and String#strip!
  # return nil when they make no change, so they cannot be chained safely here.)
  #
  # @param [String] s The text of an XML node.
  # @return [String] The cleaned string, as described. Returns nil if the input is nil, or if the input is an empty string.
  def clean_text(s)
    return nil if s.nil? || s == ''

    s.gsub(/\s+/, ' ').strip
  end

  # Removes empty attributes from a given node and, recursively, from all of its descendants. An attribute is
  # considered empty when its value is empty or consists only of whitespace.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
  def remove_empty_attributes(node)
    children = node.children

    node.attributes.each do |key, value|
      node.remove_attribute(key) if value.to_s.strip.empty?
    end

    children.each { |child| remove_empty_attributes(child) }
  end

  # Removes empty nodes from an XML tree. See {#exceptional?} for nodes that are kept even if empty.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
  def remove_empty_nodes(node)
    children = node.children

    if node.text?
      # Non-blank text is kept as is; blank text is removed.
      return unless node.to_s.strip.empty?

      node.remove
    elsif children.length > 0
      # Depth-first, so that a parent whose children all turn out to be empty is itself removed below.
      children.each { |child| remove_empty_nodes(child) }
    end

    node.remove if !exceptional?(node) && node.children.length == 0
  end

  # Removes leading and trailing spaces from a node.
  #
  # @param [Nokogiri::XML::Element] node An XML node.
  # @return [Void] This method doesn't return anything, but modifies the entire XML tree starting at the
  #                the given node, removing leading and trailing spaces from all text. If the input is nil,
  #                an exception will be raised.
  def trim_text(node)
    children = node.children

    if node.text?
      # The stripped text replaces the whole content of the enclosing element.
      node.parent.content = node.text.strip
    else
      children.each { |child| trim_text(child) }
    end
  end

  # Removes the point attribute from single <dateCreated> and <dateIssued> elements.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def clean_date_attributes(root)
    # Find all the <dateCreated> and <dateIssued> elements that are NOT immediately followed by another element
    # with the same name.
    single_created = '//mods:originInfo/mods:dateCreated[1][not(following-sibling::*[1][self::mods:dateCreated])]'
    single_issued = '//mods:originInfo/mods:dateIssued[1][not(following-sibling::*[1][self::mods:dateIssued])]'
    query = single_created + ' | ' + single_issued

    root.xpath(query, 'mods' => 'http://www.loc.gov/mods/v3').each do |current_element|
      current_element.remove_attribute('point') if current_element.attributes.key?('point')
    end
  end

  # Sometimes there are spurious decimal digits within the date fields. This method removes any trailing decimal
  # point and the digits following it within <dateCreated> and <dateIssued>.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def clean_date_values(root)
    root.xpath('//mods:dateCreated | //mods:dateIssued', 'mods' => 'http://www.loc.gov/mods/v3').each do |current_node|
      current_node.content = current_node.content.sub(/(.*)\.\d+$/, '\1')
    end
  end

  # Normalizes the given XML document according to the Stanford guidelines.
  #
  # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
  # @return [Void] The given document is modified in place.
  def normalize_document(root)
    clean_linefeeds(root) # Do this before deleting <br> and <p> with remove_empty_nodes()
    remove_empty_attributes(root)
    remove_empty_nodes(root)
    trim_text(root)
    clean_date_attributes(root)
    clean_date_values(root)
  end

  # Normalizes the given XML document string according to the Stanford guidelines.
  #
  # @param [String] xml_string An XML document
  # @return [String] The XML string, with normalizations applied.
  def normalize_xml_string(xml_string)
    doc = Nokogiri::XML(xml_string)
    normalize_document(doc.root)
    doc.to_s
  end

  private

  # The text that Nokogiri produces when it decodes the {LINEFEED} entity reference.
  #
  # If we substitute in '&#10;' by itself, Nokogiri interprets that and then prints '&amp;#10;' when printing the
  # document later. Decoding the entity through the HTML parser is an ugly way to add linefeed characters in a way
  # that we at least get well-formatted output in the end. The decoded value is computed once and reused, because
  # {#substitute_linefeeds} needs it for every node it visits.
  #
  # @return [String] The decoded linefeed text.
  def linefeed_text
    @linefeed_text ||= Nokogiri::HTML(LINEFEED).text
  end
end
218
+