text-checkm 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ 2.7.0
@@ -0,0 +1,7 @@
1
+ require 'simplecov-rcov'
2
+
3
+ SimpleCov.start do # begin coverage tracking, configured by the block below
4
+ coverage_dir 'tmp/reports' # write coverage reports under tmp/reports
5
+ formatter SimpleCov::Formatter::RcovFormatter # rcov-style report from the simplecov-rcov gem required above
6
+ minimum_coverage 100 # require 100% coverage; SimpleCov fails the run below this threshold
7
+ end
@@ -0,0 +1,7 @@
1
+ # 0.7 (22 July 2020)
2
+
3
+ - fork from [ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827)
4
+ (last MIT-licensed version)
5
+ - update for Ruby 2.7.0
6
+ - rename module from `Checkm` to `Text::Checkm` to avoid name collisions
7
+ - bump version to 0.7 to match [spec](SPEC.txt) version
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org' # fetch gems over HTTPS so downloads cannot be tampered with in transit
2
+ gemspec # take the dependency list from this project's gemspec
@@ -0,0 +1,98 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ text-checkm (0.7)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ ast (2.4.1)
10
+ builder (3.2.4)
11
+ bundle-audit (0.1.0)
12
+ bundler-audit
13
+ bundler-audit (0.7.0.1)
14
+ bundler (>= 1.2.0, < 3)
15
+ thor (>= 0.18, < 2)
16
+ ci_reporter (2.0.0)
17
+ builder (>= 2.1.2)
18
+ ci_reporter_rspec (1.0.0)
19
+ ci_reporter (~> 2.0)
20
+ rspec (>= 2.14, < 4)
21
+ colorize (0.8.1)
22
+ diff-lcs (1.4.4)
23
+ docile (1.3.2)
24
+ ffi (1.13.1)
25
+ io-console (0.5.6)
26
+ irb (1.2.4)
27
+ reline (>= 0.0.1)
28
+ json (2.3.1)
29
+ listen (3.1.5)
30
+ rb-fsevent (~> 0.9, >= 0.9.4)
31
+ rb-inotify (~> 0.9, >= 0.9.7)
32
+ ruby_dep (~> 1.2)
33
+ parallel (1.19.2)
34
+ parser (2.7.1.4)
35
+ ast (~> 2.4.1)
36
+ rainbow (3.0.0)
37
+ rake (13.0.1)
38
+ rb-fsevent (0.10.4)
39
+ rb-inotify (0.10.1)
40
+ ffi (~> 1.0)
41
+ regexp_parser (1.7.1)
42
+ reline (0.1.4)
43
+ io-console (~> 0.5)
44
+ rexml (3.2.4)
45
+ rspec (3.9.0)
46
+ rspec-core (~> 3.9.0)
47
+ rspec-expectations (~> 3.9.0)
48
+ rspec-mocks (~> 3.9.0)
49
+ rspec-core (3.9.2)
50
+ rspec-support (~> 3.9.3)
51
+ rspec-expectations (3.9.2)
52
+ diff-lcs (>= 1.2.0, < 2.0)
53
+ rspec-support (~> 3.9.0)
54
+ rspec-mocks (3.9.1)
55
+ diff-lcs (>= 1.2.0, < 2.0)
56
+ rspec-support (~> 3.9.0)
57
+ rspec-support (3.9.3)
58
+ rubocop (0.86.0)
59
+ parallel (~> 1.10)
60
+ parser (>= 2.7.0.1)
61
+ rainbow (>= 2.2.2, < 4.0)
62
+ regexp_parser (>= 1.7)
63
+ rexml
64
+ rubocop-ast (>= 0.0.3, < 1.0)
65
+ ruby-progressbar (~> 1.7)
66
+ unicode-display_width (>= 1.4.0, < 2.0)
67
+ rubocop-ast (0.2.0)
68
+ parser (>= 2.7.0.1)
69
+ ruby-progressbar (1.10.1)
70
+ ruby_dep (1.5.0)
71
+ simplecov (0.16.1)
72
+ docile (~> 1.1)
73
+ json (>= 1.8, < 3)
74
+ simplecov-html (~> 0.10.0)
75
+ simplecov-html (0.10.2)
76
+ simplecov-rcov (0.2.3)
77
+ simplecov (>= 0.4.1)
78
+ thor (1.0.1)
79
+ unicode-display_width (1.7.0)
80
+
81
+ PLATFORMS
82
+ ruby
83
+
84
+ DEPENDENCIES
85
+ bundle-audit
86
+ ci_reporter_rspec
87
+ colorize
88
+ irb
89
+ listen (>= 3.0.5, < 3.2)
90
+ rake (>= 13.0)
91
+ rspec-support
92
+ rubocop (= 0.86)
93
+ simplecov (~> 0.16.1)
94
+ simplecov-rcov
95
+ text-checkm!
96
+
97
+ BUNDLED WITH
98
+ 2.1.2
@@ -0,0 +1,23 @@
1
+ # The MIT License (MIT)
2
+
3
+ Copyright © 2010 Chris Beer
4
+
5
+ Copyright © 2020 The Regents of the University of California
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a
8
+ copy of this software and associated documentation files (the “Software”),
9
+ to deal in the Software without restriction, including without limitation
10
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
+ and/or sell copies of the Software, and to permit persons to whom the
12
+ Software is furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
+ DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,23 @@
1
+ # Text::Checkm
2
+
3
+ Ruby implementation of
4
+ [Checkm](https://confluence.ucop.edu/display/Curation/Checkm), a
5
+ general-purpose text-based file manifest format developed at the
6
+ [California Digital Library](https://cdlib.org/) by John Kunze, Stephen
7
+ Abrams, and David Loy.
8
+
9
+ (See [SPEC.txt](SPEC.txt) in this repository for details.)
10
+
11
+ ## Copyright
12
+
13
+ Based on [checkm](https://github.com/ruby-microservices/checkm/), copyright
14
+ © 2010 Chris Beer. (For compliance with [UC policies on the use and
15
+ creation of open source
16
+ software](https://security.ucop.edu/resources/open-source-software-licensing.html),
17
+ this library is based on
18
+ [ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827),
19
+ the last MIT-licensed revision.)
20
+
21
+ Subsequent work © 2020 the Regents of the University of California. See
22
+ [LICENSE.md](LICENSE.md) for more details.
23
+
@@ -0,0 +1,36 @@
1
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path('Gemfile', __dir__) # default to the Gemfile next to this file unless the caller already set one
2
+ require 'bundler/setup' # Set up gems listed in the Gemfile.
3
+
4
+ # ------------------------------------------------------------
5
+ # Application code
6
+
7
+ File.expand_path('lib', __dir__).tap do |lib| # absolute path of the lib/ directory next to this file
8
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) # prepend it once so it is searched first; skip if already present
9
+ end
10
+
11
+ # ------------------------------------------------------------
12
+ # RSpec
13
+
14
+ require 'rspec/core/rake_task'
15
+ require 'ci/reporter/rake/rspec'
16
+
17
+ ENV['CI_REPORTS'] ||= File.expand_path('artifacts', __dir__) # default to artifacts/ next to this file unless already set; presumably read by ci_reporter (required above) — confirm
18
+
19
+ namespace :spec do # groups the RSpec tasks under the spec: prefix
20
+ desc 'Run all tests'
21
+ RSpec::Core::RakeTask.new(:all) do |task| # defines the spec:all task
22
+ task.rspec_opts = %w[--color --format documentation --order default] # NOTE(review): 'default' looks like an RSpec 2-era ordering name; confirm RSpec 3.9 honours it (vs. 'defined')
23
+ task.pattern = 'spec/**/*_spec.rb' # every *_spec.rb file at any depth under spec/
24
+ end
25
+ end
26
+
27
+ desc 'Run all tests'
28
+ task spec: ['spec:all'] # plain `rake spec` has no actions of its own; it just depends on spec:all
29
+
30
+ # ------------------------------------------------------------
31
+ # Custom tasks
32
+
33
+ desc 'Run tests, check test coverage, check code style'
34
+ task default: %i[coverage rubocop bundle:audit] # NOTE(review): none of these three tasks is defined in this file — presumably loaded from elsewhere (e.g. rakelib/*.rake); confirm
35
+
36
+ # TODO: YARD
@@ -0,0 +1,468 @@
1
+ ---------------------------------------------------------------------------
2
+
3
+ NOTE: This is an unofficial copy of the Checkm 0.7 specification,
4
+ adapted from a diff between the 0.3 and 0.7 specifications captured by
5
+ the Internet Archive's Wayback Machine on 23 October 2015 and retrieved
6
+ on 22 July 2020. It is provided here for reference only, and should not
7
+ be considered normative. (D. Moles, 2020-07-22)
8
+
9
+ ---------------------------------------------------------------------------
10
+
11
+ Repository Working Group J. Kunze
12
+ S. Abrams
13
+ D. Loy
14
+ California Digital Library
15
+
16
+ June 11, 2010
17
+
18
+ Checkm: a checksum-based manifest format (v0.7)
19
+
20
+
21
+ Abstract
22
+
23
+ Checkm is a general-purpose text-based file manifest format. Each
24
+ line of a Checkm manifest is a set of '|'-separated tokens, the first
25
+ of which identifies the corresponding digital content by filename or
26
+ URL. Other tokens identify digest algorithm, checksum, content
27
+ length, and modification time. Tokens may be left unspecified with
28
+ empty fields or by ending the line early, the degenerate case being a
29
+ simple file list. It is up to tools that use the Checkm format to
30
+ specify any further restrictions on tokens (e.g., allowed defaults
31
+ and digest algorithms) and on overall manifest completeness and
32
+ coherence. A structured comment mechanism permits a way to document
33
+ extensions and restrictions. Checkm is designed to support tools that
34
+ verify the bit-level integrity of groups of files in support of such
35
+ things as content fixity, replication, import, and export. A manifest
36
+ may be single-level or multi-level (hierarchical), the latter being
37
+ useful, for example, in harvesting material from very large web sites
38
+ (cf. sitemaps).
39
+
40
+
41
+ 1. Checkm overview
42
+
43
+ Checkm (pronounced "check 'em") is a simple text-based manifest
44
+ format for digital content. A manifest is a set of lines, each of
45
+ which describes a unit of content via up to six '|'-separated
46
+ tokens. The meaning of a token is given by its position within the
47
+ line. For example, the first three tokens give the name of the
48
+ content, a checksum algorithm, and a digest (checksum) computed using
49
+ that algorithm, respectively. Here's a manifest identifying two files
50
+ with MD5 checksums (not recommended for serious use but short enough
51
+ to fit in these examples).
52
+
53
+ #%checkm_0.7
54
+ # My first manifest. Two files total.
55
+ # Filename |Algorithm| Digest
56
+ book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
57
+ images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
58
+
59
+ Checkm is purely concerned with format and not with such things as
60
+ completeness and fitness for a given application. It defines the
61
+ meanings of the six tokens but does not mandate their use. For
62
+ example, a file package transfer tool could require use of four
63
+ tokens, but another tool designed for fixity checking might only
64
+ require two tokens. The next example is a bare-bones manifest in
65
+ which all but the first token have been dropped, in other words, it's
66
+ just a list of filenames or URLs, one per line. This is a useful
67
+ degenerate case when only a list of named units of content is needed.
68
+
69
+ #%checkm_0.7
70
+ # My second manifest. Just a list of files.
71
+ # Filename (no other tokens given)
72
+ http://example.org/i/chap9.xml
73
+ http://example.org/i/chap9fig2.png
74
+
75
+ To leave tokens unspecified that would occur in the middle rather
76
+ than at the end of a line, leave the corresponding fields empty. A
77
+ field is considered empty if the line terminates before it is reached
78
+ or if it consists only of linear whitespace, namely, zero or more
79
+ SPACE (hex 20) or TAB (hex 09) characters. For example, a package
80
+ transfer application that also renames files might use the following
81
+ manifest.
82
+
83
+ #%checkm_0.7
84
+ # My third manifest.
85
+ # Filename and Target specified, not Alg, Digest, Length, or ModTime
86
+ http://example.org/i/chap9.xml ||||| book/Chapter9.xml
87
+ http://example.org/i/chap9fig2.png ||||| images/r862.png
88
+
89
+
90
+ Each non-comment line can contain up to six tokens, and has the form,
91
+
92
+ [@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
93
+
94
+ where "[@]" indicates an optional '@' that causes the identified
95
+ content to be "included" as a manifest extension. In principle there
96
+ is no upper or lower limit on the number of lines in a Checkm
97
+ manifest, however, practical considerations may call for extending a
98
+ single-level manifest to a multi-level manifest.
99
+
100
+
101
+ 2. Multi-level manifests
102
+
103
+ If supported, a multi-level manifest permits one large manifest to be
104
+ spread over a number of smaller manifests. To trigger this, the
105
+ SourceFileOrURL token that begins a line is preceded by a literal
106
+ '@'. It invokes a simple inclusion mechanism indicating that the
107
+ identified content is also in Checkm format and extends the current
108
+ manifest; this is similar to mainstream sitemap extension mechanisms
109
+ (cf. [SITEMAPS]). A tool can be said to support only single-level
110
+ Checkm if it does not support multi-level manifests.
111
+
112
+ Included manifests may themselves recursively include other
113
+ manifests. There is no limit either to the number of inclusions or to
114
+ the depth of a multi-level manifest. Cycles in the inclusion graph
115
+ are generally considered to be in poor taste.
116
+
117
+
118
+ 3. Checkm lines and tokens
119
+
120
+ Manifest lines end with either LF (hex 0a) or CRLF (hex 0d0a). Blank
121
+ lines are ignored. Lines that begin with '#' are considered
122
+ "comments" that are to be ignored by processors except for those
123
+ implementing Checkm extensions (described later).
124
+
125
+ Checkm tokens on a given line all relate to the unit of content or to
126
+ the extended functionality identified by the first token on the line.
127
+ A unit of content is a contiguous sequence of octets (for most
128
+ purposes this is a "file") identified by a filename or URL.
129
+
130
+ Tokens consist of UTF-8 characters [RFC3629] separated by a '|'
131
+ character (hex 7c). Any linear whitespace found at the start or end
132
+ of a token is ignored. Any characters not allowed in a token or in a
133
+ URL, such as '|' or whitespace, may be represented using URL percent-
134
+ encoding [RFC3986].
135
+
136
+ Tokens may be left unspecified by simply dropping them from the end
137
+ of the line or by leaving the field empty (zero or more linear
138
+ whitespace characters). Checkm is silent about which tokens are
139
+ required or prohibited and what defaults may be in effect. Checkm is
140
+ also silent about manifest completeness (which units of content must
141
+ be included) and hyper-specification (whether one unit of content can
142
+ or must have more than one line describing it, e.g., resulting from
143
+ two digest algorithms).
144
+
145
+
146
+ 4. Content lines
147
+
148
+ The first six tokens on a non-comment line look like this
149
+
150
+ [@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
151
+ TOKEN NUMBER: 1 2 3 4 5 6
152
+
153
+ The token's numbered position determines its meaning, as explained in
154
+ the correspondingly numbered subsections below.
155
+
156
+ Any extra fields at positions 7 and higher are considered to be Checkm
157
+ extensions.
158
+
159
+ 4.1. [@]SourceFileOrURL: content identifier
160
+
161
+ The SourceFileOrURL token identifies digital content, and may be
162
+ given as '-' to indicate that the content may be found on the
163
+ equivalent of Unix "stdin". This token may be a URL or a relative or
164
+ absolute filename. To prevent interpretation of a relative pathname
165
+ that begins with '#' or '@', one can insert "./" in front of the
166
+ name. Whether this token is a filename or a URL, any characters not
167
+ allowed in a URL must be represented using URL percent-encoding
168
+ [RFC3986].
169
+
170
+ If any SourceFileOrURL token in a manifest is preceded by the
171
+ optional '@', the line amounts to an "include" statement and the
172
+ manifest is considered to be "multi-level". Other tokens on that line
173
+ still relate to the content but the "included" content itself is
174
+ considered to be an extension of the current manifest. For example, a
175
+ multi-level Checkm manifest totaling 4 million lines could be
176
+ represented by a 2000-line manifest, each line of which references a
177
+ 2000-line single-level manifest.
178
+
179
+ If none of the lines in a manifest is preceded by '@', the manifest
180
+ is considered to be "single-level". It is permissible for a tool that
181
+ conforms to Checkm to declare support for only single-level
182
+ manifests.
183
+
184
+ 4.2. Alg: algorithm
185
+
186
+ Alg is either the literal string "dir" (designating a directory), a
187
+ string specifying a cryptographic checksum algorithm, or empty to
188
+ leave it unspecified. The special case of "dir" is useful for listing
189
+ an empty directory, which has neither a fixed octetstream over which
190
+ to compute a digest nor a contained filename to imply the directory's
191
+ existence. For example,
192
+
193
+
194
+ #%checkm_0.7
195
+ # My fourth manifest. Two files and a directory.
196
+ # Filename |Algorithm| Digest
197
+ book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
198
+ icons/ | dir
199
+ images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
200
+
201
+ Implementors of tools that use Checkm are strongly encouraged to
202
+ support at least two widely implemented checksum algorithms:
203
+
204
+ "md5" [RFC1321]
205
+
206
+ "sha1" [RFC3174]
207
+
208
+ "sha256" [FIPS180-2]
209
+
210
+ When using other algorithms, the name of the algorithm should be
211
+ normalized for use in the manifest's filename, by lowercasing the
212
+ common name of the algorithm, and removing all non-alphanumeric
213
+ characters.
214
+
215
+ 4.3. Digest: computed checksum
216
+
217
+ Digest is a string representing the checksum calculated according to
218
+ the Alg algorithm over the content, or empty to leave it unspecified.
219
+
220
+
221
+ 4.4. Length of content
222
+
223
+ Length is the number (base 10) of octets in the identified content,
224
+ or empty to leave it unspecified. It is typically useful in
225
+ providing a rapid test for altered content and for estimating file
226
+ transfer times.
227
+
228
+
229
+ 4.5. ModTime: time last modified
230
+
231
+ ModTime is a lexically sort-friendly date such as [TEMPER]
232
+ ('YYYYMMDDhhmmss') or [W3CDTF] (YYYY-MM-DDThh:mm:ss), or empty to
233
+ leave it unspecified. It should represent the UTC time when the
234
+ content was last modified and is typically useful in incremental or
235
+ priority harvesting of content (cf. [OAI] and [SITEMAPS]).
236
+
237
+
238
+ 4.6. TargetFileOrURL: other location
239
+
240
+ TargetFileOrURL is a secondary location for the content that
241
+ applications would use as necessary. For instance, a transfer tool
242
+ that also renames files could use this token as the destination name.
243
+
244
+
245
+ 5. Extensions: structured comment lines
246
+
247
+ Comment lines that begin with a token of the form '#%_symbol_' are
248
+ special structured comment lines that usually indicate specific
249
+ optional functionality that extends the core Checkm specification.
250
+ Matching against a _symbol_ is case-insensitive (e.g., #%foo is
251
+ equivalent to #%FOO). The rest of a structured comment line is
252
+ tokenized in the same way as non-comment lines. The structured
253
+ comment symbols that follow are currently reserved.
254
+
255
+ 5.1. Optional extension: #%checkm_0.7
256
+
257
+ It is highly recommended that the first line of a Checkm manifest be
258
+ of the form
259
+
260
+ #%checkm_M.N
261
+
262
+ where M and N identify the major and minor version numbers. The current
263
+ version is 0.7.
264
+
265
+ 5.2. Optional extension: #%eof
266
+
267
+ A line consisting of
268
+
269
+ #%eof
270
+
271
+ is reserved as an explicit end of manifest file marker. It can be
272
+ used to distinguish manifests that might be empty because of an error
273
+ from those that are deliberately empty.
274
+
275
+ 5.3. Optional extension: #%fields
276
+
277
+ To precisely identify all fields in a given Checkm manifest, before
278
+ any non-comment lines include a line of the form
279
+
280
+ #%fields | Field_Id | ...
281
+
282
+ containing one or more instances of a Field_Id, each identifying the
283
+ corresponding manifest field. A Field_Id may be a simple string
284
+ suggestive of the respective field's function or it may be a globally
285
+ unique URL. If a Field_Id URL is resolvable, it should document any
286
+ restriction or extension in effect. The #%fields structured comment
287
+ may form part of a #%profile definition.
288
+
289
+ Semantics of the basic fields 1 through 6 may not be altered except
290
+ to narrow their meanings, such as to restrict the values of field 3
291
+ to one particular algorithm. Semantics of the extension fields (7 and
292
+ higher) may be defined at will.
293
+
294
+
295
+ 5.4. Optional extension: #%prefix
296
+
297
+ To define an abbreviation for a long URL in a manner reminiscent of
298
+ Turtle [Turtle], before any use of the abbreviation include a line of
299
+ the form
300
+
301
+ #%prefix | Abbrev: | URL
302
+
303
+ where Abbrev (which may be empty) is a "prefix" that will stand in
304
+ for the given URL when it is used in other structured comments (and not
305
+ in non-comment lines). The #%prefix structured comment may form part
306
+ of a #%profile definition.
307
+
308
+ 5.5. Optional extension: #%profile
309
+
310
+ To declare that a Checkm manifest conforms to a specific profile,
311
+ before any non-comment lines include a line of the form
312
+
313
+ #%profile | ProfileURL
314
+
315
+ where ProfileURL is a unique identifier for a specific profile. If
316
+ the URL is resolvable, it should document any restrictions and
317
+ extensions. Some example profiles appear in an appendix.
318
+
319
+
320
+ 6. Conformance Terminology
321
+
322
+ A tool that uses the Checkm format should document which parts of the
323
+ format it supports. For example, documentation should state what
324
+ extensions, if any, are in use. One common restriction could be
325
+ expressed something like,
326
+
327
+ "... which must be a single-level, 3-column Checkm manifest with
328
+ relative filenames."
329
+
330
+ This terminology suggests that, for this particular tool, an
331
+ exception or undefined behavior is the likely result of supplying a
332
+ Checkm manifest that has any line beginning with '@', a URL, or an
333
+ absolute pathname, or that has any line with more than or fewer than
334
+ 3 tokens.
335
+
336
+
337
+ 7. Example two-level Checkm manifest
338
+
339
+ #%checkm_0.7
340
+ # A two-level manifest.
341
+
342
+ #Filename |Alg |Checksum |Length
343
+ foo.bar |sha1|2eacd0da7aa89b094f5121eb2901bf4de2219ef1 | 366
344
+ foo.bar |md5 |3e83471320227c0797a0c251f28db0c5 | 366
345
+ # This next line "includes" the manifest in file "myfirst".
346
+ @myfirst |md5 |6ab96c8930621d50cef31da4df6d9ed8 | 264
347
+
348
+ where the included file "myfirst" contains 264 octets and lists two
349
+ files:
350
+
351
+ #%checkm_0.7
352
+ # My first manifest. Two files total.
353
+ # Filename |Algorithm| Digest
354
+ book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
355
+ images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
356
+
357
+ 8. References
358
+
359
+ [FIPS180-2]
360
+ NIST, "FIPS 180-2: Secure Hash Standard (SHS)",
361
+ February 2004, <http://csrc.nist.gov/publications/fips/
362
+ fips180-2/fips180-2withchangenotice.pdf>.
363
+
364
+ [OAI] Lagoze, C. and H. Van de Sompel, "Open Archives Initiative
365
+ Protocol for Metadata Harvesting", June 2002, <http://
366
+ www.openarchives.org/OAI/openarchivesprotocol.html>.
367
+
368
+ [RFC1321] Rivest, R., "The MD5 Message-Digest Algorithm", RFC 1321,
369
+ April 1992.
370
+
371
+ [RFC3174] Eastlake, D. and P. Jones, "US Secure Hash Algorithm 1
372
+ (SHA1)", RFC 3174, September 2001.
373
+
374
+ [RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO
375
+ 10646", STD 63, RFC 3629, November 2003.
376
+
377
+ [RFC3986] Berners-Lee, T., Fielding, R., and L. Masinter, "Uniform
378
+ Resource Identifier (URI): Generic Syntax", STD 66,
379
+ RFC 3986, January 2005.
380
+
381
+ [SITEMAPS]
382
+ sitemaps.org, "Sitemaps XML format", February 2008,
383
+ <http://sitemaps.org/protocol.php>.
384
+
385
+ [TEMPER] Blair, C. and J. Kunze, "Temporal Enumerated Ranges",
386
+ August 2007,
387
+ <http://www.cdlib.org/inside/diglib/ark/temperspec.pdf>.
388
+
389
+ [Turtle] Beckett, D. and T. Berners-Lee, "Turtle - Terse RDF Triple
390
+ Language", January 2008,
391
+ <http://www.w3.org/TeamSubmission/turtle/>.
392
+
393
+ [W3CDTF] Wolf, M. and C. Wicksteed, "Date and Time Formats (W3C
394
+ profile of ISO8601)",
395
+ <http://www.w3.org/TR/NOTE-datetime>.
396
+
397
+
398
+ Appendix A. Example profiles
399
+
400
+ The most important attribute of a Checkm profile is a globally unique
401
+ identifier, such as,
402
+
403
+ http://merritt.cdlib.org/registry/mrt-ingest-manifest
404
+
405
+ which applications can use for conditional processing. If, in
406
+ addition, this identifier is resolvable, it should return a text file
407
+ with the same format as a Checkm manifest but with no non-comment
408
+ lines. This file formally documents any particular ways in which the
409
+ first six Checkm fields may be restricted and what any additional
410
+ fields mean. As an example, the profile URL above corresponds to
411
+
412
+ #%checkm_0.7
413
+ #
414
+ # This is a profile definition for a "Merritt ingest" manifest.
415
+ #
416
+ #%profile | http://merritt.cdlib.org/registry/mrt-ingest-manifest
417
+ #%prefix | mrt: | http://merritt.cdlib.org/terms#
418
+ #%prefix | nfo: |
419
+ http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
420
+ #%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
421
+ nfo:fileSize | nfo:fileLastModified | nfo:fileName |
422
+ mrt:mimeType
423
+
424
+ In this example and the next, indented lines artificially occur where
425
+ long lines have been wrapped for display purposes. The profile below
426
+ uses Checkm inclusion lines as a way to describe "digital objects".
427
+
428
+ #%checkm_0.7
429
+ #
430
+ # This is a profile definition for a "Merritt batch" manifest.
431
+ # It is meant to be used with Checkm "inclusion" lines, as in
432
+ #
433
+ # @url | [alg] | [value] | [length] | | filename | [primary] [ | local ]
434
+ #
435
+ #%profile | http://merritt.cdlib.org/registry/mrt-batch-manifest
436
+ #%prefix | mrt: | http://merritt.cdlib.org/terms#
437
+ #%prefix | nfo: |
438
+ http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
439
+ #%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
440
+ nfo:fileSize | nfo:fileLastModified | nfo:fileName |
441
+ mrt:primaryIdentifier | mrt:localIdentifier
442
+
443
+
444
+ Authors' Addresses
445
+
446
+ John A. Kunze
447
+ California Digital Library
448
+ 415 20th St, 4th Floor
449
+ Oakland, CA 94612
450
+ US
451
+
452
+ Email: jak@ucop.edu
453
+
454
+ Stephen Abrams
455
+ California Digital Library
456
+ 415 20th St, 4th Floor
457
+ Oakland, CA 94612
458
+ US
459
+
460
+ Email: stephen.abrams@ucop.edu
461
+
462
+ David Loy
463
+ California Digital Library
464
+ 415 20th St, 4th Floor
465
+ Oakland, CA 94612
466
+ US
467
+
468
+ Email: david.loy@ucop.edu