text-checkm 0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ 2.7.0
@@ -0,0 +1,7 @@
1
+ require 'simplecov-rcov'
2
+
3
+ SimpleCov.start do
4
+ coverage_dir 'tmp/reports'
5
+ formatter SimpleCov::Formatter::RcovFormatter
6
+ minimum_coverage 100
7
+ end
@@ -0,0 +1,7 @@
1
+ # 0.7 (22 July 2020)
2
+
3
+ - fork from [ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827)
4
+ (last MIT-licensed version)
5
+ - update for Ruby 2.7.0
6
+ - rename module from `Checkm` to `Text::Checkm` to avoid name collisions
7
+ - bump version to 0.7 to match [spec](SPEC.txt) version
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'http://rubygems.org'
2
+ gemspec
@@ -0,0 +1,98 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ text-checkm (0.7)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ ast (2.4.1)
10
+ builder (3.2.4)
11
+ bundle-audit (0.1.0)
12
+ bundler-audit
13
+ bundler-audit (0.7.0.1)
14
+ bundler (>= 1.2.0, < 3)
15
+ thor (>= 0.18, < 2)
16
+ ci_reporter (2.0.0)
17
+ builder (>= 2.1.2)
18
+ ci_reporter_rspec (1.0.0)
19
+ ci_reporter (~> 2.0)
20
+ rspec (>= 2.14, < 4)
21
+ colorize (0.8.1)
22
+ diff-lcs (1.4.4)
23
+ docile (1.3.2)
24
+ ffi (1.13.1)
25
+ io-console (0.5.6)
26
+ irb (1.2.4)
27
+ reline (>= 0.0.1)
28
+ json (2.3.1)
29
+ listen (3.1.5)
30
+ rb-fsevent (~> 0.9, >= 0.9.4)
31
+ rb-inotify (~> 0.9, >= 0.9.7)
32
+ ruby_dep (~> 1.2)
33
+ parallel (1.19.2)
34
+ parser (2.7.1.4)
35
+ ast (~> 2.4.1)
36
+ rainbow (3.0.0)
37
+ rake (13.0.1)
38
+ rb-fsevent (0.10.4)
39
+ rb-inotify (0.10.1)
40
+ ffi (~> 1.0)
41
+ regexp_parser (1.7.1)
42
+ reline (0.1.4)
43
+ io-console (~> 0.5)
44
+ rexml (3.2.4)
45
+ rspec (3.9.0)
46
+ rspec-core (~> 3.9.0)
47
+ rspec-expectations (~> 3.9.0)
48
+ rspec-mocks (~> 3.9.0)
49
+ rspec-core (3.9.2)
50
+ rspec-support (~> 3.9.3)
51
+ rspec-expectations (3.9.2)
52
+ diff-lcs (>= 1.2.0, < 2.0)
53
+ rspec-support (~> 3.9.0)
54
+ rspec-mocks (3.9.1)
55
+ diff-lcs (>= 1.2.0, < 2.0)
56
+ rspec-support (~> 3.9.0)
57
+ rspec-support (3.9.3)
58
+ rubocop (0.86.0)
59
+ parallel (~> 1.10)
60
+ parser (>= 2.7.0.1)
61
+ rainbow (>= 2.2.2, < 4.0)
62
+ regexp_parser (>= 1.7)
63
+ rexml
64
+ rubocop-ast (>= 0.0.3, < 1.0)
65
+ ruby-progressbar (~> 1.7)
66
+ unicode-display_width (>= 1.4.0, < 2.0)
67
+ rubocop-ast (0.2.0)
68
+ parser (>= 2.7.0.1)
69
+ ruby-progressbar (1.10.1)
70
+ ruby_dep (1.5.0)
71
+ simplecov (0.16.1)
72
+ docile (~> 1.1)
73
+ json (>= 1.8, < 3)
74
+ simplecov-html (~> 0.10.0)
75
+ simplecov-html (0.10.2)
76
+ simplecov-rcov (0.2.3)
77
+ simplecov (>= 0.4.1)
78
+ thor (1.0.1)
79
+ unicode-display_width (1.7.0)
80
+
81
+ PLATFORMS
82
+ ruby
83
+
84
+ DEPENDENCIES
85
+ bundle-audit
86
+ ci_reporter_rspec
87
+ colorize
88
+ irb
89
+ listen (>= 3.0.5, < 3.2)
90
+ rake (>= 13.0)
91
+ rspec-support
92
+ rubocop (= 0.86)
93
+ simplecov (~> 0.16.1)
94
+ simplecov-rcov
95
+ text-checkm!
96
+
97
+ BUNDLED WITH
98
+ 2.1.2
@@ -0,0 +1,23 @@
1
+ # The MIT License (MIT)
2
+
3
+ Copyright © 2010 Chris Beer
4
+
5
+ Copyright © 2020 The Regents of the University of California
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a
8
+ copy of this software and associated documentation files (the “Software”),
9
+ to deal in the Software without restriction, including without limitation
10
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
+ and/or sell copies of the Software, and to permit persons to whom the
12
+ Software is furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
+ DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,23 @@
1
+ # Text::Checkm
2
+
3
+ Ruby implementation of
4
+ [Checkm](https://confluence.ucop.edu/display/Curation/Checkm), a
5
+ general-purpose text-based file manifest format developed at the
6
+ [California Digital Library](https://cdlib.org/) by John Kunze, Stephen
7
+ Abrams, and David Loy.
8
+
9
+ (See [SPEC.txt](SPEC.txt) in this repository for details.)
10
+
11
+ ## Copyright
12
+
13
+ Based on [checkm](https://github.com/ruby-microservices/checkm/), copyright
14
+ © 2010 Chris Beer. (For compliance with [UC policies on the use and
15
+ creation of open source
16
+ software](https://security.ucop.edu/resources/open-source-software-licensing.html),
17
+ this library is based on
18
+ [ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827),
19
+ the last MIT-licensed revision.)
20
+
21
+ Subsequent work © 2020 the Regents of the University of California. See
22
+ [LICENSE.md](LICENSE.md) for more details.
23
+
@@ -0,0 +1,36 @@
1
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path('Gemfile', __dir__)
2
+ require 'bundler/setup' # Set up gems listed in the Gemfile.
3
+
4
+ # ------------------------------------------------------------
5
+ # Application code
6
+
7
+ File.expand_path('lib', __dir__).tap do |lib|
8
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
9
+ end
10
+
11
+ # ------------------------------------------------------------
12
+ # RSpec
13
+
14
+ require 'rspec/core/rake_task'
15
+ require 'ci/reporter/rake/rspec'
16
+
17
+ ENV['CI_REPORTS'] ||= File.expand_path('artifacts', __dir__)
18
+
19
+ namespace :spec do
20
+ desc 'Run all tests'
21
+ RSpec::Core::RakeTask.new(:all) do |task|
22
+ task.rspec_opts = %w[--color --format documentation --order default]
23
+ task.pattern = 'spec/**/*_spec.rb'
24
+ end
25
+ end
26
+
27
+ desc 'Run all tests'
28
+ task spec: ['spec:all']
29
+
30
+ # ------------------------------------------------------------
31
+ # Custom tasks
32
+
33
+ desc 'Run tests, check test coverage, check code style'
34
+ task default: %i[coverage rubocop bundle:audit]
35
+
36
+ # TODO: YARD
@@ -0,0 +1,468 @@
1
+ ---------------------------------------------------------------------------
2
+
3
+ NOTE: This is an unofficial copy of the Checkm 0.7 specification,
4
+ adapted from a diff between the 0.3 and 0.7 specifications captured by
5
+ the Internet Archive's Wayback Machine on 23 October 2015 and retrieved
6
+ on 22 July 2020. It is provided here for reference only, and should not
7
+ be considered normative. (D. Moles, 2020-07-22)
8
+
9
+ ---------------------------------------------------------------------------
10
+
11
+ Repository Working Group J. Kunze
12
+ S. Abrams
13
+ D. Loy
14
+ California Digital Library
15
+
16
+ June 11, 2010
17
+
18
+ Checkm: a checksum-based manifest format (v0.7)
19
+
20
+
21
+ Abstract
22
+
23
+ Checkm is a general-purpose text-based file manifest format. Each
24
+ line of a Checkm manifest is a set of '|'-separated tokens, the first
25
+ of which identifies the corresponding digital content by filename or
26
+ URL. Other tokens identify digest algorithm, checksum, content
27
+ length, and modification time. Tokens may be left unspecified with
28
+ empty fields or by ending the line early, the degenerate case being a
29
+ simple file list. It is up to tools that use the Checkm format to
30
+ specify any further restrictions on tokens (e.g., allowed defaults
31
+ and digest algorithms) and on overall manifest completeness and
32
+ coherence. A structured comment mechanism permits a way to document
33
+ extensions and restrictions. Checkm is designed to support tools that
34
+ verify the bit-level integrity of groups of files in support of such
35
+ things as content fixity, replication, import, and export. A manifest
36
+ may be single-level or multi-level (hierarchical), the latter being
37
+ useful, for example, in harvesting material from very large web sites
38
+ (cf. sitemaps).
39
+
40
+
41
+ 1. Checkm overview
42
+
43
+ Checkm (pronounced "check 'em") is a simple text-based manifest
44
+ format for digital content. A manifest is a set of lines, each of
45
+ which describes a unit of content via up to six whitespace-separated
46
+ tokens. The meaning of a token is given by its position within the
47
+ line. For example, the first three tokens give the name of the
48
+ content, a checksum algorithm, and a digest (checksum) computed using
49
+ that algorithm, respectively. Here's a manifest identifying two files
50
+ with MD5 checksums (not recommended for serious use but short enough
51
+ to fit in these examples).
52
+
53
+ #%checkm_0.7
54
+ # My first manifest. Two files total.
55
+ # Filename |Algorithm| Digest
56
+ book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
57
+ images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
58
+
59
+ Checkm is purely concerned with format and not with such things as
60
+ completeness and fitness for a given application. It defines the
61
+ meanings of the six tokens but does not mandate their use. For
62
+ example, a file package transfer tool could require use of four
63
+ tokens, but another tool designed for fixity checking might only
64
+ require two tokens. The next example is a bare-bones manifest in
65
+ which all but the first token have been dropped, in other words, it's
66
+ just a list of filenames or URLs, one per line. This is a useful
67
+ degenerate case when only a list of named units of content is needed.
68
+
69
+ #%checkm_0.7
70
+ # My second manifest. Just a list of files.
71
+ # Filename (no other tokens given)
72
+ http://example.org/i/chap9.xml
73
+ http://example.org/i/chap9fig2.png
74
+
75
+ To leave tokens unspecified that would occur in the middle rather
76
+ than at the end of a line, leave the corresponding fields empty. A
77
+ field is considered empty if the line terminates before it is reached
78
+ or if it consists only of linear whitespace, namely, zero or more
79
+ SPACE (hex 20) or TAB (hex 09) characters. For example, a package
80
+ transfer application that also renames files might use the following
81
+ manifest.
82
+
83
+ #%checkm_0.7
84
+ # My third manifest.
85
+ # Filename and Target specified, not Alg, Digest, Length, or ModTime
86
+ http://example.org/i/chap9.xml ||||| book/Chapter9.xml
87
+ http://example.org/i/chap9fig2.png ||||| images/r862.png
88
+
89
+
90
+ Each non-comment line can contain up to six tokens, and has the form,
91
+
92
+ [@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
93
+
94
+ where "[@]" indicates an optional '@' that causes the identified
95
+ content to be "included" as a manifest extension. In principle there
96
+ is no upper or lower limit on the number of lines in a Checkm
97
+ manifest, however, practical considerations may call for extending a
98
+ single-level manifest to a multi-level manifest.
99
+
100
+
101
+ 2. Multi-level manifests
102
+
103
+ If supported, a multi-level manifest permits one large manifest to be
104
+ spread over a number of smaller manifests. To trigger this, the
105
+ SourceFileOrURL token that begins a line is preceded by a literal
106
+ '@'. It invokes a simple inclusion mechanism indicating that the
107
+ identified content is also in Checkm format and extends the current
108
+ manifest; this is similar to mainstream sitemap extension mechanisms
109
+ (cf. [SITEMAPS]). A tool can be said to support only single-level
110
+ Checkm if it does not support multi-level manifests.
111
+
112
+ Included manifests may themselves recursively include other
113
+ manifests. There is no limit either to the number of inclusions or to
114
+ the depth of a multi-level manifest. Cycles in the inclusion graph
115
+ are generally considered to be in poor taste.
116
+
117
+
118
+ 3. Checkm lines and tokens
119
+
120
+ Manifest lines end with either LF (hex 0a) or CRLF (hex 0d0a). Blank
121
+ lines are ignored. Lines that begin with '#' are considered
122
+ "comments" that are to be ignored by processors except for those
123
+ implementing Checkm extensions (described later).
124
+
125
+ Checkm tokens on a given line all relate to the unit of content or to
126
+ the extended functionality identified by the first token on the line.
127
+ A unit of content is a contiguous sequence of octets (for most
128
+ purposes this is a "file") identified by a filename or URL.
129
+
130
+ Tokens consist of UTF-8 characters [RFC3629] separated by a '|'
131
+ character (hex 7c). Any linear whitespace found at the start or end
132
+ of a token is ignored. Any characters not allowed in a token or in a
133
+ URL, such as '|' or whitespace, may be represented using URL percent-
134
+ encoding [RFC3986].
135
+
136
+ Tokens may be left unspecified by simply dropping them from the end
137
+ of the line or by leaving the field empty (zero or more linear
138
+ whitespace characters). Checkm is silent about which tokens are
139
+ required or prohibited and what defaults may be in effect. Checkm is
140
+ also silent about manifest completeness (which units of content must
141
+ be included) and hyper-specification (whether one unit of content can
142
+ or must have more than one line describing it, e.g., resulting from
143
+ two digest algorithms).
144
+
145
+
146
+ 4. Content lines
147
+
148
+ The (up to six) tokens on a non-comment line look like this
149
+
150
+ [@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
151
+ TOKEN NUMBER: 1 2 3 4 5 6
152
+
153
+ The token's numbered position determines its meaning, as explained in
154
+ the correspondingly numbered subsections below.
155
+
156
+ Any extra fields at positions 7 and higher are considered to be Checkm
157
+ extensions.
158
+
159
+ 4.1. [@]SourceFileOrURL: content identifier
160
+
161
+ The SourceFileOrURL token identifies digital content, and may be
162
+ given as '-' to indicate that the content may be found on the
163
+ equivalent of Unix "stdin". This token may be a URL or a relative or
164
+ absolute filename. To prevent interpretation of a relative pathname
165
+ that begins with '#' or '@', one can insert "./" in front of the
166
+ name. Whether this token is a filename or a URL, any characters not
167
+ allowed in a URL must be represented using URL percent-encoding
168
+ [RFC3986].
169
+
170
+ If any SourceFileOrURL token in a manifest is preceded by the
171
+ optional '@', the line amounts to an "include" statement and the
172
+ manifest is considered to be "multi-level". Other tokens on that line
173
+ still relate to the content but the "included" content itself is
174
+ considered to be an extension of the current manifest. For example, a
175
+ multi-level Checkm manifest totaling 4 million lines could be
176
+ represented by a 2000-line manifest, each line of which references a
177
+ 2000-line single-level manifest.
178
+
179
+ If none of the lines in a manifest is preceded by '@', the manifest
180
+ is considered to be "single-level". It is permissible for a tool that
181
+ conforms to Checkm to declare support for only single-level
182
+ manifests.
183
+
184
+ 4.2. Alg: algorithm
185
+
186
+ Alg is either the literal string "dir" (designating a directory), a
187
+ string specifying a cryptographic checksum algorithm, or empty to
188
+ leave it unspecified. The special case of "dir" is useful for listing
189
+ an empty directory, which has neither a fixed octetstream over which
190
+ to compute a digest nor a contained filename to imply the directory's
191
+ existence. For example,
192
+
193
+
194
+ #%checkm_0.7
195
+ # My fourth manifest. Two files and a directory.
196
+ # Filename |Algorithm| Digest
197
+ book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
198
+ icons/ | dir
199
+ images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
200
+
201
+ Implementors of tools that use Checkm are strongly encouraged to
202
+ support at least two widely implemented checksum algorithms:
203
+
204
+ "md5" [RFC1321]
205
+
206
+ "sha1" [RFC3174]
207
+
208
+ "sha256" [FIPS180-2]
209
+
210
+ When using other algorithms, the name of the algorithm should be
211
+ normalized for use in the manifest's filename, by lowercasing the
212
+ common name of the algorithm, and removing all non-alphanumeric
213
+ characters.
214
+
215
+ 4.3. Digest: computed checksum
216
+
217
+ Digest is a string representing the checksum calculated according to
218
+ the Alg algorithm over the content, or empty to leave it unspecified.
219
+
220
+
221
+ 4.4. Length of content
222
+
223
+ Length is the number (base 10) of octets in the identified content,
224
+ or empty to leave it unspecified. It is typically useful in
225
+ providing a rapid test for altered content and for estimating file
226
+ transfer times.
227
+
228
+
229
+ 4.5. ModTime: time last modified
230
+
231
+ ModTime is a lexically sort-friendly date such as [TEMPER]
232
+ ('YYYYMMDDhhmmss') or [W3CDTF] (YYYY-MM-DDThh:mm:ss), or empty to
233
+ leave it unspecified. It should represent the UTC time when the
234
+ content was last modified and is typically useful in incremental or
235
+ priority harvesting of content (cf. [OAI] and [SITEMAPS]).
236
+
237
+
238
+ 4.6. TargetFileOrURL: other location
239
+
240
+ TargetFileOrURL is a secondary location for the content that
241
+ applications would use as necessary. For instance, a transfer tool
242
+ that also renames files could use this token as the destination name.
243
+
244
+
245
+ 5. Extensions: structured comment lines
246
+
247
+ Comment lines that begin with a token of the form '#%_symbol_' are
248
+ special structured comment lines that usually indicate specific
249
+ optional functionality that extends the core Checkm specification.
250
+ Matching against a _symbol_ is case-insensitive (e.g., #%foo is
251
+ equivalent to #%FOO). The rest of a structured comment line is
252
+ tokenized in the same way as non-comment lines. The structured
253
+ comment symbols that follow are currently reserved.
254
+
255
+ 5.1. Optional extension: #%checkm_0.7
256
+
257
+ It is highly recommended that the first line of a Checkm manifest be
258
+ of the form
259
+
260
+ #%checkm_M.N
261
+
262
+ where M.N identify major and minor version numbers. The current
263
+ version is 0.7.
264
+
265
+ 5.2. Optional extension: #%eof
266
+
267
+ A line consisting of
268
+
269
+ #%eof
270
+
271
+ is reserved as an explicit end of manifest file marker. It can be
272
+ used to distinguish manifests that might be empty because of an error
273
+ from those that are deliberately empty.
274
+
275
+ 5.3. Optional extension: #%fields
276
+
277
+ To precisely identify all fields in a given Checkm manifest, before
278
+ any non-comment lines include a line of the form
279
+
280
+ #%fields | Field_Id | ...
281
+
282
+ containing one or more instances of a Field_Id, each identifying the
283
+ corresponding manifest field. A Field_Id may be a simple string
284
+ suggestive of the respective field's function or it may be a globally
285
+ unique URL. If a Field_Id URL is resolvable, it should document any
286
+ restriction or extension in effect. The #%fields structured comment
287
+ may form part of a #%profile definition.
288
+
289
+ Semantics of the basic fields 1 through 6 may not be altered except
290
+ to narrow their meanings, such as to restrict the values of field 3
291
+ to one particular algorithm. Semantics of the extension fields (7 and
292
+ higher) may be defined at will.
293
+
294
+
295
+ 5.4. Optional extension: #%prefix
296
+
297
+ To define an abbreviation for a long URL in a manner reminiscent of
298
+ Turtle [Turtle], before any use of the abbreviation include a line of
299
+ the form
300
+
301
+ #%prefix | Abbrev: | URL
302
+
303
+ where Abbrev (which may be empty) is a "prefix" that will stand in
304
+ for the given URL when it is used in other structured comments (and not
305
+ in non-comment lines). The #%prefix structured comment may form part
306
+ of a #%profile definition.
307
+
308
+ 5.5. Optional extension: #%profile
309
+
310
+ To declare that a Checkm manifest conforms to a specific profile,
311
+ before any non-comment lines include a line of the form
312
+
313
+ #%profile | ProfileURL
314
+
315
+ where ProfileURL is a unique identifier for a specific profile. If
316
+ the URL is resolvable, it should document any restrictions and
317
+ extensions. Some example profiles appear in an appendix.
318
+
319
+
320
+ 6. Conformance Terminology
321
+
322
+ A tool that uses the Checkm format should document which parts of the
323
+ format it supports. For example, documentation should state what
324
+ extensions, if any, are in use. One common restriction could be
325
+ expressed something like,
326
+
327
+ "... which must be a single-level, 3-column Checkm manifest with
328
+ relative filenames."
329
+
330
+ This terminology suggests that, for this particular tool, an
331
+ exception or undefined behavior is the likely result of supplying a
332
+ Checkm manifest that has any line beginning with '@', a URL, or an
333
+ absolute pathname, or that has any line with more than or fewer than
334
+ 3 tokens.
335
+
336
+
337
+ 7. Example two-level Checkm manifest
338
+
339
+ #%checkm_0.7
340
+ # A two-level manifest.
341
+
342
+ #Filename |Alg |Checksum |Length
343
+ foo.bar |sha1|2eacd0da7aa89b094f5121eb2901bf4de2219ef1 | 366
344
+ foo.bar |md5 |3e83471320227c0797a0c251f28db0c5 | 366
345
+ # This next line "includes" the manifest in file "myfirst".
346
+ @myfirst |md5 |6ab96c8930621d50cef31da4df6d9ed8 | 264
347
+
348
+ where the included file "myfirst" contains 264 octets and lists two
349
+ files:
350
+
351
+ #%checkm_0.7
352
+ # My first manifest. Two files total.
353
+ # Filename |Algorithm| Digest
354
+ book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
355
+ images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
356
+
357
+ 8. References
358
+
359
+ [FIPS180-2]
360
+ NIST, "FIPS 180-2: Secure Hash Standard (SHS)",
361
+ February 2004, <http://csrc.nist.gov/publications/fips/
362
+ fips180-2/fips180-2withchangenotice.pdf>.
363
+
364
+ [OAI] Lagoze, C. and H. Van de Sompel, "Open Archives Initiative
365
+ Protocol for Metadata Harvesting", June 2002, <http://
366
+ www.openarchives.org/OAI/openarchivesprotocol.html>.
367
+
368
+ [RFC1321] Rivest, R., "The MD5 Message-Digest Algorithm", RFC 1321,
369
+ April 1992.
370
+
371
+ [RFC3174] Eastlake, D. and P. Jones, "US Secure Hash Algorithm 1
372
+ (SHA1)", RFC 3174, September 2001.
373
+
374
+ [RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO
375
+ 10646", STD 63, RFC 3629, November 2003.
376
+
377
+ [RFC3986] Berners-Lee, T., Fielding, R., and L. Masinter, "Uniform
378
+ Resource Identifier (URI): Generic Syntax", STD 66,
379
+ RFC 3986, January 2005.
380
+
381
+ [SITEMAPS]
382
+ sitemaps.org, "Sitemaps XML format", February 2008,
383
+ <http://sitemaps.org/protocol.php>.
384
+
385
+ [TEMPER] Blair, C. and J. Kunze, "Temporal Enumerated Ranges",
386
+ August 2007,
387
+ <http://www.cdlib.org/inside/diglib/ark/temperspec.pdf>.
388
+
389
+ [Turtle] Beckett, D. and T. Berners-Lee, "Turtle - Terse RDF Triple
390
+ Language", January 2008,
391
+ <http://www.w3.org/TeamSubmission/turtle/>.
392
+
393
+ [W3CDTF] Wolf, M. and C. Wicksteed, "Date and Time Formats (W3C
394
+ profile of ISO8601)",
395
+ <http://www.w3.org/TR/NOTE-datetime>.
396
+
397
+
398
+ Appendix A. Example profiles
399
+
400
+ The most important attribute of a Checkm profile is a globally unique
401
+ identifier, such as,
402
+
403
+ http://merritt.cdlib.org/registry/mrt-ingest-manifest
404
+
405
+ which applications can use for conditional processing. If, in
406
+ addition, this identifier is resolvable, it should return a text file
407
+ with the same format as a Checkm manifest but with no non-comment
408
+ lines. This file formally documents any particular ways in which the
409
+ first six Checkm fields may be restricted and what any additional
410
+ fields mean. As an example, the profile URL above corresponds to
411
+
412
+ #%checkm_0.7
413
+ #
414
+ # This is a profile definition for a "Merritt ingest" manifest.
415
+ #
416
+ #%profile | http://merritt.cdlib.org/registry/mrt-ingest-manifest
417
+ #%prefix | mrt: | http://merritt.cdlib.org/terms#
418
+ #%prefix | nfo: |
419
+ http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
420
+ #%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
421
+ nfo:fileSize | nfo:fileLastModified | nfo:fileName |
422
+ mrt:mimeType
423
+
424
+ In this example and the next, indented lines artificially occur where
425
+ long lines have been wrapped for display purposes. The profile below
426
+ uses Checkm inclusion lines as a way to describe "digital objects".
427
+
428
+ #%checkm_0.7
429
+ #
430
+ # This is a profile definition for a "Merritt batch" manifest.
431
+ # It is meant to be used with Checkm "inclusion" lines, as in
432
+ #
433
+ # @url | [alg] | [value] | [length] | | filename | [primary] [ | local ]
434
+ #
435
+ #%profile | http://merritt.cdlib.org/registry/mrt-batch-manifest
436
+ #%prefix | mrt: | http://merritt.cdlib.org/terms#
437
+ #%prefix | nfo: |
438
+ http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
439
+ #%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
440
+ nfo:fileSize | nfo:fileLastModified | nfo:fileName |
441
+ mrt:primaryIdentifier | mrt:localIdentifier
442
+
443
+
444
+ Authors' Addresses
445
+
446
+ John A. Kunze
447
+ California Digital Library
448
+ 415 20th St, 4th Floor
449
+ Oakland, CA 94612
450
+ US
451
+
452
+ Email: jak@ucop.edu
453
+
454
+ Stephen Abrams
455
+ California Digital Library
456
+ 415 20th St, 4th Floor
457
+ Oakland, CA 94612
458
+ US
459
+
460
+ Email: stephen.abrams@ucop.edu
461
+
462
+ David Loy
463
+ California Digital Library
464
+ 415 20th St, 4th Floor
465
+ Oakland, CA 94612
466
+ US
467
+
468
+ Email: david.loy@ucop.edu