text-checkm 0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +312 -0
- data/.idea/$CACHE_FILE$ +26 -0
- data/.idea/.gitignore +6 -0
- data/.idea/go.imports.xml +6 -0
- data/.idea/inspectionProfiles/Project_Default.xml +17 -0
- data/.idea/inspectionProfiles/profiles_settings.xml +7 -0
- data/.idea/misc.xml +9 -0
- data/.idea/modules.xml +8 -0
- data/.idea/vcs.xml +6 -0
- data/.rubocop.yml +144 -0
- data/.ruby-version +1 -0
- data/.simplecov +7 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +98 -0
- data/LICENSE.md +23 -0
- data/README.md +23 -0
- data/Rakefile +36 -0
- data/SPEC.txt +468 -0
- data/lib/text/checkm.rb +2 -0
- data/lib/text/checkm/checksum.rb +40 -0
- data/lib/text/checkm/entry.rb +77 -0
- data/lib/text/checkm/manifest.rb +115 -0
- data/lib/text/checkm/module_info.rb +16 -0
- data/rakelib/bundle.rake +8 -0
- data/rakelib/coverage.rake +5 -0
- data/rakelib/rubocop.rake +16 -0
- data/spec/.rubocop.yml +19 -0
- data/spec/data/merritt-manifest.checkm +9 -0
- data/spec/data/myfirst.checkm +5 -0
- data/spec/data/test_1/1 +1 -0
- data/spec/data/two-level-manifest.checkm +8 -0
- data/spec/lib/text/checkm/checksum_spec.rb +24 -0
- data/spec/lib/text/checkm/entry_spec.rb +52 -0
- data/spec/lib/text/checkm/manifest_spec.rb +157 -0
- data/spec/spec_helper.rb +23 -0
- data/text-checkm.gemspec +29 -0
- data/text-checkm.iml +51 -0
- metadata +236 -0
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.7.0
|
data/.simplecov
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
# 0.7 (22 July 2020)
|
2
|
+
|
3
|
+
- fork from [ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827)
|
4
|
+
(last MIT-licensed version)
|
5
|
+
- update for Ruby 2.7.0
|
6
|
+
- rename module from `Checkm` to `Text::Checkm` to avoid name collisions
|
7
|
+
- bump version to 0.7 to match [spec](SPEC.txt) version
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
text-checkm (0.7)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: http://rubygems.org/
|
8
|
+
specs:
|
9
|
+
ast (2.4.1)
|
10
|
+
builder (3.2.4)
|
11
|
+
bundle-audit (0.1.0)
|
12
|
+
bundler-audit
|
13
|
+
bundler-audit (0.7.0.1)
|
14
|
+
bundler (>= 1.2.0, < 3)
|
15
|
+
thor (>= 0.18, < 2)
|
16
|
+
ci_reporter (2.0.0)
|
17
|
+
builder (>= 2.1.2)
|
18
|
+
ci_reporter_rspec (1.0.0)
|
19
|
+
ci_reporter (~> 2.0)
|
20
|
+
rspec (>= 2.14, < 4)
|
21
|
+
colorize (0.8.1)
|
22
|
+
diff-lcs (1.4.4)
|
23
|
+
docile (1.3.2)
|
24
|
+
ffi (1.13.1)
|
25
|
+
io-console (0.5.6)
|
26
|
+
irb (1.2.4)
|
27
|
+
reline (>= 0.0.1)
|
28
|
+
json (2.3.1)
|
29
|
+
listen (3.1.5)
|
30
|
+
rb-fsevent (~> 0.9, >= 0.9.4)
|
31
|
+
rb-inotify (~> 0.9, >= 0.9.7)
|
32
|
+
ruby_dep (~> 1.2)
|
33
|
+
parallel (1.19.2)
|
34
|
+
parser (2.7.1.4)
|
35
|
+
ast (~> 2.4.1)
|
36
|
+
rainbow (3.0.0)
|
37
|
+
rake (13.0.1)
|
38
|
+
rb-fsevent (0.10.4)
|
39
|
+
rb-inotify (0.10.1)
|
40
|
+
ffi (~> 1.0)
|
41
|
+
regexp_parser (1.7.1)
|
42
|
+
reline (0.1.4)
|
43
|
+
io-console (~> 0.5)
|
44
|
+
rexml (3.2.4)
|
45
|
+
rspec (3.9.0)
|
46
|
+
rspec-core (~> 3.9.0)
|
47
|
+
rspec-expectations (~> 3.9.0)
|
48
|
+
rspec-mocks (~> 3.9.0)
|
49
|
+
rspec-core (3.9.2)
|
50
|
+
rspec-support (~> 3.9.3)
|
51
|
+
rspec-expectations (3.9.2)
|
52
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
53
|
+
rspec-support (~> 3.9.0)
|
54
|
+
rspec-mocks (3.9.1)
|
55
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
56
|
+
rspec-support (~> 3.9.0)
|
57
|
+
rspec-support (3.9.3)
|
58
|
+
rubocop (0.86.0)
|
59
|
+
parallel (~> 1.10)
|
60
|
+
parser (>= 2.7.0.1)
|
61
|
+
rainbow (>= 2.2.2, < 4.0)
|
62
|
+
regexp_parser (>= 1.7)
|
63
|
+
rexml
|
64
|
+
rubocop-ast (>= 0.0.3, < 1.0)
|
65
|
+
ruby-progressbar (~> 1.7)
|
66
|
+
unicode-display_width (>= 1.4.0, < 2.0)
|
67
|
+
rubocop-ast (0.2.0)
|
68
|
+
parser (>= 2.7.0.1)
|
69
|
+
ruby-progressbar (1.10.1)
|
70
|
+
ruby_dep (1.5.0)
|
71
|
+
simplecov (0.16.1)
|
72
|
+
docile (~> 1.1)
|
73
|
+
json (>= 1.8, < 3)
|
74
|
+
simplecov-html (~> 0.10.0)
|
75
|
+
simplecov-html (0.10.2)
|
76
|
+
simplecov-rcov (0.2.3)
|
77
|
+
simplecov (>= 0.4.1)
|
78
|
+
thor (1.0.1)
|
79
|
+
unicode-display_width (1.7.0)
|
80
|
+
|
81
|
+
PLATFORMS
|
82
|
+
ruby
|
83
|
+
|
84
|
+
DEPENDENCIES
|
85
|
+
bundle-audit
|
86
|
+
ci_reporter_rspec
|
87
|
+
colorize
|
88
|
+
irb
|
89
|
+
listen (>= 3.0.5, < 3.2)
|
90
|
+
rake (>= 13.0)
|
91
|
+
rspec-support
|
92
|
+
rubocop (= 0.86)
|
93
|
+
simplecov (~> 0.16.1)
|
94
|
+
simplecov-rcov
|
95
|
+
text-checkm!
|
96
|
+
|
97
|
+
BUNDLED WITH
|
98
|
+
2.1.2
|
data/LICENSE.md
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright © 2010 Chris Beer
|
4
|
+
|
5
|
+
Copyright © 2020 The Regents of the University of California
|
6
|
+
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
8
|
+
copy of this software and associated documentation files (the “Software”),
|
9
|
+
to deal in the Software without restriction, including without limitation
|
10
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
11
|
+
and/or sell copies of the Software, and to permit persons to whom the
|
12
|
+
Software is furnished to do so, subject to the following conditions:
|
13
|
+
|
14
|
+
The above copyright notice and this permission notice shall be included in
|
15
|
+
all copies or substantial portions of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
20
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
22
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
23
|
+
DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Text::Checkm
|
2
|
+
|
3
|
+
Ruby implementation of
|
4
|
+
[Checkm](https://confluence.ucop.edu/display/Curation/Checkm), a
|
5
|
+
general-purpose text-based file manifest format developed at the
|
6
|
+
[California Digital Library](https://cdlib.org/) by John Kunze, Stephen
|
7
|
+
Abrams, and David Loy.
|
8
|
+
|
9
|
+
(See [SPEC.txt](SPEC.txt) in this repository for details.)
|
10
|
+
|
11
|
+
## Copyright
|
12
|
+
|
13
|
+
Based on [checkm](https://github.com/ruby-microservices/checkm/), copyright
|
14
|
+
© 2010 Chris Beer. (For compliance with [UC policies on the use and
|
15
|
+
creation of open source
|
16
|
+
software](https://security.ucop.edu/resources/open-source-software-licensing.html),
|
17
|
+
this library is based on
|
18
|
+
[ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827),
|
19
|
+
the last MIT-licensed revision.)
|
20
|
+
|
21
|
+
Subsequent work © 2020 the Regents of the University of California. See
|
22
|
+
[LICENSE.md](LICENSE.md) for more details.
|
23
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('Gemfile', __dir__)
|
2
|
+
require 'bundler/setup' # Set up gems listed in the Gemfile.
|
3
|
+
|
4
|
+
# ------------------------------------------------------------
|
5
|
+
# Application code
|
6
|
+
|
7
|
+
File.expand_path('lib', __dir__).tap do |lib|
|
8
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
9
|
+
end
|
10
|
+
|
11
|
+
# ------------------------------------------------------------
|
12
|
+
# RSpec
|
13
|
+
|
14
|
+
require 'rspec/core/rake_task'
|
15
|
+
require 'ci/reporter/rake/rspec'
|
16
|
+
|
17
|
+
ENV['CI_REPORTS'] ||= File.expand_path('artifacts', __dir__)
|
18
|
+
|
19
|
+
namespace :spec do
|
20
|
+
desc 'Run all tests'
|
21
|
+
RSpec::Core::RakeTask.new(:all) do |task|
|
22
|
+
task.rspec_opts = %w[--color --format documentation --order default]
|
23
|
+
task.pattern = 'spec/**/*_spec.rb'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
desc 'Run all tests'
|
28
|
+
task spec: ['spec:all']
|
29
|
+
|
30
|
+
# ------------------------------------------------------------
|
31
|
+
# Custom tasks
|
32
|
+
|
33
|
+
desc 'Run tests, check test coverage, check code style'
|
34
|
+
task default: %i[coverage rubocop bundle:audit]
|
35
|
+
|
36
|
+
# TODO: YARD
|
data/SPEC.txt
ADDED
@@ -0,0 +1,468 @@
|
|
1
|
+
---------------------------------------------------------------------------
|
2
|
+
|
3
|
+
NOTE: This is an unofficial copy of the Checkm 0.7 specification,
|
4
|
+
adapted from a diff between the 0.3 and 0.7 specifications captured by
|
5
|
+
the Internet Archive's Wayback Machine on 23 October 2015 and retrieved
|
6
|
+
on 22 July 2020. It is provided here for reference only, and should not
|
7
|
+
be considered normative. (D. Moles, 2020-07-22)
|
8
|
+
|
9
|
+
---------------------------------------------------------------------------
|
10
|
+
|
11
|
+
Repository Working Group J. Kunze
|
12
|
+
S. Abrams
|
13
|
+
D. Loy
|
14
|
+
California Digital Library
|
15
|
+
|
16
|
+
June 11, 2010
|
17
|
+
|
18
|
+
Checkm: a checksum-based manifest format (v0.7)
|
19
|
+
|
20
|
+
|
21
|
+
Abstract
|
22
|
+
|
23
|
+
Checkm is a general-purpose text-based file manifest format. Each
|
24
|
+
line of a Checkm manifest is a set of '|'-separated tokens, the first
|
25
|
+
of which identifies the corresponding digital content by filename or
|
26
|
+
URL. Other tokens identify digest algorithm, checksum, content
|
27
|
+
length, and modification time. Tokens may be left unspecified with
|
28
|
+
empty fields or by ending the line early, the degenerate case being a
|
29
|
+
simple file list. It is up to tools that use the Checkm format to
|
30
|
+
specify any further restrictions on tokens (e.g., allowed defaults
|
31
|
+
and digest algorithms) and on overall manifest completeness and
|
32
|
+
coherence. A structured comment mechanism permits a way to document
|
33
|
+
extensions and restrictions. Checkm is designed to support tools that
|
34
|
+
verify the bit-level integrity of groups of files in support of such
|
35
|
+
things as content fixity, replication, import, and export. A manifest
|
36
|
+
may be single-level or multi-level (hierarchical), the latter being
|
37
|
+
useful, for example, in harvesting material from very large web sites
|
38
|
+
(cf. sitemaps).
|
39
|
+
|
40
|
+
|
41
|
+
1. Checkm overview
|
42
|
+
|
43
|
+
Checkm (pronounced "check 'em") is a simple text-based manifest
|
44
|
+
format for digital content. A manifest is a set of lines, each of
|
45
|
+
which describes a unit of content via up to six '|'-separated
|
46
|
+
tokens. The meaning of a token is given by its position within the
|
47
|
+
line. For example, the first three tokens give the name of the
|
48
|
+
content, a checksum algorithm, and a digest (checksum) computed using
|
49
|
+
that algorithm, respectively. Here's a manifest identifying two files
|
50
|
+
with MD5 checksums (not recommended for serious use but short enough
|
51
|
+
to fit in these examples).
|
52
|
+
|
53
|
+
#%checkm_0.7
|
54
|
+
# My first manifest. Two files total.
|
55
|
+
# Filename |Algorithm| Digest
|
56
|
+
book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
|
57
|
+
images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
|
58
|
+
|
59
|
+
Checkm is purely concerned with format and not with such things as
|
60
|
+
completeness and fitness for a given application. It defines the
|
61
|
+
meanings of the six tokens but does not mandate their use. For
|
62
|
+
example, a file package transfer tool could require use of four
|
63
|
+
tokens, but another tool designed for fixity checking might only
|
64
|
+
require two tokens. The next example is a bare-bones manifest in
|
65
|
+
which all but the first token have been dropped, in other words, it's
|
66
|
+
just a list of filenames or URLs, one per line. This is a useful
|
67
|
+
degenerate case when only a list of named units of content is needed.
|
68
|
+
|
69
|
+
#%checkm_0.7
|
70
|
+
# My second manifest. Just a list of files.
|
71
|
+
# Filename (no other tokens given)
|
72
|
+
http://example.org/i/chap9.xml
|
73
|
+
http://example.org/i/chap9fig2.png
|
74
|
+
|
75
|
+
To leave tokens unspecified that would occur in the middle rather
|
76
|
+
than at the end of a line, leave the corresponding fields empty. A
|
77
|
+
field is considered empty if the line terminates before it is reached
|
78
|
+
or if it consists only of linear whitespace, namely, zero or more
|
79
|
+
SPACE (hex 20) or TAB (hex 09) characters. For example, a package
|
80
|
+
transfer application that also renames files might use the following
|
81
|
+
manifest.
|
82
|
+
|
83
|
+
#%checkm_0.7
|
84
|
+
# My third manifest.
|
85
|
+
# Filename and Target specified, not Alg, Digest, Length, or ModTime
|
86
|
+
http://example.org/i/chap9.xml ||||| book/Chapter9.xml
|
87
|
+
http://example.org/i/chap9fig2.png ||||| images/r862.png
|
88
|
+
|
89
|
+
|
90
|
+
Each non-comment line can contain up to six tokens, and has the form,
|
91
|
+
|
92
|
+
[@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
|
93
|
+
|
94
|
+
where "[@]" indicates an optional '@' that causes the identified
|
95
|
+
content to be "included" as a manifest extension. In principle there
|
96
|
+
is no upper or lower limit on the number of lines in a Checkm
|
97
|
+
manifest; however, practical considerations may call for extending a
|
98
|
+
single-level manifest to a multi-level manifest.
|
99
|
+
|
100
|
+
|
101
|
+
2. Multi-level manifests
|
102
|
+
|
103
|
+
If supported, a multi-level manifest permits one large manifest to be
|
104
|
+
spread over a number of smaller manifests. To trigger this, the
|
105
|
+
SourceFileOrURL token that begins a line is preceded by a literal
|
106
|
+
'@'. It invokes a simple inclusion mechanism indicating that the
|
107
|
+
identified content is also in Checkm format and extends the current
|
108
|
+
manifest; this is similar to mainstream sitemap extension mechanisms
|
109
|
+
(cf. [SITEMAPS]). A tool can be said to support only single-level
|
110
|
+
Checkm if it does not support multi-level manifests.
|
111
|
+
|
112
|
+
Included manifests may themselves recursively include other
|
113
|
+
manifests. There is no limit either to the number of inclusions or to
|
114
|
+
the depth of a multi-level manifest. Cycles in the inclusion graph
|
115
|
+
are generally considered to be in poor taste.
|
116
|
+
|
117
|
+
|
118
|
+
3. Checkm lines and tokens
|
119
|
+
|
120
|
+
Manifest lines end with either LF (hex 0a) or CRLF (hex 0d0a). Blank
|
121
|
+
lines are ignored. Lines that begin with '#' are considered
|
122
|
+
"comments" that are to be ignored by processors except for those
|
123
|
+
implementing Checkm extensions (described later).
|
124
|
+
|
125
|
+
Checkm tokens on a given line all relate to the unit of content or to
|
126
|
+
the extended functionality identified by the first token on the line.
|
127
|
+
A unit of content is a contiguous sequence of octets (for most
|
128
|
+
purposes this is a "file") identified by a filename or URL.
|
129
|
+
|
130
|
+
Tokens consist of UTF-8 characters [RFC3629] separated by a '|'
|
131
|
+
character (hex 7c). Any linear whitespace found at the start or end
|
132
|
+
of a token is ignored. Any characters not allowed in a token or in a
|
133
|
+
URL, such as '|' or whitespace, may be represented using URL percent-
|
134
|
+
encoding [RFC3986].
|
135
|
+
|
136
|
+
Tokens may be left unspecified by simply dropping them from the end
|
137
|
+
of the line or by leaving the field empty (zero or more linear
|
138
|
+
whitespace characters). Checkm is silent about which tokens are
|
139
|
+
required or prohibited and what defaults may be in effect. Checkm is
|
140
|
+
also silent about manifest completeness (which units of content must
|
141
|
+
be included) and hyper-specification (whether one unit of content can
|
142
|
+
or must have more than one line describing it, e.g., resulting from
|
143
|
+
two digest algorithms).
|
144
|
+
|
145
|
+
|
146
|
+
4. Content lines
|
147
|
+
|
148
|
+
The (up to six) tokens on a non-comment line look like this
|
149
|
+
|
150
|
+
[@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
|
151
|
+
TOKEN NUMBER: 1 2 3 4 5 6
|
152
|
+
|
153
|
+
The token's numbered position determines its meaning, as explained in
|
154
|
+
the correspondingly numbered subsections below.
|
155
|
+
|
156
|
+
Any extra fields at positions 7 and higher are considered to be Checkm
|
157
|
+
extensions.
|
158
|
+
|
159
|
+
4.1. [@]SourceFileOrURL: content identifier
|
160
|
+
|
161
|
+
The SourceFileOrURL token identifies digital content, and may be
|
162
|
+
given as '-' to indicate that the content may be found on the
|
163
|
+
equivalent of Unix "stdin". This token may be a URL or a relative or
|
164
|
+
absolute filename. To prevent interpretation of a relative pathname
|
165
|
+
that begins with '#' or '@', one can insert "./" in front of the
|
166
|
+
name. Whether this token is a filename or a URL, any characters not
|
167
|
+
allowed in a URL must be represented using URL percent-encoding
|
168
|
+
[RFC3986].
|
169
|
+
|
170
|
+
If any SourceFileOrURL token in a manifest is preceded by the
|
171
|
+
optional '@', the line amounts to an "include" statement and the
|
172
|
+
manifest is considered to be "multi-level". Other tokens on that line
|
173
|
+
still relate to the content but the "included" content itself is
|
174
|
+
considered to be an extension of the current manifest. For example, a
|
175
|
+
multi-level Checkm manifest totaling 4 million lines could be
|
176
|
+
represented by a 2000-line manifest, each line of which references a
|
177
|
+
2000-line single-level manifest.
|
178
|
+
|
179
|
+
If none of the lines in a manifest is preceded by '@', the manifest
|
180
|
+
is considered to be "single-level". It is permissible for a tool that
|
181
|
+
conforms to Checkm to declare support for only single-level
|
182
|
+
manifests.
|
183
|
+
|
184
|
+
4.2. Alg: algorithm
|
185
|
+
|
186
|
+
Alg is either the literal string "dir" (designating a directory), a
|
187
|
+
string specifying a cryptographic checksum algorithm, or empty to
|
188
|
+
leave it unspecified. The special case of "dir" is useful for listing
|
189
|
+
an empty directory, which has neither a fixed octetstream over which
|
190
|
+
to compute a digest nor a contained filename to imply the directory's
|
191
|
+
existence. For example,
|
192
|
+
|
193
|
+
|
194
|
+
#%checkm_0.7
|
195
|
+
# My fourth manifest. Two files and a directory.
|
196
|
+
# Filename |Algorithm| Digest
|
197
|
+
book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
|
198
|
+
icons/ | dir
|
199
|
+
images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
|
200
|
+
|
201
|
+
Implementors of tools that use Checkm are strongly encouraged to
|
202
|
+
support at least two widely implemented checksum algorithms:
|
203
|
+
|
204
|
+
"md5" [RFC1321]
|
205
|
+
|
206
|
+
"sha1" [RFC3174]
|
207
|
+
|
208
|
+
"sha256" [FIPS180-2]
|
209
|
+
|
210
|
+
When using other algorithms, the name of the algorithm should be
|
211
|
+
normalized for use in the manifest's filename, by lowercasing the
|
212
|
+
common name of the algorithm, and removing all non-alphanumeric
|
213
|
+
characters.
|
214
|
+
|
215
|
+
4.3. Digest: computed checksum
|
216
|
+
|
217
|
+
Digest is a string representing the checksum calculated according to
|
218
|
+
the Alg algorithm over the content, or empty to leave it unspecified.
|
219
|
+
|
220
|
+
|
221
|
+
4.4. Length of content
|
222
|
+
|
223
|
+
Length is the number (base 10) of octets in the identified content,
|
224
|
+
or empty to leave it unspecified. It is typically useful in
|
225
|
+
providing a rapid test for altered content and for estimating file
|
226
|
+
transfer times.
|
227
|
+
|
228
|
+
|
229
|
+
4.5. ModTime: time last modified
|
230
|
+
|
231
|
+
ModTime is a lexically sort-friendly date such as [TEMPER]
|
232
|
+
('YYYYMMDDhhmmss') or [W3CDTF] (YYYY-MM-DDThh:mm:ss), or empty to
|
233
|
+
leave it unspecified. It should represent the UTC time when the
|
234
|
+
content was last modified and is typically useful in incremental or
|
235
|
+
priority harvesting of content (cf. [OAI] and [SITEMAPS]).
|
236
|
+
|
237
|
+
|
238
|
+
4.6. TargetFileOrURL: other location
|
239
|
+
|
240
|
+
TargetFileOrURL is a secondary location for the content that
|
241
|
+
applications would use as necessary. For instance, a transfer tool
|
242
|
+
that also renames files could use this token as the destination name.
|
243
|
+
|
244
|
+
|
245
|
+
5. Extensions: structured comment lines
|
246
|
+
|
247
|
+
Comment lines that begin with a token of the form '#%_symbol_' are
|
248
|
+
special structured comment lines that usually indicate specific
|
249
|
+
optional functionality that extends the core Checkm specification.
|
250
|
+
Matching against a _symbol_ is case-insensitive (e.g., #%foo is
|
251
|
+
equivalent to #%FOO). The rest of a structured comment line is
|
252
|
+
tokenized in the same way as non-comment lines. The structured
|
253
|
+
comment symbols that follow are currently reserved.
|
254
|
+
|
255
|
+
5.1. Optional extension: #%checkm_0.7
|
256
|
+
|
257
|
+
It is highly recommended that the first line of a Checkm manifest be
|
258
|
+
of the form
|
259
|
+
|
260
|
+
#%checkm_M.N
|
261
|
+
|
262
|
+
where M.N identify major and minor version numbers. The current
|
263
|
+
version is 0.7.
|
264
|
+
|
265
|
+
5.2. Optional extension: #%eof
|
266
|
+
|
267
|
+
A line consisting of
|
268
|
+
|
269
|
+
#%eof
|
270
|
+
|
271
|
+
is reserved as an explicit end of manifest file marker. It can be
|
272
|
+
used to distinguish manifests that might be empty because of an error
|
273
|
+
from those that are deliberately empty.
|
274
|
+
|
275
|
+
5.3. Optional extension: #%fields
|
276
|
+
|
277
|
+
To precisely identify all fields in a given Checkm manifest, before
|
278
|
+
any non-comment lines, include a line of the form
|
279
|
+
|
280
|
+
#%fields | Field_Id | ...
|
281
|
+
|
282
|
+
containing one or more instances of a Field_Id, each identifying the
|
283
|
+
corresponding manifest field. A Field_Id may be a simple string
|
284
|
+
suggestive of the respective field's function or it may be a globally
|
285
|
+
unique URL. If a Field_Id URL is resolvable, it should document any
|
286
|
+
restriction or extension in effect. The #%fields structured comment
|
287
|
+
may form part of a #%profile definition.
|
288
|
+
|
289
|
+
Semantics of the basic fields 1 through 6 may not be altered except
|
290
|
+
to narrow their meanings, such as to restrict the values of field 3
|
291
|
+
to one particular algorithm. Semantics of the extension fields (7 and
|
292
|
+
higher) may be defined at will.
|
293
|
+
|
294
|
+
|
295
|
+
5.4. Optional extension: #%prefix
|
296
|
+
|
297
|
+
To define an abbreviation for a long URL in a manner reminiscent of
|
298
|
+
Turtle [Turtle], before any use of the abbreviation, include a line of
|
299
|
+
the form
|
300
|
+
|
301
|
+
#%prefix | Abbrev: | URL
|
302
|
+
|
303
|
+
where Abbrev (which may be empty) is a "prefix" that will stand in
|
304
|
+
for the given URL when it is used in other structured comments (and not
|
305
|
+
in non-comment lines). The #%prefix structured comment may form part
|
306
|
+
of a #%profile definition.
|
307
|
+
|
308
|
+
5.5. Optional extension: #%profile
|
309
|
+
|
310
|
+
To declare that a Checkm manifest conforms to a specific profile,
|
311
|
+
before any non-comment lines, include a line of the form
|
312
|
+
|
313
|
+
#%profile | ProfileURL
|
314
|
+
|
315
|
+
where ProfileURL is a unique identifier for a specific profile. If
|
316
|
+
the URL is resolvable, it should document any restrictions and
|
317
|
+
extensions. Some example profiles appear in an appendix.
|
318
|
+
|
319
|
+
|
320
|
+
6. Conformance Terminology
|
321
|
+
|
322
|
+
A tool that uses the Checkm format should document which parts of the
|
323
|
+
format it supports. For example, documentation should state what
|
324
|
+
extensions, if any, are in use. One common restriction could be
|
325
|
+
expressed something like,
|
326
|
+
|
327
|
+
"... which must be a single-level, 3-column Checkm manifest with
|
328
|
+
relative filenames."
|
329
|
+
|
330
|
+
This terminology suggests that, for this particular tool, an
|
331
|
+
exception or undefined behavior is the likely result of supplying a
|
332
|
+
Checkm manifest that has any line beginning with '@', a URL, or an
|
333
|
+
absolute pathname, or that has any line with more than or fewer than
|
334
|
+
3 tokens.
|
335
|
+
|
336
|
+
|
337
|
+
7. Example two-level Checkm manifest
|
338
|
+
|
339
|
+
#%checkm_0.7
|
340
|
+
# A two-level manifest.
|
341
|
+
|
342
|
+
#Filename |Alg |Checksum |Length
|
343
|
+
foo.bar |sha1|2eacd0da7aa89b094f5121eb2901bf4de2219ef1 | 366
|
344
|
+
foo.bar |md5 |3e83471320227c0797a0c251f28db0c5 | 366
|
345
|
+
# This next line "includes" the manifest in file "myfirst".
|
346
|
+
@myfirst |md5 |6ab96c8930621d50cef31da4df6d9ed8 | 264
|
347
|
+
|
348
|
+
where the included file "myfirst" contains 264 octets and lists two
|
349
|
+
files:
|
350
|
+
|
351
|
+
#%checkm_0.7
|
352
|
+
# My first manifest. Two files total.
|
353
|
+
# Filename |Algorithm| Digest
|
354
|
+
book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
|
355
|
+
images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
|
356
|
+
|
357
|
+
8. References
|
358
|
+
|
359
|
+
[FIPS180-2]
|
360
|
+
NIST, "FIPS 180-2: Secure Hash Standard (SHS)",
|
361
|
+
February 2004, <http://csrc.nist.gov/publications/fips/
|
362
|
+
fips180-2/fips180-2withchangenotice.pdf>.
|
363
|
+
|
364
|
+
[OAI] Lagoze, C. and H. Van de Sompel, "Open Archives Initiative
|
365
|
+
Protocol for Metadata Harvesting", June 2002, <http://
|
366
|
+
www.openarchives.org/OAI/openarchivesprotocol.html>.
|
367
|
+
|
368
|
+
[RFC1321] Rivest, R., "The MD5 Message-Digest Algorithm", RFC 1321,
|
369
|
+
April 1992.
|
370
|
+
|
371
|
+
[RFC3174] Eastlake, D. and P. Jones, "US Secure Hash Algorithm 1
|
372
|
+
(SHA1)", RFC 3174, September 2001.
|
373
|
+
|
374
|
+
[RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO
|
375
|
+
10646", STD 63, RFC 3629, November 2003.
|
376
|
+
|
377
|
+
[RFC3986] Berners-Lee, T., Fielding, R., and L. Masinter, "Uniform
|
378
|
+
Resource Identifier (URI): Generic Syntax", STD 66,
|
379
|
+
RFC 3986, January 2005.
|
380
|
+
|
381
|
+
[SITEMAPS]
|
382
|
+
sitemaps.org, "Sitemaps XML format", February 2008,
|
383
|
+
<http://sitemaps.org/protocol.php>.
|
384
|
+
|
385
|
+
[TEMPER] Blair, C. and J. Kunze, "Temporal Enumerated Ranges",
|
386
|
+
August 2007,
|
387
|
+
<http://www.cdlib.org/inside/diglib/ark/temperspec.pdf>.
|
388
|
+
|
389
|
+
[Turtle] Beckett, D. and T. Berners-Lee, "Turtle - Terse RDF Triple
|
390
|
+
Language", January 2008,
|
391
|
+
<http://www.w3.org/TeamSubmission/turtle/>.
|
392
|
+
|
393
|
+
[W3CDTF] Wolf, M. and C. Wicksteed, "Date and Time Formats (W3C
|
394
|
+
profile of ISO8601)",
|
395
|
+
<http://www.w3.org/TR/NOTE-datetime>.
|
396
|
+
|
397
|
+
|
398
|
+
Appendix A. Example profiles
|
399
|
+
|
400
|
+
The most important attribute of a Checkm profile is a globally unique
|
401
|
+
identifier, such as,
|
402
|
+
|
403
|
+
http://merritt.cdlib.org/registry/mrt-ingest-manifest
|
404
|
+
|
405
|
+
which applications can use for conditional processing. If, in
|
406
|
+
addition, this identifier is resolvable, it should return a text file
|
407
|
+
with the same format as a Checkm manifest but with no non-comment
|
408
|
+
lines. This file formally documents any particular ways in which the
|
409
|
+
first six Checkm fields may be restricted and what any additional
|
410
|
+
fields mean. As an example, the profile URL above corresponds to
|
411
|
+
|
412
|
+
#%checkm_0.7
|
413
|
+
#
|
414
|
+
# This is a profile definition for a "Merritt ingest" manifest.
|
415
|
+
#
|
416
|
+
#%profile | http://merritt.cdlib.org/registry/mrt-ingest-manifest
|
417
|
+
#%prefix | mrt: | http://merritt.cdlib.org/terms#
|
418
|
+
#%prefix | nfo: |
|
419
|
+
http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
|
420
|
+
#%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
|
421
|
+
nfo:fileSize | nfo:fileLastModified | nfo:fileName |
|
422
|
+
mrt:mimeType
|
423
|
+
|
424
|
+
In this example and the next, indented lines artificially occur where
|
425
|
+
long lines have been wrapped for display purposes. The profile below
|
426
|
+
uses Checkm inclusion lines as a way to describe "digital objects".
|
427
|
+
|
428
|
+
#%checkm_0.7
|
429
|
+
#
|
430
|
+
# This is a profile definition for a "Merritt batch" manifest.
|
431
|
+
# It is meant to be used with Checkm "inclusion" lines, as in
|
432
|
+
#
|
433
|
+
# @url | [alg] | [value] | [length] | | filename | [primary] [ | local ]
|
434
|
+
#
|
435
|
+
#%profile | http://merritt.cdlib.org/registry/mrt-batch-manifest
|
436
|
+
#%prefix | mrt: | http://merritt.cdlib.org/terms#
|
437
|
+
#%prefix | nfo: |
|
438
|
+
http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
|
439
|
+
#%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
|
440
|
+
nfo:fileSize | nfo:fileLastModified | nfo:fileName |
|
441
|
+
mrt:primaryIdentifier | mrt:localIdentifier
|
442
|
+
|
443
|
+
|
444
|
+
Authors' Addresses
|
445
|
+
|
446
|
+
John A. Kunze
|
447
|
+
California Digital Library
|
448
|
+
415 20th St, 4th Floor
|
449
|
+
Oakland, CA 94612
|
450
|
+
US
|
451
|
+
|
452
|
+
Email: jak@ucop.edu
|
453
|
+
|
454
|
+
Stephen Abrams
|
455
|
+
California Digital Library
|
456
|
+
415 20th St, 4th Floor
|
457
|
+
Oakland, CA 94612
|
458
|
+
US
|
459
|
+
|
460
|
+
Email: stephen.abrams@ucop.edu
|
461
|
+
|
462
|
+
David Loy
|
463
|
+
California Digital Library
|
464
|
+
415 20th St, 4th Floor
|
465
|
+
Oakland, CA 94612
|
466
|
+
US
|
467
|
+
|
468
|
+
Email: david.loy@ucop.edu
|