text-checkm 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +312 -0
- data/.idea/$CACHE_FILE$ +26 -0
- data/.idea/.gitignore +6 -0
- data/.idea/go.imports.xml +6 -0
- data/.idea/inspectionProfiles/Project_Default.xml +17 -0
- data/.idea/inspectionProfiles/profiles_settings.xml +7 -0
- data/.idea/misc.xml +9 -0
- data/.idea/modules.xml +8 -0
- data/.idea/vcs.xml +6 -0
- data/.rubocop.yml +144 -0
- data/.ruby-version +1 -0
- data/.simplecov +7 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +98 -0
- data/LICENSE.md +23 -0
- data/README.md +23 -0
- data/Rakefile +36 -0
- data/SPEC.txt +468 -0
- data/lib/text/checkm.rb +2 -0
- data/lib/text/checkm/checksum.rb +40 -0
- data/lib/text/checkm/entry.rb +77 -0
- data/lib/text/checkm/manifest.rb +115 -0
- data/lib/text/checkm/module_info.rb +16 -0
- data/rakelib/bundle.rake +8 -0
- data/rakelib/coverage.rake +5 -0
- data/rakelib/rubocop.rake +16 -0
- data/spec/.rubocop.yml +19 -0
- data/spec/data/merritt-manifest.checkm +9 -0
- data/spec/data/myfirst.checkm +5 -0
- data/spec/data/test_1/1 +1 -0
- data/spec/data/two-level-manifest.checkm +8 -0
- data/spec/lib/text/checkm/checksum_spec.rb +24 -0
- data/spec/lib/text/checkm/entry_spec.rb +52 -0
- data/spec/lib/text/checkm/manifest_spec.rb +157 -0
- data/spec/spec_helper.rb +23 -0
- data/text-checkm.gemspec +29 -0
- data/text-checkm.iml +51 -0
- metadata +236 -0
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.7.0
|
data/.simplecov
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
# 0.7 (22 July 2020)
|
2
|
+
|
3
|
+
- fork from [ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827)
|
4
|
+
(last MIT-licensed version)
|
5
|
+
- update for Ruby 2.7.0
|
6
|
+
- rename module from `Checkm` to `Text::Checkm` to avoid name collisions
|
7
|
+
- bump version to 0.7 to match [spec](SPEC.txt) version
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
text-checkm (0.7)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: http://rubygems.org/
|
8
|
+
specs:
|
9
|
+
ast (2.4.1)
|
10
|
+
builder (3.2.4)
|
11
|
+
bundle-audit (0.1.0)
|
12
|
+
bundler-audit
|
13
|
+
bundler-audit (0.7.0.1)
|
14
|
+
bundler (>= 1.2.0, < 3)
|
15
|
+
thor (>= 0.18, < 2)
|
16
|
+
ci_reporter (2.0.0)
|
17
|
+
builder (>= 2.1.2)
|
18
|
+
ci_reporter_rspec (1.0.0)
|
19
|
+
ci_reporter (~> 2.0)
|
20
|
+
rspec (>= 2.14, < 4)
|
21
|
+
colorize (0.8.1)
|
22
|
+
diff-lcs (1.4.4)
|
23
|
+
docile (1.3.2)
|
24
|
+
ffi (1.13.1)
|
25
|
+
io-console (0.5.6)
|
26
|
+
irb (1.2.4)
|
27
|
+
reline (>= 0.0.1)
|
28
|
+
json (2.3.1)
|
29
|
+
listen (3.1.5)
|
30
|
+
rb-fsevent (~> 0.9, >= 0.9.4)
|
31
|
+
rb-inotify (~> 0.9, >= 0.9.7)
|
32
|
+
ruby_dep (~> 1.2)
|
33
|
+
parallel (1.19.2)
|
34
|
+
parser (2.7.1.4)
|
35
|
+
ast (~> 2.4.1)
|
36
|
+
rainbow (3.0.0)
|
37
|
+
rake (13.0.1)
|
38
|
+
rb-fsevent (0.10.4)
|
39
|
+
rb-inotify (0.10.1)
|
40
|
+
ffi (~> 1.0)
|
41
|
+
regexp_parser (1.7.1)
|
42
|
+
reline (0.1.4)
|
43
|
+
io-console (~> 0.5)
|
44
|
+
rexml (3.2.4)
|
45
|
+
rspec (3.9.0)
|
46
|
+
rspec-core (~> 3.9.0)
|
47
|
+
rspec-expectations (~> 3.9.0)
|
48
|
+
rspec-mocks (~> 3.9.0)
|
49
|
+
rspec-core (3.9.2)
|
50
|
+
rspec-support (~> 3.9.3)
|
51
|
+
rspec-expectations (3.9.2)
|
52
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
53
|
+
rspec-support (~> 3.9.0)
|
54
|
+
rspec-mocks (3.9.1)
|
55
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
56
|
+
rspec-support (~> 3.9.0)
|
57
|
+
rspec-support (3.9.3)
|
58
|
+
rubocop (0.86.0)
|
59
|
+
parallel (~> 1.10)
|
60
|
+
parser (>= 2.7.0.1)
|
61
|
+
rainbow (>= 2.2.2, < 4.0)
|
62
|
+
regexp_parser (>= 1.7)
|
63
|
+
rexml
|
64
|
+
rubocop-ast (>= 0.0.3, < 1.0)
|
65
|
+
ruby-progressbar (~> 1.7)
|
66
|
+
unicode-display_width (>= 1.4.0, < 2.0)
|
67
|
+
rubocop-ast (0.2.0)
|
68
|
+
parser (>= 2.7.0.1)
|
69
|
+
ruby-progressbar (1.10.1)
|
70
|
+
ruby_dep (1.5.0)
|
71
|
+
simplecov (0.16.1)
|
72
|
+
docile (~> 1.1)
|
73
|
+
json (>= 1.8, < 3)
|
74
|
+
simplecov-html (~> 0.10.0)
|
75
|
+
simplecov-html (0.10.2)
|
76
|
+
simplecov-rcov (0.2.3)
|
77
|
+
simplecov (>= 0.4.1)
|
78
|
+
thor (1.0.1)
|
79
|
+
unicode-display_width (1.7.0)
|
80
|
+
|
81
|
+
PLATFORMS
|
82
|
+
ruby
|
83
|
+
|
84
|
+
DEPENDENCIES
|
85
|
+
bundle-audit
|
86
|
+
ci_reporter_rspec
|
87
|
+
colorize
|
88
|
+
irb
|
89
|
+
listen (>= 3.0.5, < 3.2)
|
90
|
+
rake (>= 13.0)
|
91
|
+
rspec-support
|
92
|
+
rubocop (= 0.86)
|
93
|
+
simplecov (~> 0.16.1)
|
94
|
+
simplecov-rcov
|
95
|
+
text-checkm!
|
96
|
+
|
97
|
+
BUNDLED WITH
|
98
|
+
2.1.2
|
data/LICENSE.md
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright © 2010 Chris Beer
|
4
|
+
|
5
|
+
Copyright © 2020 The Regents of the University of California
|
6
|
+
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
8
|
+
copy of this software and associated documentation files (the “Software”),
|
9
|
+
to deal in the Software without restriction, including without limitation
|
10
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
11
|
+
and/or sell copies of the Software, and to permit persons to whom the
|
12
|
+
Software is furnished to do so, subject to the following conditions:
|
13
|
+
|
14
|
+
The above copyright notice and this permission notice shall be included in
|
15
|
+
all copies or substantial portions of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
20
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
22
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
23
|
+
DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Text::Checkm
|
2
|
+
|
3
|
+
Ruby implementation of
|
4
|
+
[Checkm](https://confluence.ucop.edu/display/Curation/Checkm), a
|
5
|
+
general-purpose text-based file manifest format developed at the
|
6
|
+
[California Digital Library](https://cdlib.org/) by John Kunze, Stephen
|
7
|
+
Abrams, and David Loy.
|
8
|
+
|
9
|
+
(See [SPEC.txt](SPEC.txt) in this repository for details.)
|
10
|
+
|
11
|
+
## Copyright
|
12
|
+
|
13
|
+
Based on [checkm](https://github.com/ruby-microservices/checkm/), copyright
|
14
|
+
© 2010 Chris Beer. (For compliance with [UC policies on the use and
|
15
|
+
creation of open source
|
16
|
+
software](https://security.ucop.edu/resources/open-source-software-licensing.html),
|
17
|
+
this library is based on
|
18
|
+
[ruby-microservices/checkm@b7a23d6](https://github.com/ruby-microservices/checkm/tree/b7a23d6a72af643cb9554bf16ff49fc27eded827),
|
19
|
+
the last MIT-licensed revision.)
|
20
|
+
|
21
|
+
Subsequent work © 2020 the Regents of the University of California. See
|
22
|
+
[LICENSE.md](LICENSE.md) for more details.
|
23
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('Gemfile', __dir__)
|
2
|
+
require 'bundler/setup' # Set up gems listed in the Gemfile.
|
3
|
+
|
4
|
+
# ------------------------------------------------------------
|
5
|
+
# Application code
|
6
|
+
|
7
|
+
File.expand_path('lib', __dir__).tap do |lib|
|
8
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
9
|
+
end
|
10
|
+
|
11
|
+
# ------------------------------------------------------------
|
12
|
+
# RSpec
|
13
|
+
|
14
|
+
require 'rspec/core/rake_task'
|
15
|
+
require 'ci/reporter/rake/rspec'
|
16
|
+
|
17
|
+
ENV['CI_REPORTS'] ||= File.expand_path('artifacts', __dir__)
|
18
|
+
|
19
|
+
namespace :spec do
|
20
|
+
desc 'Run all tests'
|
21
|
+
RSpec::Core::RakeTask.new(:all) do |task|
|
22
|
+
task.rspec_opts = %w[--color --format documentation --order default]
|
23
|
+
task.pattern = 'spec/**/*_spec.rb'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
desc 'Run all tests'
|
28
|
+
task spec: ['spec:all']
|
29
|
+
|
30
|
+
# ------------------------------------------------------------
|
31
|
+
# Custom tasks
|
32
|
+
|
33
|
+
desc 'Run tests, check test coverage, check code style'
|
34
|
+
task default: %i[coverage rubocop bundle:audit]
|
35
|
+
|
36
|
+
# TODO: YARD
|
data/SPEC.txt
ADDED
@@ -0,0 +1,468 @@
|
|
1
|
+
---------------------------------------------------------------------------
|
2
|
+
|
3
|
+
NOTE: This is an unofficial copy of the Checkm 0.7 specification,
|
4
|
+
adapted from a diff between the 0.3 and 0.7 specifications captured by
|
5
|
+
the Internet Archive's Wayback Machine on 23 October 2015 and retrieved
|
6
|
+
on 22 July 2020. It is provided here for reference only, and should not
|
7
|
+
be considered normative. (D. Moles, 2020-07-22)
|
8
|
+
|
9
|
+
---------------------------------------------------------------------------
|
10
|
+
|
11
|
+
Repository Working Group J. Kunze
|
12
|
+
S. Abrams
|
13
|
+
D. Loy
|
14
|
+
California Digital Library
|
15
|
+
|
16
|
+
June 11, 2010
|
17
|
+
|
18
|
+
Checkm: a checksum-based manifest format (v0.7)
|
19
|
+
|
20
|
+
|
21
|
+
Abstract
|
22
|
+
|
23
|
+
Checkm is a general-purpose text-based file manifest format. Each
|
24
|
+
line of a Checkm manifest is a set of '|'-separated tokens, the first
|
25
|
+
of which identifies the corresponding digital content by filename or
|
26
|
+
URL. Other tokens identify digest algorithm, checksum, content
|
27
|
+
length, and modification time. Tokens may be left unspecified with
|
28
|
+
empty fields or by ending the line early, the degenerate case being a
|
29
|
+
simple file list. It is up to tools that use the Checkm format to
|
30
|
+
specify any further restrictions on tokens (e.g., allowed defaults
|
31
|
+
and digest algorithms) and on overall manifest completeness and
|
32
|
+
coherence. A structured comment mechanism permits a way to document
|
33
|
+
extensions and restrictions. Checkm is designed to support tools that
|
34
|
+
verify the bit-level integrity of groups of files in support of such
|
35
|
+
things as content fixity, replication, import, and export. A manifest
|
36
|
+
may be single-level or multi-level (hierarchical), the latter being
|
37
|
+
useful, for example, in harvesting material from very large web sites
|
38
|
+
(cf. sitemaps).
|
39
|
+
|
40
|
+
|
41
|
+
1. Checkm overview
|
42
|
+
|
43
|
+
Checkm (pronounced "check 'em") is a simple text-based manifest
|
44
|
+
format for digital content. A manifest is a set of lines, each of
|
45
|
+
which describes a unit of content via up to six '|'-separated
|
46
|
+
tokens. The meaning of a token is given by its position within the
|
47
|
+
line. For example, the first three tokens give the name of the
|
48
|
+
content, a checksum algorithm, and a digest (checksum) computed using
|
49
|
+
that algorithm, respectively. Here's a manifest identifying two files
|
50
|
+
with MD5 checksums (not recommended for serious use but short enough
|
51
|
+
to fit in these examples).
|
52
|
+
|
53
|
+
#%checkm_0.7
|
54
|
+
# My first manifest. Two files total.
|
55
|
+
# Filename |Algorithm| Digest
|
56
|
+
book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
|
57
|
+
images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
|
58
|
+
|
59
|
+
Checkm is purely concerned with format and not with such things as
|
60
|
+
completeness and fitness for a given application. It defines the
|
61
|
+
meanings of the six tokens but does not mandate their use. For
|
62
|
+
example, a file package transfer tool could require use of four
|
63
|
+
tokens, but another tool designed for fixity checking might only
|
64
|
+
require two tokens. The next example is a bare-bones manifest in
|
65
|
+
which all but the first token have been dropped, in other words, it's
|
66
|
+
just a list of filenames or URLs, one per line. This is a useful
|
67
|
+
degenerate case when only a list of named units of content is needed.
|
68
|
+
|
69
|
+
#%checkm_0.7
|
70
|
+
# My second manifest. Just a list of files.
|
71
|
+
# Filename (no other tokens given)
|
72
|
+
http://example.org/i/chap9.xml
|
73
|
+
http://example.org/i/chap9fig2.png
|
74
|
+
|
75
|
+
To leave tokens unspecified that would occur in the middle rather
|
76
|
+
than at the end of a line, leave the corresponding fields empty. A
|
77
|
+
field is considered empty if the line terminates before it is reached
|
78
|
+
or if it consists only of linear whitespace, namely, zero or more
|
79
|
+
SPACE (hex 20) or TAB (hex 09) characters. For example, a package
|
80
|
+
transfer application that also renames files might use the following
|
81
|
+
manifest.
|
82
|
+
|
83
|
+
#%checkm_0.7
|
84
|
+
# My third manifest.
|
85
|
+
# Filename and Target specified, not Alg, Digest, Length, or ModTime
|
86
|
+
http://example.org/i/chap9.xml ||||| book/Chapter9.xml
|
87
|
+
http://example.org/i/chap9fig2.png ||||| images/r862.png
|
88
|
+
|
89
|
+
|
90
|
+
Each non-comment line can contain up to six tokens, and has the form,
|
91
|
+
|
92
|
+
[@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
|
93
|
+
|
94
|
+
where "[@]" indicates an optional '@' that causes the identified
|
95
|
+
content to be "included" as a manifest extension. In principle there
|
96
|
+
is no upper or lower limit on the number of lines in a Checkm
|
97
|
+
manifest; however, practical considerations may call for extending a
|
98
|
+
single-level manifest to a multi-level manifest.
|
99
|
+
|
100
|
+
|
101
|
+
2. Multi-level manifests
|
102
|
+
|
103
|
+
If supported, a multi-level manifest permits one large manifest to be
|
104
|
+
spread over a number of smaller manifests. To trigger this, the
|
105
|
+
SourceFileOrURL token that begins a line is preceded by a literal
|
106
|
+
'@'. It invokes a simple inclusion mechanism indicating that the
|
107
|
+
identified content is also in Checkm format and extends the current
|
108
|
+
manifest; this is similar to mainstream sitemap extension mechanisms
|
109
|
+
(cf. [SITEMAPS]). A tool can be said to support only single-level
|
110
|
+
Checkm if it does not support multi-level manifests.
|
111
|
+
|
112
|
+
Included manifests may themselves recursively include other
|
113
|
+
manifests. There is no limit either to the number of inclusions or to
|
114
|
+
the depth of a multi-level manifest. Cycles in the inclusion graph
|
115
|
+
are generally considered to be in poor taste.
|
116
|
+
|
117
|
+
|
118
|
+
3. Checkm lines and tokens
|
119
|
+
|
120
|
+
Manifest lines end with either LF (hex 0a) or CRLF (hex 0d0a). Blank
|
121
|
+
lines are ignored. Lines that begin with '#' are considered
|
122
|
+
"comments" that are to be ignored by processors except for those
|
123
|
+
implementing Checkm extensions (described later).
|
124
|
+
|
125
|
+
Checkm tokens on a given line all relate to the unit of content or to
|
126
|
+
the extended functionality identified by the first token on the line.
|
127
|
+
A unit of content is a contiguous sequence of octets (for most
|
128
|
+
purposes this is a "file") identified by a filename or URL.
|
129
|
+
|
130
|
+
Tokens consist of UTF-8 characters [RFC3629] separated by a '|'
|
131
|
+
character (hex 7c). Any linear whitespace found at the start or end
|
132
|
+
of a token is ignored. Any characters not allowed in a token or in a
|
133
|
+
URL, such as '|' or whitespace, may be represented using URL percent-
|
134
|
+
encoding [RFC3986].
|
135
|
+
|
136
|
+
Tokens may be left unspecified by simply dropping them from the end
|
137
|
+
of the line or by leaving the field empty (zero or more linear
|
138
|
+
whitespace characters). Checkm is silent about which tokens are
|
139
|
+
required or prohibited and what defaults may be in effect. Checkm is
|
140
|
+
also silent about manifest completeness (which units of content must
|
141
|
+
be included) and hyper-specification (whether one unit of content can
|
142
|
+
or must have more than one line describing it, e.g., resulting from
|
143
|
+
two digest algorithms).
|
144
|
+
|
145
|
+
|
146
|
+
4. Content lines
|
147
|
+
|
148
|
+
The (up to) six tokens on a non-comment line look like this
|
149
|
+
|
150
|
+
[@]SourceFileOrURL | Alg | Digest | Length | ModTime | TargetFileOrURL
|
151
|
+
TOKEN NUMBER: 1 2 3 4 5 6
|
152
|
+
|
153
|
+
The token's numbered position determines its meaning, as explained in
|
154
|
+
the correspondingly numbered subsections below.
|
155
|
+
|
156
|
+
Any extra fields at positions 7 and higher are considered to be Checkm
|
157
|
+
extensions.
|
158
|
+
|
159
|
+
4.1. [@]SourceFileOrURL: content identifier
|
160
|
+
|
161
|
+
The SourceFileOrURL token identifies digital content, and may be
|
162
|
+
given as '-' to indicate that the content may be found on the
|
163
|
+
equivalent of Unix "stdin". This token may be a URL or a relative or
|
164
|
+
absolute filename. To prevent interpretation of a relative pathname
|
165
|
+
that begins with '#' or '@', one can insert "./" in front of the
|
166
|
+
name. Whether this token is a filename or a URL, any characters not
|
167
|
+
allowed in a URL must be represented using URL percent-encoding
|
168
|
+
[RFC3986].
|
169
|
+
|
170
|
+
If any SourceFileOrURL token in a manifest is preceded by the
|
171
|
+
optional '@', the line amounts to an "include" statement and the
|
172
|
+
manifest is considered to be "multi-level". Other tokens on that line
|
173
|
+
still relate to the content but the "included" content itself is
|
174
|
+
considered to be an extension of the current manifest. For example, a
|
175
|
+
multi-level Checkm manifest totaling 4 million lines could be
|
176
|
+
represented by a 2000-line manifest, each line of which references a
|
177
|
+
2000-line single-level manifest.
|
178
|
+
|
179
|
+
If none of the lines in a manifest is preceded by '@', the manifest
|
180
|
+
is considered to be "single-level". It is permissible for a tool that
|
181
|
+
conforms to Checkm to declare support for only single-level
|
182
|
+
manifests.
|
183
|
+
|
184
|
+
4.2. Alg: algorithm
|
185
|
+
|
186
|
+
Alg is either the literal string "dir" (designating a directory), a
|
187
|
+
string specifying a cryptographic checksum algorithm, or empty to
|
188
|
+
leave it unspecified. The special case of "dir" is useful for listing
|
189
|
+
an empty directory, which has neither a fixed octetstream over which
|
190
|
+
to compute a digest nor a contained filename to imply the directory's
|
191
|
+
existence. For example,
|
192
|
+
|
193
|
+
|
194
|
+
#%checkm_0.7
|
195
|
+
# My fourth manifest. Two files and a directory.
|
196
|
+
# Filename |Algorithm| Digest
|
197
|
+
book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
|
198
|
+
icons/ | dir
|
199
|
+
images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
|
200
|
+
|
201
|
+
Implementors of tools that use Checkm are strongly encouraged to
|
202
|
+
support at least two widely implemented checksum algorithms:
|
203
|
+
|
204
|
+
"md5" [RFC1321]
|
205
|
+
|
206
|
+
"sha1" [RFC3174]
|
207
|
+
|
208
|
+
"sha256" [FIPS180-2]
|
209
|
+
|
210
|
+
When using other algorithms, the name of the algorithm should be
|
211
|
+
normalized for use in the manifest's filename, by lowercasing the
|
212
|
+
common name of the algorithm, and removing all non-alphanumeric
|
213
|
+
characters.
|
214
|
+
|
215
|
+
4.3. Digest: computed checksum
|
216
|
+
|
217
|
+
Digest is a string representing the checksum calculated according to
|
218
|
+
the Alg algorithm over the content, or empty to leave it unspecified.
|
219
|
+
|
220
|
+
|
221
|
+
4.4. Length of content
|
222
|
+
|
223
|
+
Length is the number (base 10) of octets in the identified content,
|
224
|
+
or empty to leave it unspecified. It is typically useful in
|
225
|
+
providing a rapid test for altered content and for estimating file
|
226
|
+
transfer times.
|
227
|
+
|
228
|
+
|
229
|
+
4.5. ModTime: time last modified
|
230
|
+
|
231
|
+
ModTime is a lexically sort-friendly date such as [TEMPER]
|
232
|
+
('YYYYMMDDhhmmss') or [W3CDTF] (YYYY-MM-DDThh:mm:ss), or empty to
|
233
|
+
leave it unspecified. It should represent the UTC time when the
|
234
|
+
content was last modified and is typically useful in incremental or
|
235
|
+
priority harvesting of content (cf. [OAI] and [SITEMAPS]).
|
236
|
+
|
237
|
+
|
238
|
+
4.6. TargetFileOrURL: other location
|
239
|
+
|
240
|
+
TargetFileOrURL is a secondary location for the content that
|
241
|
+
applications would use as necessary. For instance, a transfer tool
|
242
|
+
that also renames files could use this token as the destination name.
|
243
|
+
|
244
|
+
|
245
|
+
5. Extensions: structured comment lines
|
246
|
+
|
247
|
+
Comment lines that begin with a token of the form '#%_symbol_' are
|
248
|
+
special structured comment lines that usually indicate specific
|
249
|
+
optional functionality that extends the core Checkm specification.
|
250
|
+
Matching against a _symbol_ is case-insensitive (e.g., #%foo is
|
251
|
+
equivalent to #%FOO). The rest of a structured comment line is
|
252
|
+
tokenized in the same way as non-comment lines. The structured
|
253
|
+
comment symbols that follow are currently reserved.
|
254
|
+
|
255
|
+
5.1. Optional extension: #%checkm_0.7
|
256
|
+
|
257
|
+
It is highly recommended that the first line of a Checkm manifest be
|
258
|
+
of the form
|
259
|
+
|
260
|
+
#%checkm_M.N
|
261
|
+
|
262
|
+
where M.N identify major and minor version numbers. The current
|
263
|
+
version is 0.7.
|
264
|
+
|
265
|
+
5.2. Optional extension: #%eof
|
266
|
+
|
267
|
+
A line consisting of
|
268
|
+
|
269
|
+
#%eof
|
270
|
+
|
271
|
+
is reserved as an explicit end of manifest file marker. It can be
|
272
|
+
used to distinguish manifests that might be empty because of an error
|
273
|
+
from those that are deliberately empty.
|
274
|
+
|
275
|
+
5.3. Optional extension: #%fields
|
276
|
+
|
277
|
+
To precisely identify all fields in a given Checkm manifest, before
|
278
|
+
any non-comment lines include a line of the form
|
279
|
+
|
280
|
+
#%fields | Field_Id | ...
|
281
|
+
|
282
|
+
containing one or more instances of a Field_Id, each identifying the
|
283
|
+
corresponding manifest field. A Field_Id may be a simple string
|
284
|
+
suggestive of the respective field's function or it may be a globally
|
285
|
+
unique URL. If a Field_Id URL is resolvable, it should document any
|
286
|
+
restriction or extension in effect. The #%fields structured comment
|
287
|
+
may form part of a #%profile definition.
|
288
|
+
|
289
|
+
Semantics of the basic fields 1 through 6 may not be altered except
|
290
|
+
to narrow their meanings, such as to restrict the values of field 2
|
291
|
+
to one particular algorithm. Semantics of the extension fields (7 and
|
292
|
+
higher) may be defined at will.
|
293
|
+
|
294
|
+
|
295
|
+
5.4. Optional extension: #%prefix
|
296
|
+
|
297
|
+
To define an abbreviation for a long URL in a manner reminiscent of
|
298
|
+
Turtle [Turtle], before any use of the abbreviation include a line of
|
299
|
+
the form
|
300
|
+
|
301
|
+
#%prefix | Abbrev: | URL
|
302
|
+
|
303
|
+
where Abbrev (which may be empty) is a "prefix" that will stand in
|
304
|
+
for the given URL when it is used in other structured comments (and not
|
305
|
+
in non-comment lines). The #%prefix structured comment may form part
|
306
|
+
of a #%profile definition.
|
307
|
+
|
308
|
+
5.5. Optional extension: #%profile
|
309
|
+
|
310
|
+
To declare that a Checkm manifest conforms to a specific profile,
|
311
|
+
before any non-comment lines include a line of the form
|
312
|
+
|
313
|
+
#%profile | ProfileURL
|
314
|
+
|
315
|
+
where ProfileURL is a unique identifier for a specific profile. If
|
316
|
+
the URL is resolvable, it should document any restrictions and
|
317
|
+
extensions. Some example profiles appear in an appendix.
|
318
|
+
|
319
|
+
|
320
|
+
6. Conformance Terminology
|
321
|
+
|
322
|
+
A tool that uses the Checkm format should document which parts of the
|
323
|
+
format it supports. For example, documentation should state what
|
324
|
+
extensions, if any, are in use. One common restriction could be
|
325
|
+
expressed something like,
|
326
|
+
|
327
|
+
"... which must be a single-level, 3-column Checkm manifest with
|
328
|
+
relative filenames."
|
329
|
+
|
330
|
+
This terminology suggests that, for this particular tool, an
|
331
|
+
exception or undefined behavior is the likely result of supplying a
|
332
|
+
Checkm manifest that has any line beginning with '@', a URL, or an
|
333
|
+
absolute pathname, or that has any line with more than or fewer than
|
334
|
+
3 tokens.
|
335
|
+
|
336
|
+
|
337
|
+
7. Example two-level Checkm manifest
|
338
|
+
|
339
|
+
#%checkm_0.7
|
340
|
+
# A two-level manifest.
|
341
|
+
|
342
|
+
#Filename |Alg |Checksum |Length
|
343
|
+
foo.bar |sha1|2eacd0da7aa89b094f5121eb2901bf4de2219ef1 | 366
|
344
|
+
foo.bar |md5 |3e83471320227c0797a0c251f28db0c5 | 366
|
345
|
+
# This next line "includes" the manifest in file "myfirst".
|
346
|
+
@myfirst |md5 |6ab96c8930621d50cef31da4df6d9ed8 | 264
|
347
|
+
|
348
|
+
where the included file "myfirst" contains 264 octets and lists two
|
349
|
+
files:
|
350
|
+
|
351
|
+
#%checkm_0.7
|
352
|
+
# My first manifest. Two files total.
|
353
|
+
# Filename |Algorithm| Digest
|
354
|
+
book/Chapter9.xml | md5 | 49afbd86a1ca9f34b677a3f09655eae9
|
355
|
+
images/r862.png | md5 | 408ad21d50cef31da4df6d9ed81b01a7
|
356
|
+
|
357
|
+
8. References
|
358
|
+
|
359
|
+
[FIPS180-2]
|
360
|
+
NIST, "FIPS 180-2: Secure Hash Standard (SHS)",
|
361
|
+
February 2004, <http://csrc.nist.gov/publications/fips/
|
362
|
+
fips180-2/fips180-2withchangenotice.pdf>.
|
363
|
+
|
364
|
+
[OAI] Lagoze, C. and H. Van de Sompel, "Open Archives Initiative
|
365
|
+
Protocol for Metadata Harvesting", June 2002, <http://
|
366
|
+
www.openarchives.org/OAI/openarchivesprotocol.html>.
|
367
|
+
|
368
|
+
[RFC1321] Rivest, R., "The MD5 Message-Digest Algorithm", RFC 1321,
|
369
|
+
April 1992.
|
370
|
+
|
371
|
+
[RFC3174] Eastlake, D. and P. Jones, "US Secure Hash Algorithm 1
|
372
|
+
(SHA1)", RFC 3174, September 2001.
|
373
|
+
|
374
|
+
[RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO
|
375
|
+
10646", STD 63, RFC 3629, November 2003.
|
376
|
+
|
377
|
+
[RFC3986] Berners-Lee, T., Fielding, R., and L. Masinter, "Uniform
|
378
|
+
Resource Identifier (URI): Generic Syntax", STD 66,
|
379
|
+
RFC 3986, January 2005.
|
380
|
+
|
381
|
+
[SITEMAPS]
|
382
|
+
sitemaps.org, "Sitemaps XML format", February 2008,
|
383
|
+
<http://sitemaps.org/protocol.php>.
|
384
|
+
|
385
|
+
[TEMPER] Blair, C. and J. Kunze, "Temporal Enumerated Ranges",
|
386
|
+
August 2007,
|
387
|
+
<http://www.cdlib.org/inside/diglib/ark/temperspec.pdf>.
|
388
|
+
|
389
|
+
[Turtle] Beckett, D. and T. Berners-Lee, "Turtle - Terse RDF Triple
|
390
|
+
Language", January 2008,
|
391
|
+
<http://www.w3.org/TeamSubmission/turtle/>.
|
392
|
+
|
393
|
+
[W3CDTF] Wolf, M. and C. Wicksteed, "Date and Time Formats (W3C
|
394
|
+
profile of ISO8601)",
|
395
|
+
<http://www.w3.org/TR/NOTE-datetime>.
|
396
|
+
|
397
|
+
|
398
|
+
Appendix A. Example profiles
|
399
|
+
|
400
|
+
The most important attribute of a Checkm profile is a globally unique
|
401
|
+
identifier, such as,
|
402
|
+
|
403
|
+
http://merritt.cdlib.org/registry/mrt-ingest-manifest
|
404
|
+
|
405
|
+
which applications can use for conditional processing. If, in
|
406
|
+
addition, this identifier is resolvable, it should return a text file
|
407
|
+
with the same format as a Checkm manifest but with no non-comment
|
408
|
+
lines. This file formally documents any particular ways in which the
|
409
|
+
first six Checkm fields may be restricted and what any additional
|
410
|
+
fields mean. As an example, the profile URL above corresponds to
|
411
|
+
|
412
|
+
#%checkm_0.7
|
413
|
+
#
|
414
|
+
# This is a profile definition for a "Merritt ingest" manifest.
|
415
|
+
#
|
416
|
+
#%profile | http://merritt.cdlib.org/registry/mrt-ingest-manifest
|
417
|
+
#%prefix | mrt: | http://merritt.cdlib.org/terms#
|
418
|
+
#%prefix | nfo: |
|
419
|
+
http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
|
420
|
+
#%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
|
421
|
+
nfo:fileSize | nfo:fileLastModified | nfo:fileName |
|
422
|
+
mrt:mimeType
|
423
|
+
|
424
|
+
In this example and the next, indented lines artificially occur where
|
425
|
+
long lines have been wrapped for display purposes. The profile below
|
426
|
+
uses Checkm inclusion lines as a way to describe "digital objects".
|
427
|
+
|
428
|
+
#%checkm_0.7
|
429
|
+
#
|
430
|
+
# This is a profile definition for a "Merritt batch" manifest.
|
431
|
+
# It is meant to be used with Checkm "inclusion" lines, as in
|
432
|
+
#
|
433
|
+
# @url | [alg] | [value] | [length] | | filename | [primary] [ | local ]
|
434
|
+
#
|
435
|
+
#%profile | http://merritt.cdlib.org/registry/mrt-batch-manifest
|
436
|
+
#%prefix | mrt: | http://merritt.cdlib.org/terms#
|
437
|
+
#%prefix | nfo: |
|
438
|
+
http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#
|
439
|
+
#%fields | nfo:fileUrl | nfo:hashAlgorithm | nfo:hashValue |
|
440
|
+
nfo:fileSize | nfo:fileLastModified | nfo:fileName |
|
441
|
+
mrt:primaryIdentifier | mrt:localIdentifier
|
442
|
+
|
443
|
+
|
444
|
+
Authors' Addresses
|
445
|
+
|
446
|
+
John A. Kunze
|
447
|
+
California Digital Library
|
448
|
+
415 20th St, 4th Floor
|
449
|
+
Oakland, CA 94612
|
450
|
+
US
|
451
|
+
|
452
|
+
Email: jak@ucop.edu
|
453
|
+
|
454
|
+
Stephen Abrams
|
455
|
+
California Digital Library
|
456
|
+
415 20th St, 4th Floor
|
457
|
+
Oakland, CA 94612
|
458
|
+
US
|
459
|
+
|
460
|
+
Email: stephen.abrams@ucop.edu
|
461
|
+
|
462
|
+
David Loy
|
463
|
+
California Digital Library
|
464
|
+
415 20th St, 4th Floor
|
465
|
+
Oakland, CA 94612
|
466
|
+
US
|
467
|
+
|
468
|
+
Email: david.loy@ucop.edu
|