libis-format 0.9.32 → 0.9.33
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data/types.yml +30 -16
- data/lib/libis/format/config.rb +7 -18
- data/lib/libis/format/converter/image_converter.rb +6 -0
- data/lib/libis/format/droid.rb +82 -25
- data/lib/libis/format/extension_identification.rb +55 -0
- data/lib/libis/format/fido.rb +57 -72
- data/lib/libis/format/file_tool.rb +76 -0
- data/lib/libis/format/identification_tool.rb +174 -0
- data/lib/libis/format/identifier.rb +129 -117
- data/lib/libis/format/type_database.rb +36 -5
- data/lib/libis/format/version.rb +1 -1
- data/lib/libis/format.rb +3 -0
- data/libis-format.gemspec +2 -1
- data/spec/converter_spec.rb +6 -4
- data/spec/identifier_spec.rb +125 -34
- metadata +21 -126
- data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
- data/tools/droid/container-signature-20170330.xml +0 -3584
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +0 -152
- data/tools/droid/droid.sh +0 -152
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-help-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -50
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
- data/tools/fido/conf/container-signature-20170330.xml +0 -3584
- data/tools/fido/conf/dc.xsd +0 -119
- data/tools/fido/conf/dcmitype.xsd +0 -53
- data/tools/fido/conf/dcterms.xsd +0 -383
- data/tools/fido/conf/fido-formats.xsd +0 -173
- data/tools/fido/conf/format_extension_template.xml +0 -105
- data/tools/fido/conf/format_extensions.xml +0 -484
- data/tools/fido/conf/formats-v90.xml +0 -48877
- data/tools/fido/conf/pronom-xml-v90.zip +0 -0
- data/tools/fido/conf/versions.xml +0 -8
- data/tools/fido/fido.bat +0 -4
- data/tools/fido/fido.py +0 -884
- data/tools/fido/fido.sh +0 -5
- data/tools/fido/package.py +0 -96
- data/tools/fido/prepare.py +0 -645
- data/tools/fido/pronomutils.py +0 -200
- data/tools/fido/toxml.py +0 -60
- data/tools/fido/update_signatures.py +0 -183
data/tools/fido/prepare.py
DELETED
@@ -1,645 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
"""Format Identification for Digital Objects."""
|
5
|
-
|
6
|
-
from __future__ import print_function
|
7
|
-
|
8
|
-
from argparse import ArgumentParser
|
9
|
-
import hashlib
|
10
|
-
import sys
|
11
|
-
from xml.dom import minidom
|
12
|
-
from xml.etree import ElementTree as ET
|
13
|
-
import zipfile
|
14
|
-
|
15
|
-
from six.moves import cStringIO
|
16
|
-
from six.moves.urllib.request import urlopen
|
17
|
-
from six.moves.urllib.parse import urlparse
|
18
|
-
|
19
|
-
from .pronomutils import get_local_pronom_versions
|
20
|
-
|
21
|
-
|
22
|
-
# \a\b\n\r\t\v
|
23
|
-
# MdR: took out '<' and '>' out of _ordinary because they were converted to entities <>
|
24
|
-
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
|
25
|
-
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
26
|
-
_special = '$()*+.?![]^\\{|}'
|
27
|
-
_hex = '0123456789abcdef'
|
28
|
-
|
29
|
-
|
30
|
-
class NS(object):
    """
    Helper class for XML name spaces in ElementTree.

    Use like MYNS = NS("{http://some/uri}") and then MYNS("tag1/tag2") to
    qualify every step of a path, or MYNS.tag1 to qualify a single tag.
    """

    def __init__(self, uri):
        """Store `uri`, the text that is prefixed to every tag."""
        self.uri = uri

    def __getattr__(self, tag):
        """
        Return `tag` prefixed with the namespace URI.

        Only called when normal attribute lookup fails. Dunder names and
        `uri` itself raise AttributeError instead of being treated as tags:
        otherwise protocol probes such as `__setstate__` (used by `copy` and
        `pickle`) would receive a string, and a missing `uri` would recurse
        forever.
        """
        if tag == 'uri' or (tag.startswith('__') and tag.endswith('__')):
            raise AttributeError(tag)
        return self.uri + tag

    def __call__(self, path):
        """Return `path` with each "/"-separated step prefixed by the URI."""
        return "/".join(getattr(self, tag) for tag in path.split("/"))


XHTML = NS("{http://www.w3.org/1999/xhtml}")  # XHTML namespace
TNA = NS("{http://pronom.nationalarchives.gov.uk}")  # TNA namespace
|
52
|
-
|
53
|
-
|
54
|
-
def get_text_tna(element, tag, default=''):
    """
    Return the stripped text of a TNA-namespaced child of `element`.

    `tag` may be a single tag or a "/"-separated path; it is qualified with
    the TNA namespace before the lookup. `default` is returned when the node
    is missing or has no text.
    """
    node = element.find(TNA(tag))
    text = None if node is None else node.text
    return default if text is None else text.strip()
|
60
|
-
|
61
|
-
|
62
|
-
def prettify(elem):
    """Serialise `elem` as UTF-8 and return it re-rendered by minidom as indented XML."""
    serialized = ET.tostring(elem, 'UTF-8')
    return minidom.parseString(serialized).toprettyxml(indent=" ")
|
67
|
-
|
68
|
-
|
69
|
-
class FormatInfo:
    """Convert PRONOM formats into FIDO signatures."""

    def __init__(self, pronom_files, format_list=None):
        """
        Instantiate class, take the PRONOM input and an optional list of formats.

        `pronom_files` is handed to `zipfile.ZipFile` by `load_pronom_xml`.
        `format_list` defaults to None (rather than a shared mutable list).
        """
        self.info = {}
        self.formats = []
        self.pronom_files = pronom_files
        for f in format_list or ():
            self.add_format(f)  # FIXME: add_format is undefined!

    def save(self, dst=sys.stdout):
        """
        Write the fido XML format definitions to @param dst.

        `dst` may be a path, which is opened in binary mode, or an object
        with a `write` method such as the default `sys.stdout`. A trailing
        newline is written after the XML.
        """
        root = ET.Element('formats', {
            'version': '0.3',
            'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
            'xmlns:dc': "http://purl.org/dc/elements/1.1/",
            'xmlns:dcterms': "http://purl.org/dc/terms/"
        })
        # MdR: formats without a signature are written too, because a puid
        # might be matched on extension.
        for f in self.formats:
            root.append(f)
        self.indent(root)
        data = ET.tostring(root)
        if hasattr(dst, 'write'):
            # Text streams want str; on Python 3 tostring() returns ASCII bytes.
            if not isinstance(data, str):
                data = data.decode('ascii')
            dst.write(data + '\n')
        else:
            with open(dst, 'wb') as file_:
                file_.write(data)
                file_.write(b'\n')

    def indent(self, elem, level=0):
        """Indent output by rewriting whitespace-only text/tail of `elem` and its descendants in place."""
        i = "\n" + level * " "
        if len(elem):
            if not elem.text or not elem.text.strip():
                elem.text = i + " "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            last = None
            for last in elem:
                self.indent(last, level + 1)
            # The last child's tail closes this element, so it goes back to
            # the parent's indentation level.
            if not last.tail or not last.tail.strip():
                last.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i

    def load_pronom_xml(self, puid_filter=None):
        """
        Load the pronom XML from self.pronom_files and convert it to fido XML.

        As a side-effect, set self.formats to a list of ElementTree.Element.
        If @param puid_filter is specified, only that one will be loaded.
        If the archive cannot be opened, an error is printed to stderr and
        the process exits with status 1.
        """
        formats = []
        try:
            archive = zipfile.ZipFile(self.pronom_files, 'r')
        except (EnvironmentError, zipfile.BadZipfile) as e:
            print("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e), file=sys.stderr)
            sys.exit(1)
        with archive:
            for item in archive.infolist():
                with archive.open(item) as stream:
                    # Work is done here!
                    format_ = self.parse_pronom_xml(stream, puid_filter)
                if format_ is not None:
                    formats.append(format_)
        # Replace the formatID with puids in has_priority_over
        if puid_filter is None:
            id_map = {}
            for element in formats:
                id_map[element.find('pronom_id').text] = element.find('puid').text
            for element in formats:
                for rel in element.findall('has_priority_over'):
                    rel.text = id_map[rel.text]

        # _sort_formats returns a new list; keep it rather than discarding it.
        self.formats = self._sort_formats(formats)

    @staticmethod
    def _ensure_url_scheme(url):
        """Return `url` unchanged if it already has a scheme, else prefixed with "http://"."""
        if urlparse(url).scheme:
            return url
        return "http://" + url

    def parse_pronom_xml(self, source, puid_filter=None):
        """
        Parse PRONOM XML and convert into FIDO XML.

        If @param puid_filter is specified, only that one will be loaded;
        any other record yields None.
        Note: the URL of every reference file is downloaded in order to
        compute its MD5 checksum.
        @return ET.ElementTree Element representing it.
        """
        pronom_root = ET.parse(source).getroot()
        pronom_format = pronom_root.find(TNA('report_format_detail/FileFormat'))
        fido_format = ET.Element('format')
        identifiers = pronom_format.findall(TNA('FileFormatIdentifier'))

        # Get the base Format information
        puid = None  # stays None when the record has no PUID identifier
        for identifier in identifiers:
            if get_text_tna(identifier, 'IdentifierType') == 'PUID':
                puid = get_text_tna(identifier, 'Identifier')
                ET.SubElement(fido_format, 'puid').text = puid
                if puid_filter and puid != puid_filter:
                    return None
        # A bit clumsy. I want to have puid first, then mime, then container.
        containers = {'x-fmt/263': 'zip', 'x-fmt/265': 'tar'}
        for identifier in identifiers:
            id_type = get_text_tna(identifier, 'IdentifierType')
            if id_type == 'MIME':
                ET.SubElement(fido_format, 'mime').text = get_text_tna(identifier, 'Identifier')
            elif id_type == 'PUID':
                container = containers.get(get_text_tna(identifier, 'Identifier'))
                if container is not None:
                    ET.SubElement(fido_format, 'container').text = container
        ET.SubElement(fido_format, 'name').text = get_text_tna(pronom_format, 'FormatName')
        ET.SubElement(fido_format, 'version').text = get_text_tna(pronom_format, 'FormatVersion')
        ET.SubElement(fido_format, 'alias').text = get_text_tna(pronom_format, 'FormatAliases')
        ET.SubElement(fido_format, 'pronom_id').text = get_text_tna(pronom_format, 'FormatID')
        # Get the extensions from the ExternalSignature
        for ext_sig in pronom_format.findall(TNA('ExternalSignature')):
            ET.SubElement(fido_format, 'extension').text = get_text_tna(ext_sig, 'Signature')
        for identifier in identifiers:
            if get_text_tna(identifier, 'IdentifierType') == 'Apple Uniform Type Identifier':
                ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(identifier, 'Identifier')
        # Handle the relationships
        related_formats = pronom_format.findall(TNA('RelatedFormat'))
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Has priority over':
                ET.SubElement(fido_format, 'has_priority_over').text = get_text_tna(related, 'RelatedFormatID')
        # Get the InternalSignature information
        for pronom_sig in pronom_format.findall(TNA('InternalSignature')):
            fido_sig = ET.SubElement(fido_format, 'signature')
            ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
            # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
            ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
            for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
                fido_pat = ET.SubElement(fido_sig, 'pattern')
                pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
                byte_sequence = get_text_tna(pronom_pat, 'ByteSequenceValue')
                offset = get_text_tna(pronom_pat, 'Offset')
                max_offset = get_text_tna(pronom_pat, 'MaxOffset')
                regex = convert_to_regex(byte_sequence, 'Little', pos, offset, max_offset)
                if regex == "__INCOMPATIBLE_SIG__":
                    print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
                    # remove the empty 'signature' nodes
                    # now that the signature is not compatible and thus "regex" is empty
                    # NOTE(review): this drops every 'signature' added to this
                    # format so far, not only the current one -- confirm intent.
                    for stale in fido_format.findall('signature'):
                        fido_format.remove(stale)
                    continue
                ET.SubElement(fido_pat, 'position').text = pos
                ET.SubElement(fido_pat, 'pronom_pattern').text = byte_sequence
                ET.SubElement(fido_pat, 'regex').text = regex
        # Get the format details
        fido_details = ET.SubElement(fido_format, 'details')
        ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
        ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
        ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
        ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
        # All supertype relations are emitted before all subtype relations.
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Is supertype of':
                ET.SubElement(fido_details, 'is_supertype_of').text = get_text_tna(related, 'RelatedFormatID')
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Is subtype of':
                ET.SubElement(fido_details, 'is_subtype_of').text = get_text_tna(related, 'RelatedFormatID')
        ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
        # References
        for document in pronom_format.findall(TNA("Document")):
            ref = ET.SubElement(fido_details, 'reference')
            ET.SubElement(ref, 'dc:title').text = get_text_tna(document, 'TitleText')
            ET.SubElement(ref, 'dc:creator').text = get_text_tna(document, 'Author/AuthorCompoundName')
            ET.SubElement(ref, 'dc:publisher').text = get_text_tna(document, 'Publisher/PublisherCompoundName')
            ET.SubElement(ref, 'dcterms:available').text = get_text_tna(document, 'PublicationDate')
            for identifier in document.findall(TNA('DocumentIdentifier')):
                id_type = get_text_tna(identifier, 'IdentifierType')
                if id_type == 'URL':
                    # Same rule as for reference files below: only add the
                    # scheme when the identifier does not already carry one.
                    ET.SubElement(ref, 'dc:identifier').text = self._ensure_url_scheme(get_text_tna(identifier, 'Identifier'))
                else:
                    ET.SubElement(ref, 'dc:identifier').text = id_type + ":" + get_text_tna(identifier, 'Identifier')
            ET.SubElement(ref, 'dc:description').text = get_text_tna(document, 'DocumentNote')
            ET.SubElement(ref, 'dc:type').text = get_text_tna(document, 'DocumentType')
            ET.SubElement(ref, 'dcterms:license').text = get_text_tna(document, 'AvailabilityDescription') + " " + get_text_tna(document, 'AvailabilityNote')
            ET.SubElement(ref, 'dc:rights').text = get_text_tna(document, 'DocumentIPR')
        # Examples
        for ref_file in pronom_format.findall(TNA("ReferenceFile")):
            example = ET.SubElement(fido_details, 'example_file')
            ET.SubElement(example, 'dc:title').text = get_text_tna(ref_file, 'ReferenceFileName')
            ET.SubElement(example, 'dc:description').text = get_text_tna(ref_file, 'ReferenceFileDescription')
            checksum = ""
            for identifier in ref_file.findall(TNA('ReferenceFileIdentifier')):
                id_type = get_text_tna(identifier, 'IdentifierType')
                if id_type == 'URL':
                    # Starting with PRONOM 89, some URLs contain http://
                    # and others do not.
                    url = self._ensure_url_scheme(get_text_tna(identifier, 'Identifier'))
                    ET.SubElement(example, 'dc:identifier').text = url
                    # And calculate the checksum of this resource:
                    digest = hashlib.md5()
                    sock = urlopen(url)
                    try:
                        digest.update(sock.read())
                    finally:
                        sock.close()
                    checksum = digest.hexdigest()
                else:
                    ET.SubElement(example, 'dc:identifier').text = id_type + ":" + get_text_tna(identifier, 'Identifier')
            ET.SubElement(example, 'dcterms:license').text = ""
            ET.SubElement(example, 'dc:rights').text = get_text_tna(ref_file, 'ReferenceFileIPR')
            checksum_element = ET.SubElement(example, 'checksum')
            checksum_element.text = checksum
            checksum_element.attrib['type'] = "md5"
        # Record Metadata
        md = ET.SubElement(fido_details, 'record_metadata')
        ET.SubElement(md, 'status').text = 'unknown'
        ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
        ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
        ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
        ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
        return fido_format

    # FIXME: I don't think that this quite works yet!
    def _sort_formats(self, formatlist):
        """
        Return a new list sorted on priority relationships so higher priority formats appear earlier.

        Formats without a direct priority relation are ordered by comparing
        their puid strings. `formatlist` itself is not modified.
        """
        # Imported here so the method works on both Python 2.7 and 3, where
        # sorted() no longer accepts cmp=.
        from functools import cmp_to_key

        def compare_formats(f1, f2):
            f1_id = f1.find('puid').text
            f2_id = f2.find('puid').text
            for worse in f1.findall('has_priority_over'):
                if worse.text == f2_id:
                    return -1
            for worse in f2.findall('has_priority_over'):
                if worse.text == f1_id:
                    return 1
            if f1_id < f2_id:
                return -1
            elif f1_id == f2_id:
                return 0
            else:
                return 1

        return sorted(formatlist, key=cmp_to_key(compare_formats))
|
324
|
-
|
325
|
-
|
326
|
-
def fido_position(pronom_position):
    """
    Return BOF/EOF/VAR/IFB instead of the more verbose pronom position names.

    An unrecognised position is reported on stderr and mapped to 'VAR' so
    that FIDO does not crash (IFB aftermath).
    """
    short_names = {
        'Absolute from BOF': 'BOF',
        'Absolute from EOF': 'EOF',
        'Variable': 'VAR',
        'Indirect From BOF': 'IFB',
    }
    if pronom_position in short_names:
        return short_names[pronom_position]
    sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
    return 'VAR'
|
339
|
-
|
340
|
-
|
341
|
-
def _convert_err_msg(msg, c, i, chars):
|
342
|
-
return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
|
343
|
-
|
344
|
-
|
345
|
-
def doByte(chars, i, littleendian):
    """
    Convert the two hex digits chars[i] and chars[i+1] into one escaped byte.

    With `littleendian` true the first digit is the high nibble, otherwise
    the second digit is. Raises Exception when either character is not a
    hex digit.

    @return a tuple (escaped byte text, 2); 2 is the number of characters consumed.
    """
    digits = '0123456789ABCDEF'
    first = digits.find(chars[i].upper())
    second = digits.find(chars[i + 1].upper())
    if min(first, second) < 0:
        raise Exception(_convert_err_msg('bad byte sequence', chars[i:i + 2], i, chars))
    code = 16 * first + second if littleendian else first + 16 * second
    return (escape(chr(code)), 2)
|
360
|
-
|
361
|
-
|
362
|
-
def _escape_char(c):
    """Return the regex-safe spelling of `c`: \\n, \\r, a backslash-escaped special, or \\xHH."""
    if c in '\n':
        return '\\n'
    if c == '\r':
        return '\\r'
    if c in _special:
        return '\\' + c
    # Everything else is written as a two-digit hex escape.
    high, low = divmod(ord(c), 16)
    return '\\x' + _hex[high] + _hex[low]
|
372
|
-
|
373
|
-
|
374
|
-
def escape(string):
    """Escape characters in pattern that are non-printable, non-ascii, or special for regexes."""
    pieces = []
    for c in string:
        # Characters in _ordinary pass through; all others are escaped.
        pieces.append(c if c in _ordinary else _escape_char(c))
    return ''.join(pieces)
|
377
|
-
|
378
|
-
|
379
|
-
def calculate_repetition(char, pos, offset, maxoffset):
    """
    Recursively calculate offset/maxoffset repetition for a regex.

    Needed when one or both offsets is greater than 65535 bytes (64KB): the
    repetition is split into several consecutive quantifiers of at most
    65535 each. See: https://bugs.python.org/issue13169.

    `offset` and `maxoffset` are decimal strings; `maxoffset` may be None.
    Returns the quantifier text for `char`; nothing is produced for a `pos`
    other than "BOF", "EOF" or "IFB".
    """
    limit = 65535
    parts = []

    # Remainders carried into the recursive call. They are kept as a string
    # and None respectively, because the code below concatenates them and
    # tests `maxoffset is not None`.
    calcremain = False
    offsetremain = '0'
    maxoffsetremain = None

    if offset is not None and int(offset) > limit:
        offsetremain = str(int(offset) - limit)
        offset = str(limit)
        calcremain = True
    if maxoffset is not None and int(maxoffset) > limit:
        maxoffsetremain = str(int(maxoffset) - limit)
        maxoffset = str(limit)
        calcremain = True

    if pos == "BOF" or pos == "EOF":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + maxoffset)
            parts.append('}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if pos == "IFB":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + maxoffset)
            parts.append('}')
            # NOTE(review): this appends ',}' after an already closed
            # quantifier; kept as in the original -- confirm intent.
            if maxoffset is not None:
                parts.append(',}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if calcremain:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offsetremain, maxoffsetremain))

    return ''.join(parts)
|
422
|
-
|
423
|
-
|
424
|
-
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
    """
    Convert a pronom byte sequence into a regular expression.

    @param chars, a pronom bytesequence
    @param endianness, only checked for the substring 'Big'
    @param pos, position code; 'BOF'/'IFB' anchor at the start, 'EOF' at the end
    @param offset, maxoffset, decimal strings; empty means 0 / no maximum
    @return the regular expression, or "__INCOMPATIBLE_SIG__" when a
            bracketed range is not of the form [aa:bb].
    Raises Exception on characters that are illegal in the current state.
    """
    littleendian = 'Big' not in endianness
    if len(offset) == 0:
        offset = '0'
    if len(maxoffset) == 0 or maxoffset == '0':
        maxoffset = None
    # make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
    global buf
    buf = cStringIO()
    buf.write("(?s)")  # If a regex starts with (?s), it is equivalent to DOTALL.

    def emit_byte(at):
        # Write the byte starting at `at` and return the index just past it.
        token, step = doByte(chars, at, littleendian)
        buf.write(token)
        return at + step

    if 'BOF' in pos:
        buf.write('\\A')  # start of regex
        buf.write(calculate_repetition('.', pos, offset, maxoffset))

    if 'IFB' in pos:
        buf.write('\\A')
        buf.write(calculate_repetition('.', pos, offset, maxoffset))

    i = 0
    state = 'start'
    while i != len(chars):
        if state == 'start':
            # Decide what kind of token begins at chars[i]; nothing is
            # consumed in this state.
            current = chars[i]
            if current.isalnum():
                state = 'bytes'
            elif current == '[' and chars[i + 1] == '!':
                state = 'non-match'
            elif current == '[':
                state = 'bracket'
            elif current == '{':
                state = 'curly'
            elif current == '(':
                state = 'paren'
            elif current in '*+?':
                state = 'specials'
            else:
                raise Exception(_convert_err_msg('Illegal character in start', chars[i], i, chars))
        elif state == 'bytes':
            i = emit_byte(i)
            state = 'start'
        elif state == 'non-match':
            buf.write('(!')
            i += 2  # skip the "[!"
            while True:
                if chars[i].isalnum():
                    i = emit_byte(i)
                elif chars[i] == ']':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in non-match', chars[i], i, chars))
            buf.write(')')
            i += 1
            state = 'start'
        elif state == 'bracket':
            # [aa:bb] becomes the character range [aa-bb].
            try:
                buf.write('[')
                i += 1
                i = emit_byte(i)
                if chars[i] != ':':
                    return "__INCOMPATIBLE_SIG__"
                buf.write('-')
                i += 1
                i = emit_byte(i)
                if chars[i] != ']':
                    return "__INCOMPATIBLE_SIG__"
                buf.write(']')
                i += 1
            except Exception:
                print(_convert_err_msg('Illegal character in bracket', chars[i], i, chars))
                raise
            if i < len(chars) and chars[i] == '{':
                state = 'curly-after-bracket'
            else:
                state = 'start'
        elif state == 'paren':
            # (aa|bb|...) becomes a non-capturing alternation.
            buf.write('(?:')
            i += 1
            while True:
                if chars[i].isalnum():
                    i = emit_byte(i)
                elif chars[i] == '|':
                    buf.write('|')
                    i += 1
                elif chars[i] == ')':
                    break
                # START fix FIDO-20
                elif chars[i] == '[':
                    buf.write('[')
                    i += 1
                    i = emit_byte(i)
                    if chars[i] != ':':
                        return "__INCOMPATIBLE_SIG__"
                    buf.write('-')
                    i += 1
                    i = emit_byte(i)
                    if chars[i] != ']':
                        return "__INCOMPATIBLE_SIG__"
                    buf.write(']')
                    i += 1
                else:
                    raise Exception(_convert_err_msg(('Current state = \'{0}\' : Illegal character in paren').format(state), chars[i], i, chars))
            buf.write(')')
            i += 1
            state = 'start'
            # END fix FIDO-20
        elif state in ['curly', 'curly-after-bracket']:
            # {nnnn} or {nnn-nnn} or {nnn-*}
            # {nnn} or {nnn,nnn} or {nnn,}
            # when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
            # The above, while sensible, appears to be incorrect. A '.' is always needed.
            # for droid equiv behavior
            buf.write('.')
            buf.write('{')
            i += 1  # skip the opening brace
            while True:
                if chars[i].isalnum():
                    buf.write(chars[i])
                    i += 1
                elif chars[i] == '-':
                    buf.write(',')
                    i += 1
                elif chars[i] == '*':  # skip the *
                    i += 1
                elif chars[i] == '}':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in curly', chars[i], i, chars))
            buf.write('}')
            i += 1  # skip the closing brace
            state = 'start'
        elif state == 'specials':
            if chars[i] == '*':
                buf.write('.*')
                i += 1
            elif chars[i] == '+':
                buf.write('.+')
                i += 1
            elif chars[i] == '?':
                # '?' is only accepted as the pair "??".
                if chars[i + 1] != '?':
                    raise Exception(_convert_err_msg('Illegal character after ?', chars[i + 1], i + 1, chars))
                buf.write('.?')
                i += 2
            state = 'start'
        else:
            raise Exception('Illegal state {0}'.format(state))

    if 'EOF' in pos:
        buf.write(calculate_repetition('.', pos, offset, maxoffset))
        buf.write('\\Z')

    result = buf.getvalue()
    buf.close()
    return result
|
613
|
-
|
614
|
-
|
615
|
-
def run(input=None, output=None, puid=None):
    """
    Convert PRONOM formats into FIDO signatures.

    `input` and `output` fall back to the zip file and signature file
    reported by get_local_pronom_versions(). `puid` is passed on as the
    filter to FormatInfo.load_pronom_xml.
    """
    versions = get_local_pronom_versions()
    source = versions.get_zip_file() if input is None else input
    target = versions.get_signature_file() if output is None else output

    info = FormatInfo(source)
    info.load_pronom_xml(puid)
    info.save(target)
    converted = len(info.formats)
    print('Converted {0} PRONOM formats to FIDO signatures'.format(converted), file=sys.stderr)
|
628
|
-
|
629
|
-
|
630
|
-
def main(args=None):
    """Main CLI entrypoint: parse -input/-output/-puid and hand them to run()."""
    parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
    options = (
        ('-input', 'Input file, a Zip containing PRONOM XML files'),
        ('-output', 'Ouptut file'),
        ('-puid', 'A particular PUID record to extract'),
    )
    for flag, help_text in options:
        parser.add_argument(flag, default=None, help=help_text)
    # Fall back to the process arguments when none are supplied.
    parsed = parser.parse_args(sys.argv[1:] if args is None else args)

    run(input=parsed.input, output=parsed.output, puid=parsed.puid)


if __name__ == '__main__':
    main()
|