libis-format 0.9.32 → 0.9.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data/types.yml +30 -16
- data/lib/libis/format/config.rb +7 -18
- data/lib/libis/format/converter/image_converter.rb +6 -0
- data/lib/libis/format/droid.rb +82 -25
- data/lib/libis/format/extension_identification.rb +55 -0
- data/lib/libis/format/fido.rb +57 -72
- data/lib/libis/format/file_tool.rb +76 -0
- data/lib/libis/format/identification_tool.rb +174 -0
- data/lib/libis/format/identifier.rb +129 -117
- data/lib/libis/format/type_database.rb +36 -5
- data/lib/libis/format/version.rb +1 -1
- data/lib/libis/format.rb +3 -0
- data/libis-format.gemspec +2 -1
- data/spec/converter_spec.rb +6 -4
- data/spec/identifier_spec.rb +125 -34
- metadata +21 -126
- data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
- data/tools/droid/container-signature-20170330.xml +0 -3584
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +0 -152
- data/tools/droid/droid.sh +0 -152
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-help-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -50
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
- data/tools/fido/conf/container-signature-20170330.xml +0 -3584
- data/tools/fido/conf/dc.xsd +0 -119
- data/tools/fido/conf/dcmitype.xsd +0 -53
- data/tools/fido/conf/dcterms.xsd +0 -383
- data/tools/fido/conf/fido-formats.xsd +0 -173
- data/tools/fido/conf/format_extension_template.xml +0 -105
- data/tools/fido/conf/format_extensions.xml +0 -484
- data/tools/fido/conf/formats-v90.xml +0 -48877
- data/tools/fido/conf/pronom-xml-v90.zip +0 -0
- data/tools/fido/conf/versions.xml +0 -8
- data/tools/fido/fido.bat +0 -4
- data/tools/fido/fido.py +0 -884
- data/tools/fido/fido.sh +0 -5
- data/tools/fido/package.py +0 -96
- data/tools/fido/prepare.py +0 -645
- data/tools/fido/pronomutils.py +0 -200
- data/tools/fido/toxml.py +0 -60
- data/tools/fido/update_signatures.py +0 -183
data/tools/fido/prepare.py
DELETED
|
@@ -1,645 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
"""Format Identification for Digital Objects."""
|
|
5
|
-
|
|
6
|
-
from __future__ import print_function
|
|
7
|
-
|
|
8
|
-
from argparse import ArgumentParser
|
|
9
|
-
import hashlib
|
|
10
|
-
import sys
|
|
11
|
-
from xml.dom import minidom
|
|
12
|
-
from xml.etree import ElementTree as ET
|
|
13
|
-
import zipfile
|
|
14
|
-
|
|
15
|
-
from six.moves import cStringIO
|
|
16
|
-
from six.moves.urllib.request import urlopen
|
|
17
|
-
from six.moves.urllib.parse import urlparse
|
|
18
|
-
|
|
19
|
-
from .pronomutils import get_local_pronom_versions
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
# \a\b\n\r\t\v
|
|
23
|
-
# MdR: took out '<' and '>' out of _ordinary because they were converted to entities <>
|
|
24
|
-
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
|
|
25
|
-
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
|
26
|
-
_special = '$()*+.?![]^\\{|}'
|
|
27
|
-
_hex = '0123456789abcdef'
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class NS:
    """
    Helper class for XML name spaces in ElementTree.

    Use like MYNS = NS("{http://some/uri}") and then MYNS("tag1/tag2"),
    which yields "{http://some/uri}tag1/{http://some/uri}tag2".
    Attribute access works too: MYNS.tag1 == "{http://some/uri}tag1".
    """

    def __init__(self, uri):
        """Instantiate class with `uri` argument (the "{namespace}" prefix)."""
        self.uri = uri

    def __getattr__(self, tag):
        """Return `tag` qualified with the namespace prefix.

        Only called for names that are not real attributes. Lookups for
        ``uri`` itself (possible while the instance is still empty, e.g.
        during copy/unpickle) and for dunder names probed by the interpreter
        raise AttributeError; qualifying them would either recurse forever
        or hand back a string where a protocol hook is expected.
        """
        if tag == 'uri' or (tag.startswith('__') and tag.endswith('__')):
            raise AttributeError(tag)
        return self.uri + tag

    def __call__(self, path):
        """Qualify every "/"-separated step of `path` with the namespace."""
        # Prefix directly instead of going through getattr(): a step named
        # like a real attribute (e.g. "uri") must still be qualified.
        return "/".join(self.uri + tag for tag in path.split("/"))


XHTML = NS("{http://www.w3.org/1999/xhtml}")  # XHTML namespace
TNA = NS("{http://pronom.nationalarchives.gov.uk}")  # TNA namespace
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def get_text_tna(element, tag, default=''):
    """Return the stripped text found at `tag` (a tag or path) below `element`.

    The lookup is done in the TNA namespace. `default` is returned when no
    such node exists or when the node has no text.
    """
    node = element.find(TNA(tag))
    text = None if node is None else node.text
    return default if text is None else text.strip()
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def prettify(elem):
    """Serialize `elem` and return it as a pretty-printed XML string."""
    # Round-trip through minidom, which provides the pretty printer.
    serialized = ET.tostring(elem, 'UTF-8')
    return minidom.parseString(serialized).toprettyxml(indent=" ")
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
class FormatInfo:
    """Convert PRONOM formats into FIDO signatures.

    Attributes set by the constructor:
        info: empty dict (not used by the methods of this class).
        formats: list of fido ``format`` elements, filled by load_pronom_xml().
        pronom_files: handed to zipfile.ZipFile by load_pronom_xml().
    """

    def __init__(self, pronom_files, format_list=None):
        """Instantiate class, take a list of PRONOM files and an optional list of formats."""
        self.info = {}
        self.formats = []
        self.pronom_files = pronom_files
        # `None` instead of a shared mutable `[]` default.
        for f in format_list or ():
            self.add_format(f)  # FIXME: add_format is undefined!

    def save(self, dst=sys.stdout):
        """Write the fido XML format definitions to @param dst.

        `dst` may be a file name or an object with a ``write`` method (such
        as the default, sys.stdout). The serialized XML is followed by one
        newline.
        """
        tree = ET.ElementTree(ET.Element('formats', {
            'version': '0.3',
            'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
            'xmlns:dc': "http://purl.org/dc/elements/1.1/",
            'xmlns:dcterms': "http://purl.org/dc/terms/",
        }))
        root = tree.getroot()
        # MdR: puids without a signature used to be skipped here, but we want
        # them ALL because a puid might be matched on extension.
        for f in self.formats:
            root.append(f)
        self.indent(root)

        # ET.tostring() returns bytes on Python 3; write them as bytes
        # instead of print()-ing their repr into a binary file.
        payload = ET.tostring(root) + b'\n'
        if hasattr(dst, 'write'):
            # Text streams such as sys.stdout expose their byte layer as
            # `.buffer`; plain text sinks get the decoded form instead.
            sink = getattr(dst, 'buffer', dst)
            try:
                sink.write(payload)
            except TypeError:
                sink.write(payload.decode('ascii'))
        else:
            with open(dst, 'wb') as file_:
                file_.write(payload)

    def indent(self, elem, level=0):
        """Indent output by rewriting whitespace-only text/tail in place."""
        i = "\n" + level * " "
        if len(elem):
            if not elem.text or not elem.text.strip():
                elem.text = i + " "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            last = None
            for child in elem:
                self.indent(child, level + 1)
                last = child
            # The closing tag of `elem` follows the last child, so that
            # child's tail carries the parent's indentation.
            if not last.tail or not last.tail.strip():
                last.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i

    def load_pronom_xml(self, puid_filter=None):
        """
        Load the pronom XML from self.pronom_files and convert it to fido XML.

        As a side-effect, set self.formats to a list of ElementTree.Element.
        If a @param puid_filter is specified, only that one will be loaded.
        If the archive cannot be opened, an error is printed to stderr and
        the process exits with a non-zero status.
        """
        formats = []
        try:
            archive = zipfile.ZipFile(self.pronom_files, 'r')
        except Exception as e:
            print("An error occurred loading '{0}' (exception: {1})".format(self.pronom_files, e), file=sys.stderr)
            sys.exit(1)
        with archive:
            for item in archive.infolist():
                with archive.open(item) as stream:
                    # Work is done here!
                    format_ = self.parse_pronom_xml(stream, puid_filter)
                if format_ is not None:
                    formats.append(format_)
        # Replace the formatID with puids in has_priority_over
        if puid_filter is None:
            id_map = {}
            for element in formats:
                id_map[element.find('pronom_id').text] = element.find('puid').text
            for element in formats:
                for rel in element.findall('has_priority_over'):
                    rel.text = id_map[rel.text]

        # NOTE(review): the sorted copy is discarded, so self.formats keeps
        # the archive order. Left as found: applying the comparator (see the
        # FIXME on _sort_formats) would reorder the generated signature file.
        # Confirm the intent before changing this.
        self._sort_formats(formats)
        self.formats = formats

    @staticmethod
    def _with_scheme(url):
        """Return `url` prefixed with ``http://`` unless it already names a scheme."""
        if urlparse(url).scheme:
            return url
        return "http://" + url

    def parse_pronom_xml(self, source, puid_filter=None):
        """
        Parse PRONOM XML and convert into FIDO XML.

        If a @param puid_filter is specified, only that one will be loaded.
        @return ET.ElementTree Element representing it, or None when the
        report's PUID does not match the filter.
        """
        pronom_xml = ET.parse(source)
        pronom_root = pronom_xml.getroot()
        pronom_format = pronom_root.find(TNA('report_format_detail/FileFormat'))
        fido_format = ET.Element('format')
        identifiers = pronom_format.findall(TNA('FileFormatIdentifier'))
        related_formats = pronom_format.findall(TNA('RelatedFormat'))

        # Get the base Format information
        puid = None  # stays None when the report carries no PUID identifier
        for ident in identifiers:
            if get_text_tna(ident, 'IdentifierType') == 'PUID':
                puid = get_text_tna(ident, 'Identifier')
                ET.SubElement(fido_format, 'puid').text = puid
                if puid_filter and puid != puid_filter:
                    return None
        # A bit clumsy. I want to have puid first, then mime, then container.
        for ident in identifiers:
            ident_type = get_text_tna(ident, 'IdentifierType')
            if ident_type == 'MIME':
                ET.SubElement(fido_format, 'mime').text = get_text_tna(ident, 'Identifier')
            elif ident_type == 'PUID':
                container_puid = get_text_tna(ident, 'Identifier')
                if container_puid == 'x-fmt/263':
                    ET.SubElement(fido_format, 'container').text = 'zip'
                elif container_puid == 'x-fmt/265':
                    ET.SubElement(fido_format, 'container').text = 'tar'
        ET.SubElement(fido_format, 'name').text = get_text_tna(pronom_format, 'FormatName')
        ET.SubElement(fido_format, 'version').text = get_text_tna(pronom_format, 'FormatVersion')
        ET.SubElement(fido_format, 'alias').text = get_text_tna(pronom_format, 'FormatAliases')
        ET.SubElement(fido_format, 'pronom_id').text = get_text_tna(pronom_format, 'FormatID')
        # Get the extensions from the ExternalSignature
        for x in pronom_format.findall(TNA('ExternalSignature')):
            ET.SubElement(fido_format, 'extension').text = get_text_tna(x, 'Signature')
        for ident in identifiers:
            if get_text_tna(ident, 'IdentifierType') == 'Apple Uniform Type Identifier':
                ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(ident, 'Identifier')
        # Handle the relationships
        for x in related_formats:
            if get_text_tna(x, 'RelationshipType') == 'Has priority over':
                ET.SubElement(fido_format, 'has_priority_over').text = get_text_tna(x, 'RelatedFormatID')
        # Get the InternalSignature information
        for pronom_sig in pronom_format.findall(TNA('InternalSignature')):
            fido_sig = ET.SubElement(fido_format, 'signature')
            ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
            # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
            ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
            for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
                fido_pat = ET.SubElement(fido_sig, 'pattern')
                pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
                byte_sequence = get_text_tna(pronom_pat, 'ByteSequenceValue')
                offset = get_text_tna(pronom_pat, 'Offset')
                max_offset = get_text_tna(pronom_pat, 'MaxOffset')
                regex = convert_to_regex(byte_sequence, 'Little', pos, offset, max_offset)
                if regex == "__INCOMPATIBLE_SIG__":
                    print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
                    # The signature is not compatible, so drop the signature
                    # nodes. NOTE(review): this removes every 'signature'
                    # child collected so far for this format, not only the
                    # current one -- behaviour kept as found, confirm intent.
                    for stale in fido_format.findall('signature'):
                        fido_format.remove(stale)
                    continue
                ET.SubElement(fido_pat, 'position').text = pos
                ET.SubElement(fido_pat, 'pronom_pattern').text = byte_sequence
                ET.SubElement(fido_pat, 'regex').text = regex
        # Get the format details
        fido_details = ET.SubElement(fido_format, 'details')
        ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
        ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
        ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
        ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
        for x in related_formats:
            if get_text_tna(x, 'RelationshipType') == 'Is supertype of':
                ET.SubElement(fido_details, 'is_supertype_of').text = get_text_tna(x, 'RelatedFormatID')
        for x in related_formats:
            if get_text_tna(x, 'RelationshipType') == 'Is subtype of':
                ET.SubElement(fido_details, 'is_subtype_of').text = get_text_tna(x, 'RelatedFormatID')
        ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
        # References
        for x in pronom_format.findall(TNA("Document")):
            r = ET.SubElement(fido_details, 'reference')
            ET.SubElement(r, 'dc:title').text = get_text_tna(x, 'TitleText')
            ET.SubElement(r, 'dc:creator').text = get_text_tna(x, 'Author/AuthorCompoundName')
            ET.SubElement(r, 'dc:publisher').text = get_text_tna(x, 'Publisher/PublisherCompoundName')
            ET.SubElement(r, 'dcterms:available').text = get_text_tna(x, 'PublicationDate')
            for ident in x.findall(TNA('DocumentIdentifier')):
                ident_type = get_text_tna(ident, 'IdentifierType')
                if ident_type == 'URL':
                    # Same rule as for reference files below: only add the
                    # scheme when the identifier does not already have one.
                    ET.SubElement(r, 'dc:identifier').text = self._with_scheme(get_text_tna(ident, 'Identifier'))
                else:
                    ET.SubElement(r, 'dc:identifier').text = ident_type + ":" + get_text_tna(ident, 'Identifier')
            ET.SubElement(r, 'dc:description').text = get_text_tna(x, 'DocumentNote')
            ET.SubElement(r, 'dc:type').text = get_text_tna(x, 'DocumentType')
            ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription') + " " + get_text_tna(x, 'AvailabilityNote')
            ET.SubElement(r, 'dc:rights').text = get_text_tna(x, 'DocumentIPR')
        # Examples
        for x in pronom_format.findall(TNA("ReferenceFile")):
            rf = ET.SubElement(fido_details, 'example_file')
            ET.SubElement(rf, 'dc:title').text = get_text_tna(x, 'ReferenceFileName')
            ET.SubElement(rf, 'dc:description').text = get_text_tna(x, 'ReferenceFileDescription')
            checksum = ""
            for ident in x.findall(TNA('ReferenceFileIdentifier')):
                ident_type = get_text_tna(ident, 'IdentifierType')
                if ident_type == 'URL':
                    # Starting with PRONOM 89, some URLs contain http://
                    # and others do not.
                    url = self._with_scheme(get_text_tna(ident, 'Identifier'))
                    ET.SubElement(rf, 'dc:identifier').text = url
                    # And calculate the checksum of this resource.
                    # NOTE(review): this downloads whatever URL the PRONOM
                    # report names, without a timeout.
                    digest = hashlib.md5()
                    sock = urlopen(url)
                    try:
                        digest.update(sock.read())
                    finally:
                        sock.close()
                    checksum = digest.hexdigest()
                else:
                    ET.SubElement(rf, 'dc:identifier').text = ident_type + ":" + get_text_tna(ident, 'Identifier')
            ET.SubElement(rf, 'dcterms:license').text = ""
            ET.SubElement(rf, 'dc:rights').text = get_text_tna(x, 'ReferenceFileIPR')
            checksum_element = ET.SubElement(rf, 'checksum')
            checksum_element.text = checksum
            checksum_element.attrib['type'] = "md5"
        # Record Metadata
        md = ET.SubElement(fido_details, 'record_metadata')
        ET.SubElement(md, 'status').text = 'unknown'
        ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
        ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
        ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
        ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
        return fido_format

    # FIXME: I don't think that this quite works yet!
    def _sort_formats(self, formatlist):
        """Return a copy of `formatlist` sorted on priority relationships.

        A format that lists another one in ``has_priority_over`` is placed
        before it; otherwise formats are ordered by puid text.
        """
        # sorted() lost its cmp= argument in Python 3; cmp_to_key works on
        # both Python 2.7 and 3.
        from functools import cmp_to_key

        def compare_formats(f1, f2):
            f1_id = f1.find('puid').text
            f2_id = f2.find('puid').text
            for worse in f1.findall('has_priority_over'):
                if worse.text == f2_id:
                    return -1
            for worse in f2.findall('has_priority_over'):
                if worse.text == f1_id:
                    return 1
            if f1_id < f2_id:
                return -1
            if f1_id == f2_id:
                return 0
            return 1

        return sorted(formatlist, key=cmp_to_key(compare_formats))
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
def fido_position(pronom_position):
    """Map a verbose PRONOM position name to its short code.

    Returns 'BOF', 'EOF', 'VAR' or 'IFB'. An unrecognised name is reported
    on stderr and treated as 'VAR' so that FIDO does not crash (IFB
    aftermath).
    """
    short_codes = {
        'Absolute from BOF': 'BOF',
        'Absolute from EOF': 'EOF',
        'Variable': 'VAR',
        'Indirect From BOF': 'IFB',
    }
    if pronom_position in short_codes:
        return short_codes[pronom_position]
    sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
    return 'VAR'
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
def _convert_err_msg(msg, c, i, chars):
|
|
342
|
-
return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
def doByte(chars, i, littleendian):
    """
    Convert the two hex digits chars[i] and chars[i+1] into one escaped byte.

    With `littleendian` true the first digit is the high nibble, otherwise
    the two nibbles are swapped. Raises Exception when either character is
    not a hex digit.

    @return a tuple (escaped byte, 2)
    """
    digits = '0123456789ABCDEF'
    first = digits.find(chars[i].upper())
    second = digits.find(chars[i + 1].upper())
    if first < 0 or second < 0:
        raise Exception(_convert_err_msg('bad byte sequence', chars[i:i + 2], i, chars))
    code = 16 * first + second if littleendian else first + 16 * second
    return (escape(chr(code)), 2)
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
def _escape_char(c):
    """Return the regex-safe spelling of the single character `c`.

    Newline and carriage return become \\n and \\r, characters listed in
    _special get a leading backslash, everything else is written as \\xNN.
    """
    if c in '\n':
        return '\\n'
    if c == '\r':
        return '\\r'
    if c in _special:
        return '\\' + c
    high, low = divmod(ord(c), 16)
    return '\\x' + _hex[high] + _hex[low]
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
def escape(string):
    """Escape characters in pattern that are non-printable, non-ascii, or special for regexes."""
    pieces = []
    for char in string:
        # Characters in _ordinary pass through; the rest are escaped.
        pieces.append(char if char in _ordinary else _escape_char(char))
    return ''.join(pieces)
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
def calculate_repetition(char, pos, offset, maxoffset):
    """Return the regex repetition of `char` for an offset/maxoffset pair.

    A single quantifier cannot exceed 65535 (64KB), see
    https://bugs.python.org/issue13169, so larger values are split
    recursively into a chain such as ``.{65535}.{4465}``.

    @param char: the regex atom to repeat (convert_to_regex passes '.').
    @param pos: 'BOF', 'EOF' or 'IFB'; any other value yields ''.
    @param offset: minimum repetition as a decimal string ('0' for none).
    @param maxoffset: maximum repetition as a decimal string, or None.
    @return the quantifier text, possibly empty.
    """
    limit = 65535
    parts = []
    overflow = False
    # Kept as the int 0 (not '0') so the remainder of a max-only overflow is
    # still written as an explicit "{0,n}" by the branches below.
    offset_rest = 0
    # None, not 0: when there is no maximum left over, the recursive call
    # must not try to append one (this used to fail with a TypeError for an
    # offset above 65535 without a maxoffset).
    maxoffset_rest = None

    if offset is not None and int(offset) > limit:
        offset_rest = str(int(offset) - limit)
        offset = str(limit)
        overflow = True
    if maxoffset is not None and int(maxoffset) > limit:
        maxoffset_rest = str(int(maxoffset) - limit)
        maxoffset = str(limit)
        overflow = True

    if pos in ("BOF", "EOF"):
        if offset != '0':
            quantifier = char + '{' + str(offset)
            if maxoffset is not None:
                quantifier += ',' + maxoffset
            parts.append(quantifier + '}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if pos == "IFB":
        if offset != '0':
            quantifier = char + '{' + str(offset)
            if maxoffset is not None:
                quantifier += ',' + maxoffset
            parts.append(quantifier + '}')
            if maxoffset is not None:
                # NOTE(review): this appends a literal ",}" after the
                # quantifier; kept as found -- confirm the intended IFB form.
                parts.append(',}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if overflow:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offset_rest, maxoffset_rest))

    return ''.join(parts)
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
    """
    Convert a PRONOM byte sequence into a regular expression.

    @param chars: the PRONOM byte sequence.
    @param endianness: when it contains 'Big', doByte() is asked to swap the
        nibbles of every byte; any other value leaves them as written.
    @param pos: position code; 'BOF'/'IFB' anchor the pattern at the start,
        'EOF' at the end.
    @param offset, maxoffset: decimal strings; an empty offset means '0',
        an empty or '0' maxoffset means "no maximum".
    @return the regular expression, or "__INCOMPATIBLE_SIG__" when a
        bracket expression is not of the form [xx:yy].
    """
    # buf is global so that _convert_err_msg can print it while debugging (MdR)
    global buf

    littleendian = 'Big' not in endianness
    if len(offset) == 0:
        offset = '0'
    if len(maxoffset) == 0 or maxoffset == '0':
        maxoffset = None

    buf = cStringIO()
    emit = buf.write
    emit("(?s)")  # If a regex starts with (?s), it is equivalent to DOTALL.
    for anchor in ('BOF', 'IFB'):
        if anchor in pos:
            emit('\\A')  # start of regex
            emit(calculate_repetition('.', pos, offset, maxoffset))

    idx = 0
    mode = 'start'
    while idx != len(chars):
        if mode == 'start':
            # Decide how to read the token at idx; nothing is consumed here.
            head = chars[idx]
            if head.isalnum():
                mode = 'bytes'
            elif head == '[' and chars[idx + 1] == '!':
                mode = 'non-match'
            elif head == '[':
                mode = 'bracket'
            elif head == '{':
                mode = 'curly'
            elif head == '(':
                mode = 'paren'
            elif head in '*+?':
                mode = 'specials'
            else:
                raise Exception(_convert_err_msg('Illegal character in start', chars[idx], idx, chars))
        elif mode == 'bytes':
            (piece, step) = doByte(chars, idx, littleendian)
            emit(piece)
            idx += step
            mode = 'start'
        elif mode == 'non-match':
            emit('(!')
            idx += 2
            while True:
                if chars[idx].isalnum():
                    (piece, step) = doByte(chars, idx, littleendian)
                    emit(piece)
                    idx += step
                elif chars[idx] == ']':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in non-match', chars[idx], idx, chars))
            emit(')')
            idx += 1
            mode = 'start'

        elif mode == 'bracket':
            try:
                emit('[')
                idx += 1
                (piece, step) = doByte(chars, idx, littleendian)
                emit(piece)
                idx += step
                # only [xx:yy] ranges are supported
                if chars[idx] != ':':
                    return "__INCOMPATIBLE_SIG__"
                emit('-')
                idx += 1
                (piece, step) = doByte(chars, idx, littleendian)
                emit(piece)
                idx += step
                if chars[idx] != ']':
                    return "__INCOMPATIBLE_SIG__"
                emit(']')
                idx += 1
            except Exception:
                print(_convert_err_msg('Illegal character in bracket', chars[idx], idx, chars))
                raise
            if idx < len(chars) and chars[idx] == '{':
                mode = 'curly-after-bracket'
            else:
                mode = 'start'
        elif mode == 'paren':
            emit('(?:')
            idx += 1
            while True:
                if chars[idx].isalnum():
                    (piece, step) = doByte(chars, idx, littleendian)
                    emit(piece)
                    idx += step
                elif chars[idx] == '|':
                    emit('|')
                    idx += 1
                elif chars[idx] == ')':
                    break
                # START fix FIDO-20
                elif chars[idx] == '[':
                    emit('[')
                    idx += 1
                    (piece, step) = doByte(chars, idx, littleendian)
                    emit(piece)
                    idx += step
                    if chars[idx] != ':':
                        return "__INCOMPATIBLE_SIG__"
                    emit('-')
                    idx += 1
                    (piece, step) = doByte(chars, idx, littleendian)
                    emit(piece)
                    idx += step

                    if chars[idx] != ']':
                        return "__INCOMPATIBLE_SIG__"
                    emit(']')
                    idx += 1
                else:
                    raise Exception(_convert_err_msg("Current state = '{0}' : Illegal character in paren".format(mode), chars[idx], idx, chars))
            emit(')')
            idx += 1
            mode = 'start'
            # END fix FIDO-20
        elif mode in ['curly', 'curly-after-bracket']:
            # {nnnn} or {nnn-nnn} or {nnn-*}
            # {nnn} or {nnn,nnn} or {nnn,}
            # When there is a curly-after-bracket, the {m,n} would sensibly
            # apply to the bracketed item, but that appears to be incorrect:
            # a '.' is always needed for droid-equivalent behaviour.
            emit('.')
            emit('{')
            idx += 1  # skip the opening brace
            while True:
                if chars[idx].isalnum():
                    emit(chars[idx])
                    idx += 1
                elif chars[idx] == '-':
                    emit(',')
                    idx += 1
                elif chars[idx] == '*':  # skip the *
                    idx += 1
                elif chars[idx] == '}':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in curly', chars[idx], idx, chars))
            emit('}')
            idx += 1  # skip the closing brace
            mode = 'start'
        elif mode == 'specials':
            if chars[idx] == '*':
                emit('.*')
                idx += 1
            elif chars[idx] == '+':
                emit('.+')
                idx += 1
            elif chars[idx] == '?':
                if chars[idx + 1] != '?':
                    raise Exception(_convert_err_msg('Illegal character after ?', chars[idx + 1], idx + 1, chars))
                emit('.?')
                idx += 2
            mode = 'start'
        else:
            raise Exception('Illegal state {0}'.format(mode))

    if 'EOF' in pos:
        emit(calculate_repetition('.', pos, offset, maxoffset))
        emit('\\Z')

    result = buf.getvalue()
    buf.close()
    return result
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
def run(input=None, output=None, puid=None):
    """Convert PRONOM formats into FIDO signatures.

    `input` and `output` default to the zip file and signature file reported
    by get_local_pronom_versions(); `puid` optionally restricts the
    conversion to a single PUID.
    """
    versions = get_local_pronom_versions()

    source = versions.get_zip_file() if input is None else input
    target = versions.get_signature_file() if output is None else output

    info = FormatInfo(source)
    info.load_pronom_xml(puid)
    info.save(target)
    converted = len(info.formats)
    print('Converted {0} PRONOM formats to FIDO signatures'.format(converted), file=sys.stderr)
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
def main(args=None):
    """Main CLI entrypoint.

    Parses `args` (sys.argv[1:] when None) and passes the -input, -output
    and -puid options on to run().
    """
    if args is None:
        args = sys.argv[1:]

    parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
    parser.add_argument('-input', default=None, help='Input file, a Zip containing PRONOM XML files')
    # Help text previously read 'Ouptut file'.
    parser.add_argument('-output', default=None, help='Output file')
    parser.add_argument('-puid', default=None, help='A particular PUID record to extract')
    args = parser.parse_args(args)

    run(input=args.input, output=args.output, puid=args.puid)


if __name__ == '__main__':
    main()
|