libis-format 0.9.30 → 0.9.32
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/libis/format/converter/image_converter.rb +2 -2
- data/lib/libis/format/office_to_pdf.rb +1 -1
- data/lib/libis/format/version.rb +1 -1
- data/spec/converter_spec.rb +43 -27
- data/spec/data/test-options.png +0 -0
- data/spec/data/test.pdf.tif +0 -0
- data/tools/droid/{DROID_SignatureFile_V82.xml → DROID_SignatureFile_V90.xml} +8202 -701
- data/tools/droid/{container-signature-20150307.xml → container-signature-20170330.xml} +3584 -2235
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +152 -154
- data/tools/droid/droid.sh +30 -16
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/{droid-help-6.1.5.jar → droid-help-6.3.jar} +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/fido/__init__.py +50 -0
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +2 -0
- data/tools/fido/conf/{container-signature-20150307.xml → container-signature-20170330.xml} +1487 -141
- data/tools/fido/conf/format_extensions.xml +0 -14
- data/tools/fido/conf/{formats-v81.xml → formats-v90.xml} +11409 -887
- data/tools/fido/conf/{pronom-xml-v81.zip → pronom-xml-v90.zip} +0 -0
- data/tools/fido/conf/versions.xml +6 -6
- data/tools/fido/fido.py +437 -407
- data/tools/fido/package.py +96 -0
- data/tools/fido/prepare.py +217 -188
- data/tools/fido/pronomutils.py +143 -58
- data/tools/fido/toxml.py +54 -46
- data/tools/fido/update_signatures.py +139 -127
- metadata +34 -40
- data/tools/droid/droid-command-line-6.1.5.jar +0 -0
- data/tools/droid/lib/antlr-2.7.7.jar +0 -0
- data/tools/droid/lib/antlr-3.2.jar +0 -0
- data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
- data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-codec-1.4.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
- data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
- data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
- data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
- data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
- data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
- data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
- data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/poi-3.7.jar +0 -0
- data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
- data/tools/fido/argparselocal.py +0 -2355
- data/tools/fido/conf/DROID_SignatureFile-v81.xml +0 -2
@@ -0,0 +1,96 @@
|
|
1
|
+
"""Support for containers."""
|
2
|
+
|
3
|
+
import re
|
4
|
+
import zipfile
|
5
|
+
|
6
|
+
import olefile
|
7
|
+
from six import iteritems
|
8
|
+
|
9
|
+
|
10
|
+
class Package(object):
    """Shared signature-matching helpers for container packages."""

    def _process_puid_map(self, data, puid_map):
        """Return the PUIDs from ``puid_map`` whose signatures match ``data``.

        ``puid_map`` maps each PUID to a list of signature dicts; the PUIDs
        are visited in the map's own iteration order.
        """
        matched = []
        for puid, signatures in iteritems(puid_map):
            matched += self._process_matches(data, puid, signatures)
        return matched

    def _process_matches(self, data, puid, signatures):
        """Return ``puid`` once for every signature that is found in ``data``.

        Each entry of ``signatures`` is a dict whose ``"signature"`` value is
        used as the regular expression searched for in ``data``.
        """
        return [
            puid
            for signature in signatures
            if re.search(signature["signature"], data)
        ]
|
27
|
+
|
28
|
+
|
29
|
+
class OlePackage(Package):
    """OlePackage supports OLE containers."""

    def __init__(self, ole, signatures):
        """Instantiate OlePackage object given the location of its file and signatures."""
        self.ole = ole
        self.signatures = signatures

    def detect_formats(self):
        """Detect available formats inside the OLE container.

        Returns the list of PUIDs whose container signatures match, or an
        empty list when the container cannot be opened (IOError).
        """
        try:
            ole = olefile.OleFileIO(self.ole)
        except IOError:
            return []

        results = []
        try:
            # Index the container's entries once instead of re-listing them
            # for every signature path. Each entry is reachable under its
            # full name and under its name minus the first character, because
            # the stored name may slightly differ from the signature path;
            # for example, \x01CompObj instead of CompObj. setdefault keeps
            # the first entry in listing order, as the original linear scan
            # did.
            entries = {}
            for parts in ole.listdir():
                name = '/'.join(parts)
                entries.setdefault(name, name)
                entries.setdefault(name[1:], name)

            for path, puid_map in iteritems(self.signatures):
                # Each OLE container signature lists the path of the file
                # inside the OLE on which it operates; if the file is
                # missing, there can be no match.
                filepath = entries.get(path)
                if filepath is None:
                    continue

                with ole.openstream(filepath) as stream:
                    contents = stream.read()
                results.extend(self._process_puid_map(contents, puid_map))
        finally:
            # Release the underlying file handle even if a stream read or a
            # signature match raises.
            ole.close()

        return results
|
66
|
+
|
67
|
+
|
68
|
+
class ZipPackage(Package):
    """ZipPackage supports Zip containers."""

    def __init__(self, zip_, signatures):
        """Instantiate ZipPackage object given the location of its file and signatures."""
        self.zip = zip_
        self.signatures = signatures

    def detect_formats(self):
        """Detect available formats inside the ZIP container.

        Returns the list of PUIDs whose container signatures match, or an
        empty list when the archive cannot be opened.
        """
        try:
            zip_ = zipfile.ZipFile(self.zip)
        except (zipfile.BadZipfile, RuntimeError, UnicodeDecodeError):
            return []

        results = []
        # The context manager closes the archive on every exit path.
        with zip_:
            # Build the member set once; membership is then O(1) per path.
            members = frozenset(zip_.namelist())
            for path, puid_map in iteritems(self.signatures):
                # Each ZIP container signature lists the path of the file
                # inside the ZIP on which it operates; if the file is
                # missing, there can be no match.
                if path not in members:
                    continue

                # Extract the requested file from the ZIP only once, and pass
                # the same data to each signature that requires it.
                with zip_.open(path) as id_file:
                    contents = id_file.read()
                results.extend(self._process_puid_map(contents, puid_map))

        return results
|
data/tools/fido/prepare.py
CHANGED
@@ -1,88 +1,112 @@
|
|
1
|
-
|
1
|
+
#!/usr/bin/env python
|
2
2
|
# -*- coding: utf-8 -*-
|
3
|
-
# Format Identification for Digital Objects
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
import
|
8
|
-
|
9
|
-
|
10
|
-
import cStringIO, zipfile, os
|
4
|
+
"""Format Identification for Digital Objects."""
|
5
|
+
|
6
|
+
from __future__ import print_function
|
7
|
+
|
8
|
+
from argparse import ArgumentParser
|
11
9
|
import hashlib
|
12
|
-
import
|
10
|
+
import sys
|
11
|
+
from xml.dom import minidom
|
13
12
|
from xml.etree import ElementTree as ET
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
13
|
+
import zipfile
|
14
|
+
|
15
|
+
from six.moves import cStringIO
|
16
|
+
from six.moves.urllib.request import urlopen
|
17
|
+
from six.moves.urllib.parse import urlparse
|
18
|
+
|
19
|
+
from .pronomutils import get_local_pronom_versions
|
20
|
+
|
21
|
+
|
22
|
+
# \a\b\n\r\t\v
|
23
|
+
# MdR: took '<' and '>' out of _ordinary because they were being converted to the entities &lt; and &gt;
|
24
|
+
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set; this was done to be on the safe side.
|
25
|
+
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
26
|
+
_special = '$()*+.?![]^\\{|}'
|
27
|
+
_hex = '0123456789abcdef'
|
28
|
+
|
18
29
|
|
19
30
|
class NS:
    """
    Prefix tag names with an XML namespace for use with ElementTree.

    Create it as MYNS = NS("{http://some/uri}"); MYNS.tag then yields the
    qualified tag and MYNS("tag1/tag2") qualifies every step of a path.
    """

    def __init__(self, uri):
        """Remember the namespace prefix `uri`."""
        self.uri = uri

    def __getattr__(self, tag):
        """Return `tag` prefixed with the namespace URI (only reached for unknown attributes)."""
        qualified = self.uri + tag
        return qualified

    def __call__(self, path):
        """Qualify each '/'-separated step of `path` and join the steps again."""
        steps = path.split("/")
        qualified_steps = [getattr(self, step) for step in steps]
        return "/".join(qualified_steps)
|
30
48
|
|
31
|
-
|
32
|
-
XHTML = NS("{http://www.w3.org/1999/xhtml}")
|
33
|
-
# TNA namespace
|
34
|
-
|
49
|
+
|
50
|
+
XHTML = NS("{http://www.w3.org/1999/xhtml}") # XHTML namespace
|
51
|
+
TNA = NS("{http://pronom.nationalarchives.gov.uk}") # TNA namespace
|
52
|
+
|
35
53
|
|
36
54
|
def get_text_tna(element, tag, default=''):
    """Return the stripped text found at `tag` (qualified with the TNA namespace) below `element`.

    `default` is returned when the element is missing or has no text.
    """
    node = element.find(TNA(tag))
    text = None if node is None else node.text
    return default if text is None else text.strip()
|
60
|
+
|
41
61
|
|
42
62
|
def prettify(elem):
    """Serialise `elem` as UTF-8 XML, re-parse it with minidom and return the indented text."""
    document = minidom.parseString(ET.tostring(elem, 'UTF-8'))
    return document.toprettyxml(indent=" ")
|
49
67
|
|
68
|
+
|
50
69
|
class FormatInfo:
|
70
|
+
"""Convert PRONOM formats into FIDO signatures."""
|
71
|
+
|
51
72
|
def __init__(self, pronom_files, format_list=[]):
|
73
|
+
"""Instantiate class, take a list of PRONOM files and an optional list of formats."""
|
52
74
|
self.info = {}
|
53
75
|
self.formats = []
|
54
76
|
self.pronom_files = pronom_files
|
55
77
|
for f in format_list:
|
56
|
-
self.add_format(f)
|
57
|
-
|
58
|
-
def save(self, dst):
|
59
|
-
"""Write the fido XML format definitions to @param dst
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
78
|
+
self.add_format(f) # FIXME: add_format is undefined!
|
79
|
+
|
80
|
+
def save(self, dst=sys.stdout):
    """Write the fido XML format definitions to @param dst.

    `dst` is either a filesystem path, which is opened in binary mode, or
    an already open stream. For a stream, its `buffer` attribute is used
    when present (as on a text-mode sys.stdout), otherwise the stream
    itself, so it must accept the serialised bytes.
    """
    root = ET.Element('formats', {
        'version': '0.3',
        'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
        'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
        'xmlns:dc': "http://purl.org/dc/elements/1.1/",
        'xmlns:dcterms': "http://purl.org/dc/terms/"
    })
    for f in self.formats:
        # MdR: formats without a signature are kept on purpose,
        # because a puid might still be matched on extension.
        root.append(f)
    self.indent(root)

    # Write the serialised bytes directly: print() would emit the repr of
    # the bytes object and cannot write text to a binary file.
    payload = ET.tostring(root) + b'\n'
    if hasattr(dst, 'write'):
        # The default dst is a stream, not a path, so it must not be
        # handed to open().
        getattr(dst, 'buffer', dst).write(payload)
    else:
        with open(dst, 'wb') as file_:
            file_.write(payload)
|
76
99
|
|
77
100
|
def indent(self, elem, level=0):
|
78
|
-
|
101
|
+
"""Indent output."""
|
102
|
+
i = "\n" + level * " "
|
79
103
|
if len(elem):
|
80
104
|
if not elem.text or not elem.text.strip():
|
81
105
|
elem.text = i + " "
|
82
106
|
if not elem.tail or not elem.tail.strip():
|
83
107
|
elem.tail = i
|
84
108
|
for elem in elem:
|
85
|
-
self.indent(elem, level+1)
|
109
|
+
self.indent(elem, level + 1)
|
86
110
|
if not elem.tail or not elem.tail.strip():
|
87
111
|
elem.tail = i
|
88
112
|
else:
|
@@ -90,52 +114,57 @@ class FormatInfo:
|
|
90
114
|
elem.tail = i
|
91
115
|
|
92
116
|
def load_pronom_xml(self, puid_filter=None):
|
93
|
-
"""
|
94
|
-
|
95
|
-
|
117
|
+
"""
|
118
|
+
Load the pronom XML from self.pronom_files and convert it to fido XML.
|
119
|
+
|
120
|
+
As a side-effect, set self.formats to a list of ElementTree.Element.
|
121
|
+
If a @param puid is specified, only that one will be loaded.
|
96
122
|
"""
|
97
123
|
formats = []
|
98
|
-
#for p in self.pronom_files:
|
124
|
+
# for p in self.pronom_files:
|
99
125
|
# print p
|
100
|
-
#print self.pronom_files
|
101
|
-
#exit()
|
126
|
+
# print self.pronom_files
|
127
|
+
# exit()
|
102
128
|
try:
|
103
129
|
zip = zipfile.ZipFile(self.pronom_files, 'r')
|
104
130
|
for item in zip.infolist():
|
105
|
-
#print item.filename
|
131
|
+
# print item.filename
|
106
132
|
try:
|
107
133
|
stream = zip.open(item)
|
108
134
|
# Work is done here!
|
109
|
-
#if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
|
110
|
-
|
111
|
-
if
|
112
|
-
formats.append(
|
135
|
+
# if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
|
136
|
+
format_ = self.parse_pronom_xml(stream, puid_filter)
|
137
|
+
if format_ is not None:
|
138
|
+
formats.append(format_)
|
113
139
|
finally:
|
114
140
|
stream.close()
|
115
141
|
finally:
|
116
142
|
try:
|
117
143
|
zip.close()
|
118
|
-
except Exception
|
119
|
-
|
144
|
+
except Exception as e:
|
145
|
+
print("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e), file=sys.stderr)
|
120
146
|
sys.exit()
|
121
147
|
# Replace the formatID with puids in has_priority_over
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
for
|
130
|
-
rel
|
148
|
+
if puid_filter is None:
|
149
|
+
id_map = {}
|
150
|
+
for element in formats:
|
151
|
+
puid = element.find('puid').text
|
152
|
+
# print "working on puid:",puid
|
153
|
+
pronom_id = element.find('pronom_id').text
|
154
|
+
id_map[pronom_id] = puid
|
155
|
+
for element in formats:
|
156
|
+
for rel in element.findall('has_priority_over'):
|
157
|
+
rel.text = id_map[rel.text]
|
131
158
|
|
132
159
|
self._sort_formats(formats)
|
133
160
|
self.formats = formats
|
134
|
-
|
161
|
+
|
135
162
|
def parse_pronom_xml(self, source, puid_filter=None):
|
136
|
-
"""
|
137
|
-
|
138
|
-
|
163
|
+
"""
|
164
|
+
Parse PRONOM XML and convert into FIDO XML.
|
165
|
+
|
166
|
+
If a @param puid is specified, only that one will be loaded.
|
167
|
+
@return ET.ElementTree Element representing it.
|
139
168
|
"""
|
140
169
|
pronom_xml = ET.parse(source)
|
141
170
|
pronom_root = pronom_xml.getroot()
|
@@ -147,13 +176,13 @@ class FormatInfo:
|
|
147
176
|
if type == 'PUID':
|
148
177
|
puid = get_text_tna(id, 'Identifier')
|
149
178
|
ET.SubElement(fido_format, 'puid').text = puid
|
150
|
-
if puid_filter
|
179
|
+
if puid_filter and puid != puid_filter:
|
151
180
|
return None
|
152
181
|
# A bit clumsy. I want to have puid first, then mime, then container.
|
153
182
|
for id in pronom_format.findall(TNA('FileFormatIdentifier')):
|
154
183
|
type = get_text_tna(id, 'IdentifierType')
|
155
184
|
if type == 'MIME':
|
156
|
-
ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
|
185
|
+
ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
|
157
186
|
elif type == 'PUID':
|
158
187
|
puid = get_text_tna(id, 'Identifier')
|
159
188
|
if puid == 'x-fmt/263':
|
@@ -170,7 +199,7 @@ class FormatInfo:
|
|
170
199
|
for id in pronom_format.findall(TNA('FileFormatIdentifier')):
|
171
200
|
type = get_text_tna(id, 'IdentifierType')
|
172
201
|
if type == 'Apple Uniform Type Identifier':
|
173
|
-
ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
|
202
|
+
ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
|
174
203
|
# Handle the relationships
|
175
204
|
for x in pronom_format.findall(TNA('RelatedFormat')):
|
176
205
|
rel = get_text_tna(x, 'RelationshipType')
|
@@ -181,20 +210,20 @@ class FormatInfo:
|
|
181
210
|
fido_sig = ET.SubElement(fido_format, 'signature')
|
182
211
|
ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
|
183
212
|
# There are some funny chars in the notes, which caused me trouble and it is a unicode string,
|
184
|
-
ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
|
213
|
+
ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
|
185
214
|
for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
|
186
215
|
fido_pat = ET.SubElement(fido_sig, 'pattern')
|
187
216
|
pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
|
188
217
|
bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
|
189
218
|
offset = get_text_tna(pronom_pat, 'Offset')
|
190
219
|
max_offset = get_text_tna(pronom_pat, 'MaxOffset')
|
191
|
-
if max_offset
|
220
|
+
if not max_offset:
|
192
221
|
pass
|
193
|
-
#print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
|
222
|
+
# print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
|
194
223
|
regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
|
195
|
-
#print "done puid", puid
|
224
|
+
# print "done puid", puid
|
196
225
|
if regex == "__INCOMPATIBLE_SIG__":
|
197
|
-
print
|
226
|
+
print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
|
198
227
|
# remove the empty 'signature' nodes
|
199
228
|
# now that the signature is not compatible and thus "regex" is empty
|
200
229
|
remove = fido_format.findall('signature')
|
@@ -205,8 +234,8 @@ class FormatInfo:
|
|
205
234
|
ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
|
206
235
|
ET.SubElement(fido_pat, 'regex').text = regex
|
207
236
|
# Get the format details
|
208
|
-
fido_details = ET.SubElement(fido_format,'details')
|
209
|
-
ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
|
237
|
+
fido_details = ET.SubElement(fido_format, 'details')
|
238
|
+
ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
|
210
239
|
ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
|
211
240
|
ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
|
212
241
|
ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
|
@@ -221,7 +250,7 @@ class FormatInfo:
|
|
221
250
|
ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
|
222
251
|
# References
|
223
252
|
for x in pronom_format.findall(TNA("Document")):
|
224
|
-
r = ET.SubElement(fido_details,'reference')
|
253
|
+
r = ET.SubElement(fido_details, 'reference')
|
225
254
|
ET.SubElement(r, 'dc:title').text = get_text_tna(x, 'TitleText')
|
226
255
|
ET.SubElement(r, 'dc:creator').text = get_text_tna(x, 'Author/AuthorCompoundName')
|
227
256
|
ET.SubElement(r, 'dc:publisher').text = get_text_tna(x, 'Publisher/PublisherCompoundName')
|
@@ -229,51 +258,53 @@ class FormatInfo:
|
|
229
258
|
for id in x.findall(TNA('DocumentIdentifier')):
|
230
259
|
type = get_text_tna(id, 'IdentifierType')
|
231
260
|
if type == 'URL':
|
232
|
-
ET.SubElement(r, 'dc:identifier').text = "http://"+get_text_tna(id, 'Identifier')
|
261
|
+
ET.SubElement(r, 'dc:identifier').text = "http://" + get_text_tna(id, 'Identifier')
|
233
262
|
else:
|
234
|
-
ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
|
263
|
+
ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
|
235
264
|
ET.SubElement(r, 'dc:description').text = get_text_tna(x, 'DocumentNote')
|
236
265
|
ET.SubElement(r, 'dc:type').text = get_text_tna(x, 'DocumentType')
|
237
|
-
ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription')+" "+get_text_tna(x, 'AvailabilityNote')
|
266
|
+
ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription') + " " + get_text_tna(x, 'AvailabilityNote')
|
238
267
|
ET.SubElement(r, 'dc:rights').text = get_text_tna(x, 'DocumentIPR')
|
239
|
-
#
|
268
|
+
# Examples
|
240
269
|
for x in pronom_format.findall(TNA("ReferenceFile")):
|
241
|
-
rf = ET.SubElement(fido_details,'example_file')
|
270
|
+
rf = ET.SubElement(fido_details, 'example_file')
|
242
271
|
ET.SubElement(rf, 'dc:title').text = get_text_tna(x, 'ReferenceFileName')
|
243
272
|
ET.SubElement(rf, 'dc:description').text = get_text_tna(x, 'ReferenceFileDescription')
|
244
273
|
checksum = ""
|
245
274
|
for id in x.findall(TNA('ReferenceFileIdentifier')):
|
246
275
|
type = get_text_tna(id, 'IdentifierType')
|
247
276
|
if type == 'URL':
|
248
|
-
|
249
|
-
|
277
|
+
# Starting with PRONOM 89, some URLs contain http://
|
278
|
+
# and others do not.
|
279
|
+
url = get_text_tna(id, 'Identifier')
|
280
|
+
if not urlparse(url).scheme:
|
281
|
+
url = "http://" + url
|
282
|
+
ET.SubElement(rf, 'dc:identifier').text = url
|
250
283
|
# And calculate the checksum of this resource:
|
251
284
|
m = hashlib.md5()
|
252
|
-
sock =
|
285
|
+
sock = urlopen(url)
|
253
286
|
m.update(sock.read())
|
254
287
|
sock.close()
|
255
|
-
checksum=m.hexdigest()
|
288
|
+
checksum = m.hexdigest()
|
256
289
|
else:
|
257
|
-
ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
|
290
|
+
ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
|
258
291
|
ET.SubElement(rf, 'dcterms:license').text = ""
|
259
292
|
ET.SubElement(rf, 'dc:rights').text = get_text_tna(x, 'ReferenceFileIPR')
|
260
293
|
checksumElement = ET.SubElement(rf, 'checksum')
|
261
294
|
checksumElement.text = checksum
|
262
295
|
checksumElement.attrib['type'] = "md5"
|
263
296
|
# Record Metadata
|
264
|
-
md = ET.SubElement(fido_details,'record_metadata')
|
265
|
-
ET.SubElement(md, 'status').text ='unknown'
|
297
|
+
md = ET.SubElement(fido_details, 'record_metadata')
|
298
|
+
ET.SubElement(md, 'status').text = 'unknown'
|
266
299
|
ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
|
267
300
|
ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
|
268
301
|
ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
|
269
|
-
ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
|
270
|
-
return fido_format
|
271
|
-
|
272
|
-
#FIXME: I don't think that this quite works yet!
|
302
|
+
ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
|
303
|
+
return fido_format
|
304
|
+
|
305
|
+
# FIXME: I don't think that this quite works yet!
|
273
306
|
def _sort_formats(self, formatlist):
|
274
|
-
"""Sort the format list based on their priority relationships so higher priority
|
275
|
-
formats appear earlier in the list.
|
276
|
-
"""
|
307
|
+
"""Sort the format list based on their priority relationships so higher priority formats appear earlier in the list."""
|
277
308
|
def compare_formats(f1, f2):
|
278
309
|
f1ID = f1.find('puid').text
|
279
310
|
f2ID = f2.find('puid').text
|
@@ -291,9 +322,9 @@ class FormatInfo:
|
|
291
322
|
return 1
|
292
323
|
return sorted(formatlist, cmp=compare_formats)
|
293
324
|
|
325
|
+
|
294
326
|
def fido_position(pronom_position):
|
295
|
-
"""
|
296
|
-
"""
|
327
|
+
"""Return BOF/EOF/VAR instead of the more verbose pronom position names."""
|
297
328
|
if pronom_position == 'Absolute from BOF':
|
298
329
|
return 'BOF'
|
299
330
|
elif pronom_position == 'Absolute from EOF':
|
@@ -302,16 +333,20 @@ def fido_position(pronom_position):
|
|
302
333
|
return 'VAR'
|
303
334
|
elif pronom_position == 'Indirect From BOF':
|
304
335
|
return 'IFB'
|
305
|
-
else:
|
306
|
-
sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
|
336
|
+
else: # to make sure FIDO does not crash (IFB aftermath)
|
337
|
+
sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
|
307
338
|
return 'VAR'
|
308
339
|
|
340
|
+
|
309
341
|
def _convert_err_msg(msg, c, i, chars):
    """Format a conversion diagnostic that points a caret at position `i` of `chars`.

    The current contents of the module-level `buf` are appended to the message.
    """
    template = "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}"
    caret_padding = i * ' '
    return template.format(msg, c, i, chars, caret_padding, buf.getvalue())
|
311
343
|
|
344
|
+
|
312
345
|
def doByte(chars, i, littleendian):
|
313
|
-
"""
|
314
|
-
|
346
|
+
"""
|
347
|
+
Convert two chars[i] and chars[i+1] into a byte.
|
348
|
+
|
349
|
+
@return a tuple (byte, 2)
|
315
350
|
"""
|
316
351
|
c1 = '0123456789ABCDEF'.find(chars[i].upper())
|
317
352
|
c2 = '0123456789ABCDEF'.find(chars[i + 1].upper())
|
@@ -323,12 +358,7 @@ def doByte(chars, i, littleendian):
|
|
323
358
|
val = chr(c1 + 16 * c2)
|
324
359
|
return (escape(val), 2)
|
325
360
|
|
326
|
-
|
327
|
-
# MdR: took out '<' and '>' out of _ordinary because they were converted to entities <>
|
328
|
-
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
|
329
|
-
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
330
|
-
_special = '$()*+.?![]^\\{|}'
|
331
|
-
_hex = '0123456789abcdef'
|
361
|
+
|
332
362
|
def _escape_char(c):
|
333
363
|
if c in '\n':
|
334
364
|
return '\\n'
|
@@ -340,68 +370,66 @@ def _escape_char(c):
|
|
340
370
|
(high, low) = divmod(ord(c), 16)
|
341
371
|
return '\\x' + _hex[high] + _hex[low]
|
342
372
|
|
373
|
+
|
343
374
|
def escape(string):
    """Return `string` with every character outside `_ordinary` replaced by its escaped form."""
    pieces = []
    for c in string:
        if c in _ordinary:
            pieces.append(c)
        else:
            pieces.append(_escape_char(c))
    return ''.join(pieces)
|
346
377
|
|
378
|
+
|
347
379
|
def calculate_repetition(char, pos, offset, maxoffset):
    """Build the regex repetition of `char` for the given offset window.

    Recursively calculates offset/maxoffset repetition when one or both
    offsets is greater than 65535 bytes (64KB): each level is capped at
    65535 and the remainder is handled by the recursive call.
    See: https://bugs.python.org/issue13169.

    `offset` and `maxoffset` are decimal strings; `maxoffset` may be None
    when there is no maximum. `pos` selects the layout ("BOF", "EOF" or
    "IFB"); any other value yields no repetition at this level.
    Returns the repetition as a string (possibly empty).
    """
    parts = []

    calcremain = False
    offsetremain = 0
    # None (not 0) so that a recursion triggered only by a large `offset`
    # does not try to concatenate an int maximum into the pattern.
    maxoffsetremain = None

    if offset is not None and int(offset) > 65535:
        offsetremain = str(int(offset) - 65535)
        offset = '65535'
        calcremain = True
    if maxoffset is not None and int(maxoffset) > 65535:
        maxoffsetremain = str(int(maxoffset) - 65535)
        maxoffset = '65535'
        calcremain = True

    if pos == "BOF" or pos == "EOF":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + maxoffset)
            parts.append('}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if pos == "IFB":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + maxoffset)
            parts.append('}')
            # NOTE(review): nesting of this trailing ',}' is reconstructed
            # from a flattened listing - confirm against the upstream file.
            if maxoffset is not None:
                parts.append(',}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if calcremain:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offsetremain, maxoffsetremain))

    return ''.join(parts)
|
397
422
|
|
423
|
+
|
398
424
|
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
399
|
-
"""Convert
|
400
|
-
@param chars, a pronom bytesequence, into a
|
401
|
-
@return regular expression.
|
402
|
-
Endianness is not used.
|
403
425
|
"""
|
426
|
+
Convert to regular expression.
|
427
|
+
|
428
|
+
Endianness is not used.
|
404
429
|
|
430
|
+
@param chars, a pronom bytesequence, into a
|
431
|
+
@return regular expression.
|
432
|
+
"""
|
405
433
|
if 'Big' in endianness:
|
406
434
|
littleendian = False
|
407
435
|
else:
|
@@ -410,24 +438,26 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
410
438
|
offset = '0'
|
411
439
|
if len(maxoffset) == 0:
|
412
440
|
maxoffset = None
|
441
|
+
if maxoffset == '0':
|
442
|
+
maxoffset = None
|
413
443
|
# make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
|
414
444
|
global buf
|
415
|
-
buf = cStringIO
|
416
|
-
buf.write("(?s)")
|
445
|
+
buf = cStringIO()
|
446
|
+
buf.write("(?s)") # If a regex starts with (?s), it is equivalent to DOTALL.
|
417
447
|
i = 0
|
418
448
|
state = 'start'
|
419
449
|
if 'BOF' in pos:
|
420
|
-
buf.write('\\A')
|
450
|
+
buf.write('\\A') # start of regex
|
421
451
|
buf.write(calculate_repetition('.', pos, offset, maxoffset))
|
422
|
-
|
452
|
+
|
423
453
|
if 'IFB' in pos:
|
424
454
|
buf.write('\\A')
|
425
455
|
buf.write(calculate_repetition('.', pos, offset, maxoffset))
|
426
|
-
|
456
|
+
|
427
457
|
while True:
|
428
458
|
if i == len(chars):
|
429
459
|
break
|
430
|
-
#print _convert_err_msg(state,chars[i],i,chars)
|
460
|
+
# print _convert_err_msg(state,chars[i],i,chars)
|
431
461
|
if state == 'start':
|
432
462
|
if chars[i].isalnum():
|
433
463
|
state = 'bytes'
|
@@ -471,7 +501,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
471
501
|
(byt, inc) = doByte(chars, i, littleendian)
|
472
502
|
buf.write(byt)
|
473
503
|
i += inc
|
474
|
-
#assert(chars[i] == ':')
|
504
|
+
# assert(chars[i] == ':')
|
475
505
|
if chars[i] != ':':
|
476
506
|
return "__INCOMPATIBLE_SIG__"
|
477
507
|
buf.write('-')
|
@@ -479,13 +509,13 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
479
509
|
(byt, inc) = doByte(chars, i, littleendian)
|
480
510
|
buf.write(byt)
|
481
511
|
i += inc
|
482
|
-
#assert(chars[i] == ']')
|
512
|
+
# assert(chars[i] == ']')
|
483
513
|
if chars[i] != ']':
|
484
514
|
return "__INCOMPATIBLE_SIG__"
|
485
515
|
buf.write(']')
|
486
516
|
i += 1
|
487
517
|
except Exception:
|
488
|
-
print
|
518
|
+
print(_convert_err_msg('Illegal character in bracket', chars[i], i, chars))
|
489
519
|
raise
|
490
520
|
if i < len(chars) and chars[i] == '{':
|
491
521
|
state = 'curly-after-bracket'
|
@@ -511,7 +541,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
511
541
|
(byt, inc) = doByte(chars, i, littleendian)
|
512
542
|
buf.write(byt)
|
513
543
|
i += inc
|
514
|
-
#assert(chars[i] == ':')
|
544
|
+
# assert(chars[i] == ':')
|
515
545
|
if chars[i] != ':':
|
516
546
|
return "__INCOMPATIBLE_SIG__"
|
517
547
|
buf.write('-')
|
@@ -519,8 +549,8 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
519
549
|
(byt, inc) = doByte(chars, i, littleendian)
|
520
550
|
buf.write(byt)
|
521
551
|
i += inc
|
522
|
-
|
523
|
-
#assert(chars[i] == ']')
|
552
|
+
|
553
|
+
# assert(chars[i] == ']')
|
524
554
|
if chars[i] != ']':
|
525
555
|
return "__INCOMPATIBLE_SIG__"
|
526
556
|
buf.write(']')
|
@@ -537,7 +567,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
537
567
|
# when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
|
538
568
|
# The above, while sensible, appears to be incorrect. A '.' is always needed.
|
539
569
|
# for droid equiv behavior
|
540
|
-
#if state == 'curly':
|
570
|
+
# if state == 'curly':
|
541
571
|
buf.write('.')
|
542
572
|
buf.write('{')
|
543
573
|
i += 1 # skip the (
|
@@ -548,7 +578,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
548
578
|
elif chars[i] == '-':
|
549
579
|
buf.write(',')
|
550
580
|
i += 1
|
551
|
-
elif chars[i] == '*':
|
581
|
+
elif chars[i] == '*': # skip the *
|
552
582
|
i += 1
|
553
583
|
elif chars[i] == '}':
|
554
584
|
break
|
@@ -581,36 +611,35 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
581
611
|
buf.close()
|
582
612
|
return val
|
583
613
|
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
parser
|
606
|
-
parser.add_argument('-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
614
|
+
|
615
|
+
def run(input=None, output=None, puid=None):
    """Convert PRONOM formats into FIDO signatures.

    When `input` or `output` is None, the zip file or signature file
    reported by the local PRONOM versions is used instead. `puid` is
    passed on to `FormatInfo.load_pronom_xml`.
    """
    versions = get_local_pronom_versions()

    source = versions.get_zip_file() if input is None else input
    target = versions.get_signature_file() if output is None else output

    info = FormatInfo(source)
    info.load_pronom_xml(puid)
    info.save(target)

    summary = 'Converted {0} PRONOM formats to FIDO signatures'.format(len(info.formats))
    print(summary, file=sys.stderr)
|
628
|
+
|
629
|
+
|
630
|
+
def main(args=None):
    """Main CLI entrypoint.

    Parses `args` (defaults to sys.argv[1:]) and hands the -input, -output
    and -puid options to `run`.
    """
    if args is None:
        args = sys.argv[1:]

    parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
    parser.add_argument('-input', default=None, help='Input file, a Zip containing PRONOM XML files')
    # Help text corrected from the misspelled 'Ouptut file'.
    parser.add_argument('-output', default=None, help='Output file')
    parser.add_argument('-puid', default=None, help='A particular PUID record to extract')
    options = parser.parse_args(args)

    run(input=options.input, output=options.output, puid=options.puid)
|
642
|
+
|
643
|
+
|
615
644
|
if __name__ == '__main__':
|
616
|
-
main()
|
645
|
+
main()
|