libis-format 0.9.30 → 0.9.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/libis/format/converter/image_converter.rb +2 -2
- data/lib/libis/format/office_to_pdf.rb +1 -1
- data/lib/libis/format/version.rb +1 -1
- data/spec/converter_spec.rb +43 -27
- data/spec/data/test-options.png +0 -0
- data/spec/data/test.pdf.tif +0 -0
- data/tools/droid/{DROID_SignatureFile_V82.xml → DROID_SignatureFile_V90.xml} +8202 -701
- data/tools/droid/{container-signature-20150307.xml → container-signature-20170330.xml} +3584 -2235
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +152 -154
- data/tools/droid/droid.sh +30 -16
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/{droid-help-6.1.5.jar → droid-help-6.3.jar} +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/fido/__init__.py +50 -0
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +2 -0
- data/tools/fido/conf/{container-signature-20150307.xml → container-signature-20170330.xml} +1487 -141
- data/tools/fido/conf/format_extensions.xml +0 -14
- data/tools/fido/conf/{formats-v81.xml → formats-v90.xml} +11409 -887
- data/tools/fido/conf/{pronom-xml-v81.zip → pronom-xml-v90.zip} +0 -0
- data/tools/fido/conf/versions.xml +6 -6
- data/tools/fido/fido.py +437 -407
- data/tools/fido/package.py +96 -0
- data/tools/fido/prepare.py +217 -188
- data/tools/fido/pronomutils.py +143 -58
- data/tools/fido/toxml.py +54 -46
- data/tools/fido/update_signatures.py +139 -127
- metadata +34 -40
- data/tools/droid/droid-command-line-6.1.5.jar +0 -0
- data/tools/droid/lib/antlr-2.7.7.jar +0 -0
- data/tools/droid/lib/antlr-3.2.jar +0 -0
- data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
- data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-codec-1.4.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
- data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
- data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
- data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
- data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
- data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
- data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
- data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/poi-3.7.jar +0 -0
- data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
- data/tools/fido/argparselocal.py +0 -2355
- data/tools/fido/conf/DROID_SignatureFile-v81.xml +0 -2
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Support for containers."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import zipfile
|
|
5
|
+
|
|
6
|
+
import olefile
|
|
7
|
+
from six import iteritems
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Package(object):
    """Signature-matching helpers shared by the container classes."""

    def _process_puid_map(self, data, puid_map):
        """Return the PUIDs from ``puid_map`` whose signatures match ``data``.

        ``puid_map`` maps each PUID to a list of signature dicts; every
        PUID is checked in the mapping's iteration order and its matches
        are appended to a single flat list.
        """
        matched = []
        for puid, signatures in puid_map.items():
            matched += self._process_matches(data, puid, signatures)
        return matched

    def _process_matches(self, data, puid, signatures):
        """Return ``puid`` once for every signature whose regex is found in ``data``.

        Each entry of ``signatures`` is a dict whose ``"signature"`` key
        holds the pattern handed to ``re.search``.
        """
        return [
            puid
            for entry in signatures
            if re.search(entry["signature"], data)
        ]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OlePackage(Package):
    """OlePackage supports OLE containers."""

    def __init__(self, ole, signatures):
        """Instantiate OlePackage object given the location of its file and signatures."""
        self.ole = ole
        self.signatures = signatures

    def detect_formats(self):
        """Detect available formats inside the OLE container.

        Returns a list of matching PUIDs; the list is empty when the file
        cannot be opened as an OLE container (``IOError``).
        """
        try:
            ole = olefile.OleFileIO(self.ole)
        except IOError:
            return []

        results = []
        try:
            # The stream listing does not depend on the signature being
            # checked, so build it once instead of once per signature path.
            stream_paths = ['/'.join(parts) for parts in ole.listdir()]

            for path, puid_map in iteritems(self.signatures):
                # Each OLE container signature lists the path of the file inside the OLE
                # on which it operates; if the file is missing, there can be no match.
                # This is not a precise match because the name of the stream may slightly
                # differ; for example, \x01CompObj instead of CompObj
                filepath = None
                for candidate in stream_paths:
                    if candidate == path or candidate[1:] == path:
                        filepath = candidate
                        break

                # Path to match isn't in the container at all
                if filepath is None:
                    continue

                with ole.openstream(filepath) as stream:
                    contents = stream.read()
                results.extend(self._process_puid_map(contents, puid_map))
        finally:
            # Release the underlying file handle even if reading or matching fails.
            ole.close()

        return results
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ZipPackage(Package):
    """ZipPackage supports Zip containers."""

    def __init__(self, zip_, signatures):
        """Instantiate ZipPackage object given the location of its file and signatures."""
        self.zip = zip_
        self.signatures = signatures

    def detect_formats(self):
        """Detect available formats inside the ZIP container.

        Returns a list of matching PUIDs; the list is empty when opening
        the archive raises ``BadZipfile``, ``RuntimeError`` or
        ``UnicodeDecodeError``.
        """
        try:
            zip_ = zipfile.ZipFile(self.zip)
        except (zipfile.BadZipfile, RuntimeError, UnicodeDecodeError):
            return []

        results = []
        try:
            # Build the member lookup once; namelist() returns a fresh list
            # on every call and list membership is a linear scan.
            members = set(zip_.namelist())

            for path, puid_map in iteritems(self.signatures):
                # Each ZIP container signature lists the path of the file inside the ZIP
                # on which it operates; if the file is missing, there can be no match.
                if path not in members:
                    continue

                # Extract the requested file from the ZIP only once, and pass the same
                # data to each signature that requires it.
                with zip_.open(path) as id_file:
                    contents = id_file.read()
                results.extend(self._process_puid_map(contents, puid_map))
        finally:
            # Close the archive even if reading a member or matching fails.
            zip_.close()

        return results
|
data/tools/fido/prepare.py
CHANGED
|
@@ -1,88 +1,112 @@
|
|
|
1
|
-
|
|
1
|
+
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
|
-
# Format Identification for Digital Objects
|
|
4
3
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
import
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
import cStringIO, zipfile, os
|
|
4
|
+
"""Format Identification for Digital Objects."""
|
|
5
|
+
|
|
6
|
+
from __future__ import print_function
|
|
7
|
+
|
|
8
|
+
from argparse import ArgumentParser
|
|
11
9
|
import hashlib
|
|
12
|
-
import
|
|
10
|
+
import sys
|
|
11
|
+
from xml.dom import minidom
|
|
13
12
|
from xml.etree import ElementTree as ET
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
13
|
+
import zipfile
|
|
14
|
+
|
|
15
|
+
from six.moves import cStringIO
|
|
16
|
+
from six.moves.urllib.request import urlopen
|
|
17
|
+
from six.moves.urllib.parse import urlparse
|
|
18
|
+
|
|
19
|
+
from .pronomutils import get_local_pronom_versions
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# \a\b\n\r\t\v
|
|
23
|
+
# MdR: took '<' and '>' out of _ordinary because they were converted to entities (&lt; and &gt;)
|
|
24
|
+
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
|
|
25
|
+
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
|
26
|
+
_special = '$()*+.?![]^\\{|}'
|
|
27
|
+
_hex = '0123456789abcdef'
|
|
28
|
+
|
|
18
29
|
|
|
19
30
|
class NS:
    """
    Helper class for XML name spaces in ElementTree.

    Use like MYNS=NS("{http://some/uri}") and then MYNS("tag1/tag2"),
    or MYNS.tag for a single tag.
    """

    def __init__(self, uri):
        """Instantiate class with `uri` argument."""
        self.uri = uri

    def __getattr__(self, tag):
        """Return `tag` prefixed with the namespace URI.

        Only called for attributes that are not found normally.  Dunder
        names are refused so that protocol probes (copy, pickle, hasattr
        checks on a half-built instance) get a proper AttributeError
        instead of a bogus string or unbounded recursion on `self.uri`.
        """
        if tag.startswith('__') and tag.endswith('__'):
            raise AttributeError(tag)
        return self.uri + tag

    def __call__(self, path):
        """Prefix every "/"-separated segment of `path` with the namespace URI."""
        # Build the names directly rather than through getattr(): a segment
        # that happens to be a real attribute name (e.g. "uri") must still
        # be namespaced like any other tag.
        return "/".join(self.uri + tag for tag in path.split("/"))
|
|
30
48
|
|
|
31
|
-
|
|
32
|
-
XHTML = NS("{http://www.w3.org/1999/xhtml}")
|
|
33
|
-
# TNA namespace
|
|
34
|
-
|
|
49
|
+
|
|
50
|
+
XHTML = NS("{http://www.w3.org/1999/xhtml}") # XHTML namespace
|
|
51
|
+
TNA = NS("{http://pronom.nationalarchives.gov.uk}") # TNA namespace
|
|
52
|
+
|
|
35
53
|
|
|
36
54
|
def get_text_tna(element, tag, default=''):
    """Return the stripped text of the TNA-namespaced `tag` found under `element`.

    `default` is returned when no such child exists or when the child has
    no text.
    """
    node = element.find(TNA(tag))
    text = None if node is None else node.text
    return default if text is None else text.strip()
|
|
60
|
+
|
|
41
61
|
|
|
42
62
|
def prettify(elem):
    """Serialise `elem` as UTF-8 XML and return it re-indented by minidom."""
    return minidom.parseString(ET.tostring(elem, 'UTF-8')).toprettyxml(indent=" ")
|
|
49
67
|
|
|
68
|
+
|
|
50
69
|
class FormatInfo:
|
|
70
|
+
"""Convert PRONOM formats into FIDO signatures."""
|
|
71
|
+
|
|
51
72
|
def __init__(self, pronom_files, format_list=[]):
|
|
73
|
+
"""Instantiate class, take a list of PRONOM files and an optional list of formats."""
|
|
52
74
|
self.info = {}
|
|
53
75
|
self.formats = []
|
|
54
76
|
self.pronom_files = pronom_files
|
|
55
77
|
for f in format_list:
|
|
56
|
-
self.add_format(f)
|
|
57
|
-
|
|
58
|
-
def save(self, dst):
|
|
59
|
-
"""Write the fido XML format definitions to @param dst
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
78
|
+
self.add_format(f) # FIXME: add_format is undefined!
|
|
79
|
+
|
|
80
|
+
def save(self, dst=sys.stdout):
    """Write the fido XML format definitions to @param dst.

    `dst` may be a filesystem path, which is opened in binary mode, or an
    already open file-like object such as the default `sys.stdout`.
    """
    tree = ET.ElementTree(ET.Element('formats', {
        'version': '0.3',
        'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
        'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
        'xmlns:dc': "http://purl.org/dc/elements/1.1/",
        'xmlns:dcterms': "http://purl.org/dc/terms/"
    }))
    root = tree.getroot()
    for f in self.formats:
        # MdR: this skipped puids without sig, but we want them ALL
        # because puid might be matched on extension
        root.append(f)
    self.indent(root)

    # ET.tostring() with its default encoding returns ASCII bytes; the
    # trailing newline mirrors what print() used to append.
    payload = ET.tostring(root) + b"\n"

    if not hasattr(dst, 'write'):
        # A path: open() cannot take a stream object, so only paths get here.
        with open(dst, 'wb') as file_:
            file_.write(payload)
        return

    # A stream: prefer its underlying binary buffer (as on sys.stdout)
    # so the bytes are written unchanged.
    binary = getattr(dst, 'buffer', None)
    if binary is not None:
        binary.write(payload)
        return
    try:
        dst.write(payload)
    except TypeError:
        # Text-only stream: the payload is pure ASCII, so decoding is lossless.
        dst.write(payload.decode('ascii'))
|
|
76
99
|
|
|
77
100
|
def indent(self, elem, level=0):
|
|
78
|
-
|
|
101
|
+
"""Indent output."""
|
|
102
|
+
i = "\n" + level * " "
|
|
79
103
|
if len(elem):
|
|
80
104
|
if not elem.text or not elem.text.strip():
|
|
81
105
|
elem.text = i + " "
|
|
82
106
|
if not elem.tail or not elem.tail.strip():
|
|
83
107
|
elem.tail = i
|
|
84
108
|
for elem in elem:
|
|
85
|
-
self.indent(elem, level+1)
|
|
109
|
+
self.indent(elem, level + 1)
|
|
86
110
|
if not elem.tail or not elem.tail.strip():
|
|
87
111
|
elem.tail = i
|
|
88
112
|
else:
|
|
@@ -90,52 +114,57 @@ class FormatInfo:
|
|
|
90
114
|
elem.tail = i
|
|
91
115
|
|
|
92
116
|
def load_pronom_xml(self, puid_filter=None):
    """
    Load the pronom XML from self.pronom_files and convert it to fido XML.

    As a side-effect, set self.formats to a list of ElementTree.Element.
    If @param puid_filter is specified, it is passed on to
    parse_pronom_xml and the has_priority_over rewrite is skipped.

    If the archive cannot be opened or closed, an error is printed to
    stderr and the process exits via sys.exit().
    """
    def abort(error):
        # Report the failure and stop, as the original cleanup handler did.
        print("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, error), file=sys.stderr)
        sys.exit()

    formats = []

    # Open the archive outside the cleanup block so that a failure here is
    # reported with its real cause instead of an UnboundLocalError raised
    # while trying to close an archive that never existed.
    try:
        archive = zipfile.ZipFile(self.pronom_files, 'r')
    except Exception as e:
        abort(e)

    try:
        for item in archive.infolist():
            # Open before entering try/finally: if open() fails there is
            # no stream to close and the original error must propagate.
            stream = archive.open(item)
            try:
                # Work is done here!
                format_ = self.parse_pronom_xml(stream, puid_filter)
                if format_ is not None:
                    formats.append(format_)
            finally:
                stream.close()
    finally:
        try:
            archive.close()
        except Exception as e:
            abort(e)

    # Replace the formatID with puids in has_priority_over
    if puid_filter is None:
        id_map = {}
        for element in formats:
            puid = element.find('puid').text
            pronom_id = element.find('pronom_id').text
            id_map[pronom_id] = puid
        for element in formats:
            for rel in element.findall('has_priority_over'):
                rel.text = id_map[rel.text]

    # NOTE(review): the return value is discarded here; _sort_formats
    # returns the result of sorted(), so this call presumably does not
    # reorder `formats` - confirm before relying on the ordering.
    self._sort_formats(formats)
    self.formats = formats
|
|
134
|
-
|
|
161
|
+
|
|
135
162
|
def parse_pronom_xml(self, source, puid_filter=None):
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
|
|
163
|
+
"""
|
|
164
|
+
Parse PRONOM XML and convert into FIDO XML.
|
|
165
|
+
|
|
166
|
+
If a @param puid is specified, only that one will be loaded.
|
|
167
|
+
@return ET.ElementTree Element representing it.
|
|
139
168
|
"""
|
|
140
169
|
pronom_xml = ET.parse(source)
|
|
141
170
|
pronom_root = pronom_xml.getroot()
|
|
@@ -147,13 +176,13 @@ class FormatInfo:
|
|
|
147
176
|
if type == 'PUID':
|
|
148
177
|
puid = get_text_tna(id, 'Identifier')
|
|
149
178
|
ET.SubElement(fido_format, 'puid').text = puid
|
|
150
|
-
if puid_filter
|
|
179
|
+
if puid_filter and puid != puid_filter:
|
|
151
180
|
return None
|
|
152
181
|
# A bit clumsy. I want to have puid first, then mime, then container.
|
|
153
182
|
for id in pronom_format.findall(TNA('FileFormatIdentifier')):
|
|
154
183
|
type = get_text_tna(id, 'IdentifierType')
|
|
155
184
|
if type == 'MIME':
|
|
156
|
-
ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
|
|
185
|
+
ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
|
|
157
186
|
elif type == 'PUID':
|
|
158
187
|
puid = get_text_tna(id, 'Identifier')
|
|
159
188
|
if puid == 'x-fmt/263':
|
|
@@ -170,7 +199,7 @@ class FormatInfo:
|
|
|
170
199
|
for id in pronom_format.findall(TNA('FileFormatIdentifier')):
|
|
171
200
|
type = get_text_tna(id, 'IdentifierType')
|
|
172
201
|
if type == 'Apple Uniform Type Identifier':
|
|
173
|
-
ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
|
|
202
|
+
ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
|
|
174
203
|
# Handle the relationships
|
|
175
204
|
for x in pronom_format.findall(TNA('RelatedFormat')):
|
|
176
205
|
rel = get_text_tna(x, 'RelationshipType')
|
|
@@ -181,20 +210,20 @@ class FormatInfo:
|
|
|
181
210
|
fido_sig = ET.SubElement(fido_format, 'signature')
|
|
182
211
|
ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
|
|
183
212
|
# There are some funny chars in the notes, which caused me trouble and it is a unicode string,
|
|
184
|
-
ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
|
|
213
|
+
ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
|
|
185
214
|
for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
|
|
186
215
|
fido_pat = ET.SubElement(fido_sig, 'pattern')
|
|
187
216
|
pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
|
|
188
217
|
bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
|
|
189
218
|
offset = get_text_tna(pronom_pat, 'Offset')
|
|
190
219
|
max_offset = get_text_tna(pronom_pat, 'MaxOffset')
|
|
191
|
-
if max_offset
|
|
220
|
+
if not max_offset:
|
|
192
221
|
pass
|
|
193
|
-
#print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
|
|
222
|
+
# print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
|
|
194
223
|
regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
|
|
195
|
-
#print "done puid", puid
|
|
224
|
+
# print "done puid", puid
|
|
196
225
|
if regex == "__INCOMPATIBLE_SIG__":
|
|
197
|
-
print
|
|
226
|
+
print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
|
|
198
227
|
# remove the empty 'signature' nodes
|
|
199
228
|
# now that the signature is not compatible and thus "regex" is empty
|
|
200
229
|
remove = fido_format.findall('signature')
|
|
@@ -205,8 +234,8 @@ class FormatInfo:
|
|
|
205
234
|
ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
|
|
206
235
|
ET.SubElement(fido_pat, 'regex').text = regex
|
|
207
236
|
# Get the format details
|
|
208
|
-
fido_details = ET.SubElement(fido_format,'details')
|
|
209
|
-
ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
|
|
237
|
+
fido_details = ET.SubElement(fido_format, 'details')
|
|
238
|
+
ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
|
|
210
239
|
ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
|
|
211
240
|
ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
|
|
212
241
|
ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
|
|
@@ -221,7 +250,7 @@ class FormatInfo:
|
|
|
221
250
|
ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
|
|
222
251
|
# References
|
|
223
252
|
for x in pronom_format.findall(TNA("Document")):
|
|
224
|
-
r = ET.SubElement(fido_details,'reference')
|
|
253
|
+
r = ET.SubElement(fido_details, 'reference')
|
|
225
254
|
ET.SubElement(r, 'dc:title').text = get_text_tna(x, 'TitleText')
|
|
226
255
|
ET.SubElement(r, 'dc:creator').text = get_text_tna(x, 'Author/AuthorCompoundName')
|
|
227
256
|
ET.SubElement(r, 'dc:publisher').text = get_text_tna(x, 'Publisher/PublisherCompoundName')
|
|
@@ -229,51 +258,53 @@ class FormatInfo:
|
|
|
229
258
|
for id in x.findall(TNA('DocumentIdentifier')):
|
|
230
259
|
type = get_text_tna(id, 'IdentifierType')
|
|
231
260
|
if type == 'URL':
|
|
232
|
-
ET.SubElement(r, 'dc:identifier').text = "http://"+get_text_tna(id, 'Identifier')
|
|
261
|
+
ET.SubElement(r, 'dc:identifier').text = "http://" + get_text_tna(id, 'Identifier')
|
|
233
262
|
else:
|
|
234
|
-
ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
|
|
263
|
+
ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
|
|
235
264
|
ET.SubElement(r, 'dc:description').text = get_text_tna(x, 'DocumentNote')
|
|
236
265
|
ET.SubElement(r, 'dc:type').text = get_text_tna(x, 'DocumentType')
|
|
237
|
-
ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription')+" "+get_text_tna(x, 'AvailabilityNote')
|
|
266
|
+
ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription') + " " + get_text_tna(x, 'AvailabilityNote')
|
|
238
267
|
ET.SubElement(r, 'dc:rights').text = get_text_tna(x, 'DocumentIPR')
|
|
239
|
-
#
|
|
268
|
+
# Examples
|
|
240
269
|
for x in pronom_format.findall(TNA("ReferenceFile")):
|
|
241
|
-
rf = ET.SubElement(fido_details,'example_file')
|
|
270
|
+
rf = ET.SubElement(fido_details, 'example_file')
|
|
242
271
|
ET.SubElement(rf, 'dc:title').text = get_text_tna(x, 'ReferenceFileName')
|
|
243
272
|
ET.SubElement(rf, 'dc:description').text = get_text_tna(x, 'ReferenceFileDescription')
|
|
244
273
|
checksum = ""
|
|
245
274
|
for id in x.findall(TNA('ReferenceFileIdentifier')):
|
|
246
275
|
type = get_text_tna(id, 'IdentifierType')
|
|
247
276
|
if type == 'URL':
|
|
248
|
-
|
|
249
|
-
|
|
277
|
+
# Starting with PRONOM 89, some URLs contain http://
|
|
278
|
+
# and others do not.
|
|
279
|
+
url = get_text_tna(id, 'Identifier')
|
|
280
|
+
if not urlparse(url).scheme:
|
|
281
|
+
url = "http://" + url
|
|
282
|
+
ET.SubElement(rf, 'dc:identifier').text = url
|
|
250
283
|
# And calculate the checksum of this resource:
|
|
251
284
|
m = hashlib.md5()
|
|
252
|
-
sock =
|
|
285
|
+
sock = urlopen(url)
|
|
253
286
|
m.update(sock.read())
|
|
254
287
|
sock.close()
|
|
255
|
-
checksum=m.hexdigest()
|
|
288
|
+
checksum = m.hexdigest()
|
|
256
289
|
else:
|
|
257
|
-
ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
|
|
290
|
+
ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
|
|
258
291
|
ET.SubElement(rf, 'dcterms:license').text = ""
|
|
259
292
|
ET.SubElement(rf, 'dc:rights').text = get_text_tna(x, 'ReferenceFileIPR')
|
|
260
293
|
checksumElement = ET.SubElement(rf, 'checksum')
|
|
261
294
|
checksumElement.text = checksum
|
|
262
295
|
checksumElement.attrib['type'] = "md5"
|
|
263
296
|
# Record Metadata
|
|
264
|
-
md = ET.SubElement(fido_details,'record_metadata')
|
|
265
|
-
ET.SubElement(md, 'status').text ='unknown'
|
|
297
|
+
md = ET.SubElement(fido_details, 'record_metadata')
|
|
298
|
+
ET.SubElement(md, 'status').text = 'unknown'
|
|
266
299
|
ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
|
|
267
300
|
ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
|
|
268
301
|
ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
|
|
269
|
-
ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
|
|
270
|
-
return fido_format
|
|
271
|
-
|
|
272
|
-
#FIXME: I don't think that this quite works yet!
|
|
302
|
+
ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
|
|
303
|
+
return fido_format
|
|
304
|
+
|
|
305
|
+
# FIXME: I don't think that this quite works yet!
|
|
273
306
|
def _sort_formats(self, formatlist):
|
|
274
|
-
"""Sort the format list based on their priority relationships so higher priority
|
|
275
|
-
formats appear earlier in the list.
|
|
276
|
-
"""
|
|
307
|
+
"""Sort the format list based on their priority relationships so higher priority formats appear earlier in the list."""
|
|
277
308
|
def compare_formats(f1, f2):
|
|
278
309
|
f1ID = f1.find('puid').text
|
|
279
310
|
f2ID = f2.find('puid').text
|
|
@@ -291,9 +322,9 @@ class FormatInfo:
|
|
|
291
322
|
return 1
|
|
292
323
|
return sorted(formatlist, cmp=compare_formats)
|
|
293
324
|
|
|
325
|
+
|
|
294
326
|
def fido_position(pronom_position):
|
|
295
|
-
"""
|
|
296
|
-
"""
|
|
327
|
+
"""Return BOF/EOF/VAR instead of the more verbose pronom position names."""
|
|
297
328
|
if pronom_position == 'Absolute from BOF':
|
|
298
329
|
return 'BOF'
|
|
299
330
|
elif pronom_position == 'Absolute from EOF':
|
|
@@ -302,16 +333,20 @@ def fido_position(pronom_position):
|
|
|
302
333
|
return 'VAR'
|
|
303
334
|
elif pronom_position == 'Indirect From BOF':
|
|
304
335
|
return 'IFB'
|
|
305
|
-
else:
|
|
306
|
-
sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
|
|
336
|
+
else: # to make sure FIDO does not crash (IFB aftermath)
|
|
337
|
+
sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
|
|
307
338
|
return 'VAR'
|
|
308
339
|
|
|
340
|
+
|
|
309
341
|
def _convert_err_msg(msg, c, i, chars):
    """Format a conversion diagnostic for character `c` at index `i` of `chars`.

    The message shows `chars` with a caret under position `i`, followed by
    the current contents of the module-level `buf`.
    """
    template = "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}"
    caret_padding = i * ' '
    return template.format(msg, c, i, chars, caret_padding, buf.getvalue())
|
|
311
343
|
|
|
344
|
+
|
|
312
345
|
def doByte(chars, i, littleendian):
|
|
313
|
-
"""
|
|
314
|
-
|
|
346
|
+
"""
|
|
347
|
+
Convert two chars[i] and chars[i+1] into a byte.
|
|
348
|
+
|
|
349
|
+
@return a tuple (byte, 2)
|
|
315
350
|
"""
|
|
316
351
|
c1 = '0123456789ABCDEF'.find(chars[i].upper())
|
|
317
352
|
c2 = '0123456789ABCDEF'.find(chars[i + 1].upper())
|
|
@@ -323,12 +358,7 @@ def doByte(chars, i, littleendian):
|
|
|
323
358
|
val = chr(c1 + 16 * c2)
|
|
324
359
|
return (escape(val), 2)
|
|
325
360
|
|
|
326
|
-
|
|
327
|
-
# MdR: took out '<' and '>' out of _ordinary because they were converted to entities <>
|
|
328
|
-
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
|
|
329
|
-
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
|
330
|
-
_special = '$()*+.?![]^\\{|}'
|
|
331
|
-
_hex = '0123456789abcdef'
|
|
361
|
+
|
|
332
362
|
def _escape_char(c):
|
|
333
363
|
if c in '\n':
|
|
334
364
|
return '\\n'
|
|
@@ -340,68 +370,66 @@ def _escape_char(c):
|
|
|
340
370
|
(high, low) = divmod(ord(c), 16)
|
|
341
371
|
return '\\x' + _hex[high] + _hex[low]
|
|
342
372
|
|
|
373
|
+
|
|
343
374
|
def escape(string):
    """Return `string` with every character outside `_ordinary` replaced by its escaped form."""
    pieces = []
    for ch in string:
        pieces.append(ch if ch in _ordinary else _escape_char(ch))
    return ''.join(pieces)
|
|
346
377
|
|
|
378
|
+
|
|
347
379
|
def calculate_repetition(char, pos, offset, maxoffset):
    """Build the regex repetition of `char` for an offset/maxoffset pair.

    Recursively calculates offset/maxoffset repetition, when one or both
    offsets is greater than 65535 bytes (64KB).
    See: https://bugs.python.org/issue13169.

    `offset` and `maxoffset` are decimal strings (`maxoffset` may be None
    for "no maximum").  `pos` selects the form: "BOF"/"EOF" and "IFB" emit
    a quantifier, any other value emits nothing.  Returns the quantifier
    text, possibly empty.
    """
    parts = []

    calcremain = False
    offsetremain = 0
    # None, not 0: the remainder is passed back in as `maxoffset`, where a
    # non-None value is concatenated as a string.  An integer 0 made
    # ',' + maxoffset raise TypeError whenever only `offset` overflowed.
    maxoffsetremain = None

    # Cap each bound at 65535 and carry the excess into a follow-up quantifier.
    if offset is not None and int(offset) > 65535:
        offsetremain = str(int(offset) - 65535)
        offset = '65535'
        calcremain = True
    if maxoffset is not None and int(maxoffset) > 65535:
        maxoffsetremain = str(int(maxoffset) - 65535)
        maxoffset = '65535'
        calcremain = True

    if pos == "BOF" or pos == "EOF":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + maxoffset)
            parts.append('}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if pos == "IFB":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + maxoffset)
            parts.append('}')
            # NOTE(review): this appends a literal ',}' after the closing
            # brace; kept as-is, but it looks unintended - confirm.
            if maxoffset is not None:
                parts.append(',}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if calcremain:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offsetremain, maxoffsetremain))

    return ''.join(parts)
|
|
397
422
|
|
|
423
|
+
|
|
398
424
|
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
399
|
-
"""Convert
|
|
400
|
-
@param chars, a pronom bytesequence, into a
|
|
401
|
-
@return regular expression.
|
|
402
|
-
Endianness is not used.
|
|
403
425
|
"""
|
|
426
|
+
Convert to regular expression.
|
|
427
|
+
|
|
428
|
+
Endianness is not used.
|
|
404
429
|
|
|
430
|
+
@param chars, a pronom bytesequence, into a
|
|
431
|
+
@return regular expression.
|
|
432
|
+
"""
|
|
405
433
|
if 'Big' in endianness:
|
|
406
434
|
littleendian = False
|
|
407
435
|
else:
|
|
@@ -410,24 +438,26 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
410
438
|
offset = '0'
|
|
411
439
|
if len(maxoffset) == 0:
|
|
412
440
|
maxoffset = None
|
|
441
|
+
if maxoffset == '0':
|
|
442
|
+
maxoffset = None
|
|
413
443
|
# make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
|
|
414
444
|
global buf
|
|
415
|
-
buf = cStringIO
|
|
416
|
-
buf.write("(?s)")
|
|
445
|
+
buf = cStringIO()
|
|
446
|
+
buf.write("(?s)") # If a regex starts with (?s), it is equivalent to DOTALL.
|
|
417
447
|
i = 0
|
|
418
448
|
state = 'start'
|
|
419
449
|
if 'BOF' in pos:
|
|
420
|
-
buf.write('\\A')
|
|
450
|
+
buf.write('\\A') # start of regex
|
|
421
451
|
buf.write(calculate_repetition('.', pos, offset, maxoffset))
|
|
422
|
-
|
|
452
|
+
|
|
423
453
|
if 'IFB' in pos:
|
|
424
454
|
buf.write('\\A')
|
|
425
455
|
buf.write(calculate_repetition('.', pos, offset, maxoffset))
|
|
426
|
-
|
|
456
|
+
|
|
427
457
|
while True:
|
|
428
458
|
if i == len(chars):
|
|
429
459
|
break
|
|
430
|
-
#print _convert_err_msg(state,chars[i],i,chars)
|
|
460
|
+
# print _convert_err_msg(state,chars[i],i,chars)
|
|
431
461
|
if state == 'start':
|
|
432
462
|
if chars[i].isalnum():
|
|
433
463
|
state = 'bytes'
|
|
@@ -471,7 +501,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
471
501
|
(byt, inc) = doByte(chars, i, littleendian)
|
|
472
502
|
buf.write(byt)
|
|
473
503
|
i += inc
|
|
474
|
-
#assert(chars[i] == ':')
|
|
504
|
+
# assert(chars[i] == ':')
|
|
475
505
|
if chars[i] != ':':
|
|
476
506
|
return "__INCOMPATIBLE_SIG__"
|
|
477
507
|
buf.write('-')
|
|
@@ -479,13 +509,13 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
479
509
|
(byt, inc) = doByte(chars, i, littleendian)
|
|
480
510
|
buf.write(byt)
|
|
481
511
|
i += inc
|
|
482
|
-
#assert(chars[i] == ']')
|
|
512
|
+
# assert(chars[i] == ']')
|
|
483
513
|
if chars[i] != ']':
|
|
484
514
|
return "__INCOMPATIBLE_SIG__"
|
|
485
515
|
buf.write(']')
|
|
486
516
|
i += 1
|
|
487
517
|
except Exception:
|
|
488
|
-
print
|
|
518
|
+
print(_convert_err_msg('Illegal character in bracket', chars[i], i, chars))
|
|
489
519
|
raise
|
|
490
520
|
if i < len(chars) and chars[i] == '{':
|
|
491
521
|
state = 'curly-after-bracket'
|
|
@@ -511,7 +541,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
511
541
|
(byt, inc) = doByte(chars, i, littleendian)
|
|
512
542
|
buf.write(byt)
|
|
513
543
|
i += inc
|
|
514
|
-
#assert(chars[i] == ':')
|
|
544
|
+
# assert(chars[i] == ':')
|
|
515
545
|
if chars[i] != ':':
|
|
516
546
|
return "__INCOMPATIBLE_SIG__"
|
|
517
547
|
buf.write('-')
|
|
@@ -519,8 +549,8 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
519
549
|
(byt, inc) = doByte(chars, i, littleendian)
|
|
520
550
|
buf.write(byt)
|
|
521
551
|
i += inc
|
|
522
|
-
|
|
523
|
-
#assert(chars[i] == ']')
|
|
552
|
+
|
|
553
|
+
# assert(chars[i] == ']')
|
|
524
554
|
if chars[i] != ']':
|
|
525
555
|
return "__INCOMPATIBLE_SIG__"
|
|
526
556
|
buf.write(']')
|
|
@@ -537,7 +567,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
537
567
|
# when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
|
|
538
568
|
# The above, while sensible, appears to be incorrect. A '.' is always needed.
|
|
539
569
|
# for droid equiv behavior
|
|
540
|
-
#if state == 'curly':
|
|
570
|
+
# if state == 'curly':
|
|
541
571
|
buf.write('.')
|
|
542
572
|
buf.write('{')
|
|
543
573
|
i += 1 # skip the (
|
|
@@ -548,7 +578,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
548
578
|
elif chars[i] == '-':
|
|
549
579
|
buf.write(',')
|
|
550
580
|
i += 1
|
|
551
|
-
elif chars[i] == '*':
|
|
581
|
+
elif chars[i] == '*': # skip the *
|
|
552
582
|
i += 1
|
|
553
583
|
elif chars[i] == '}':
|
|
554
584
|
break
|
|
@@ -581,36 +611,35 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
|
581
611
|
buf.close()
|
|
582
612
|
return val
|
|
583
613
|
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
parser
|
|
606
|
-
parser.add_argument('-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
614
|
+
|
|
615
|
+
def run(input=None, output=None, puid=None):
    """
    Convert PRONOM formats into FIDO signatures.

    @param input, source handed to FormatInfo; when None the zip file
           reported by the local PRONOM versions object is used.
    @param output, destination handed to FormatInfo.save; when None the
           signature file reported by the local PRONOM versions object is used.
    @param puid, forwarded unchanged to load_pronom_xml.
    """
    # The versions object is always fetched, even when both paths are given.
    local_versions = get_local_pronom_versions()
    source = local_versions.get_zip_file() if input is None else input
    target = local_versions.get_signature_file() if output is None else output

    format_info = FormatInfo(source)
    format_info.load_pronom_xml(puid)
    format_info.save(target)

    # The summary goes to stderr, not stdout.
    summary = 'Converted {0} PRONOM formats to FIDO signatures'.format(len(format_info.formats))
    print(summary, file=sys.stderr)
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def main(args=None):
    """
    Main CLI entrypoint: parse the command-line options and call run().

    @param args, list of command-line arguments without the program name;
           when None, sys.argv[1:] is used.
    """
    if args is None:
        args = sys.argv[1:]

    parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
    parser.add_argument('-input', default=None, help='Input file, a Zip containing PRONOM XML files')
    # Help text fixed: it previously read 'Ouptut file'.
    parser.add_argument('-output', default=None, help='Output file')
    parser.add_argument('-puid', default=None, help='A particular PUID record to extract')
    options = parser.parse_args(args)

    # Options left unset stay None and are passed through as such.
    run(input=options.input, output=options.output, puid=options.puid)
|
|
642
|
+
|
|
643
|
+
|
|
615
644
|
if __name__ == '__main__':
|
|
616
|
-
main()
|
|
645
|
+
main()
|