libis-format 0.9.30 → 0.9.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/libis/format/converter/image_converter.rb +2 -2
- data/lib/libis/format/office_to_pdf.rb +1 -1
- data/lib/libis/format/version.rb +1 -1
- data/spec/converter_spec.rb +43 -27
- data/spec/data/test-options.png +0 -0
- data/spec/data/test.pdf.tif +0 -0
- data/tools/droid/{DROID_SignatureFile_V82.xml → DROID_SignatureFile_V90.xml} +8202 -701
- data/tools/droid/{container-signature-20150307.xml → container-signature-20170330.xml} +3584 -2235
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +152 -154
- data/tools/droid/droid.sh +30 -16
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/{droid-help-6.1.5.jar → droid-help-6.3.jar} +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/fido/__init__.py +50 -0
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +2 -0
- data/tools/fido/conf/{container-signature-20150307.xml → container-signature-20170330.xml} +1487 -141
- data/tools/fido/conf/format_extensions.xml +0 -14
- data/tools/fido/conf/{formats-v81.xml → formats-v90.xml} +11409 -887
- data/tools/fido/conf/{pronom-xml-v81.zip → pronom-xml-v90.zip} +0 -0
- data/tools/fido/conf/versions.xml +6 -6
- data/tools/fido/fido.py +437 -407
- data/tools/fido/package.py +96 -0
- data/tools/fido/prepare.py +217 -188
- data/tools/fido/pronomutils.py +143 -58
- data/tools/fido/toxml.py +54 -46
- data/tools/fido/update_signatures.py +139 -127
- metadata +34 -40
- data/tools/droid/droid-command-line-6.1.5.jar +0 -0
- data/tools/droid/lib/antlr-2.7.7.jar +0 -0
- data/tools/droid/lib/antlr-3.2.jar +0 -0
- data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
- data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-codec-1.4.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
- data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
- data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
- data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
- data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
- data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
- data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
- data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/poi-3.7.jar +0 -0
- data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
- data/tools/fido/argparselocal.py +0 -2355
- data/tools/fido/conf/DROID_SignatureFile-v81.xml +0 -2
|
Binary file
|
|
data/tools/fido/conf/versions.xml
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
<?xml version=
|
|
1
|
+
<?xml version='1.0' encoding='utf-8'?>
|
|
2
2
|
<versions>
|
|
3
|
-
<pronomVersion>81</pronomVersion>
|
|
4
|
-
<pronomSignature>formats-v81.xml</pronomSignature>
|
|
5
|
-
<pronomContainerSignature>container-signature-20150307.xml</pronomContainerSignature>
|
|
3
|
+
<pronomVersion>90</pronomVersion>
|
|
4
|
+
<pronomSignature>formats-v90.xml</pronomSignature>
|
|
5
|
+
<pronomContainerSignature>container-signature-20170330.xml</pronomContainerSignature>
|
|
6
6
|
<fidoExtensionSignature>format_extensions.xml</fidoExtensionSignature>
|
|
7
|
-
<updateScript>1.
|
|
8
|
-
</versions>
|
|
7
|
+
<updateScript>1.3.6</updateScript>
|
|
8
|
+
</versions>
|
data/tools/fido/fido.py
CHANGED
|
@@ -1,57 +1,75 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Format Identification for Digital Objects (FIDO).
|
|
6
|
+
|
|
7
|
+
FIDO is a command-line tool to identify the file formats of digital objects.
|
|
8
|
+
It is designed for simple integration into automated work-flows.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import absolute_import
|
|
12
|
+
|
|
13
|
+
from argparse import ArgumentParser, RawTextHelpFormatter
|
|
14
|
+
from contextlib import closing
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
import tarfile
|
|
19
|
+
import tempfile
|
|
20
|
+
import time
|
|
5
21
|
from xml.etree import cElementTree as ET
|
|
6
22
|
from xml.etree import ElementTree as CET
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
'
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
23
|
+
import zipfile
|
|
24
|
+
|
|
25
|
+
from six.moves import range
|
|
26
|
+
|
|
27
|
+
from . import __version__, CONFIG_DIR
|
|
28
|
+
from .package import OlePackage, ZipPackage
|
|
29
|
+
from .pronomutils import get_local_pronom_versions
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
defaults = {
|
|
33
|
+
'bufsize': 128 * 1024, # (bytes)
|
|
34
|
+
'regexcachesize': 2084, # (bytes)
|
|
35
|
+
'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
|
|
36
|
+
'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
|
|
37
|
+
'format_files': [
|
|
38
|
+
'formats-v88.xml',
|
|
39
|
+
'format_extensions.xml'
|
|
40
|
+
],
|
|
41
|
+
'containersignature_file': 'container-signature-20170330.xml',
|
|
42
|
+
'container_bufsize': 512 * 1024, # (bytes)
|
|
43
|
+
'description': """Format Identification for Digital Objects (fido).
|
|
44
|
+
FIDO is a command-line tool to identify the file formats of digital objects.
|
|
45
|
+
It is designed for simple integration into automated work-flows.""",
|
|
46
|
+
'epilog': """
|
|
47
|
+
Open Planets Foundation (http://www.openplanetsfoundation.org)
|
|
48
|
+
See License.txt for license information.
|
|
49
|
+
Download from: https://github.com/openplanets/fido/releases
|
|
50
|
+
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
|
|
51
|
+
Author: Adam Farquhar (BL), 2010
|
|
52
|
+
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
|
|
53
|
+
FIDO uses the UK National Archives (TNA) PRONOM File Format
|
|
54
|
+
and Container descriptions.
|
|
55
|
+
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/""",
|
|
36
56
|
}
|
|
37
57
|
|
|
58
|
+
|
|
38
59
|
class Fido:
|
|
39
|
-
def __init__(self, quiet=False, bufsize=None, container_bufsize
|
|
60
|
+
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=CONFIG_DIR, format_files=None, containersignature_file=None):
|
|
40
61
|
global defaults
|
|
41
62
|
self.quiet = quiet
|
|
42
|
-
self.bufsize =
|
|
43
|
-
self.container_bufsize =
|
|
44
|
-
self.printmatch =
|
|
45
|
-
self.printnomatch =
|
|
46
|
-
self.handle_matches =
|
|
63
|
+
self.bufsize = defaults['bufsize'] if bufsize is None else bufsize
|
|
64
|
+
self.container_bufsize = defaults['container_bufsize'] if container_bufsize is None else container_bufsize
|
|
65
|
+
self.printmatch = defaults['printmatch'] if printmatch is None else printmatch
|
|
66
|
+
self.printnomatch = defaults['printnomatch'] if printnomatch is None else printnomatch
|
|
67
|
+
self.handle_matches = self.print_matches if handle_matches is None else handle_matches
|
|
47
68
|
self.zip = zip
|
|
48
|
-
self.nocontainer =
|
|
49
|
-
self.conf_dir =
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
self.format_files = defaults['format_files'] if format_files == None else format_files
|
|
53
|
-
#self.containersignature_file = defaults['containersignature_file'] if containersignature_file == None else containersignature_file
|
|
54
|
-
self.containersignature_file = defaults['containersignature_file'] #if containersignature_file == None else containersignature_file
|
|
69
|
+
self.nocontainer = nocontainer
|
|
70
|
+
self.conf_dir = conf_dir
|
|
71
|
+
self.format_files = defaults['format_files'] if format_files is None else format_files
|
|
72
|
+
self.containersignature_file = defaults['containersignature_file']
|
|
55
73
|
self.formats = []
|
|
56
74
|
self.puid_format_map = {}
|
|
57
75
|
self.puid_has_priority_over_map = {}
|
|
@@ -69,10 +87,10 @@ class Fido:
|
|
|
69
87
|
self.externalsig = ET.XML('<signature><name>External</name></signature>')
|
|
70
88
|
|
|
71
89
|
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
|
72
|
-
|
|
73
|
-
_special = '$()*+.?![]^\\{|}'
|
|
90
|
+
_special = '$()*+.?![]^\\{|}' # Before: '$*+.?![]^\\{|}'
|
|
74
91
|
_hex = '0123456789abcdef'
|
|
75
|
-
|
|
92
|
+
|
|
93
|
+
def _escape_char(self, c):
|
|
76
94
|
if c in '\n':
|
|
77
95
|
return '\\n'
|
|
78
96
|
elif c == '\r':
|
|
@@ -82,17 +100,22 @@ class Fido:
|
|
|
82
100
|
else:
|
|
83
101
|
(high, low) = divmod(ord(c), 16)
|
|
84
102
|
return '\\x' + self._hex[high] + self._hex[low]
|
|
85
|
-
|
|
86
|
-
def escape(self,string):
|
|
87
|
-
"
|
|
103
|
+
|
|
104
|
+
def escape(self, string):
|
|
105
|
+
"""
|
|
106
|
+
Escape characters in pattern that are non-printable, non-ascii, or
|
|
107
|
+
special for regexes.
|
|
108
|
+
"""
|
|
88
109
|
escaped = ''.join(c if c in self._ordinary else self._escape_char(c) for c in string)
|
|
89
110
|
return escaped
|
|
90
111
|
|
|
91
|
-
def convert_container_sequence(self,sig):
|
|
92
|
-
"""
|
|
93
|
-
and convert to regular
|
|
112
|
+
def convert_container_sequence(self, sig):
|
|
113
|
+
"""
|
|
114
|
+
Parse the PRONOM container sequences and convert to regular
|
|
115
|
+
expressions.
|
|
94
116
|
"""
|
|
95
|
-
|
|
117
|
+
# The sequence is regex matching bytes from a file so the sequence must also be bytes
|
|
118
|
+
seq = b'(?s)'
|
|
96
119
|
inq = False
|
|
97
120
|
byt = False
|
|
98
121
|
rng = False
|
|
@@ -105,41 +128,41 @@ class Fido:
|
|
|
105
128
|
if sig[i] == " ":
|
|
106
129
|
continue
|
|
107
130
|
if sig[i] == "[":
|
|
108
|
-
seq += "("
|
|
131
|
+
seq += b"("
|
|
109
132
|
rng = True
|
|
110
133
|
continue
|
|
111
134
|
if not byt:
|
|
112
|
-
seq += "\\x" + sig[i].lower()
|
|
135
|
+
seq += b"\\x" + sig[i].lower().encode('utf8')
|
|
113
136
|
byt = True
|
|
114
137
|
continue
|
|
115
138
|
if byt:
|
|
116
|
-
seq += sig[i].lower()
|
|
139
|
+
seq += sig[i].lower().encode('utf8')
|
|
117
140
|
byt = False
|
|
118
141
|
continue
|
|
119
142
|
if inq:
|
|
120
143
|
if sig[i] == "'" and not rng:
|
|
121
144
|
inq = False
|
|
122
145
|
continue
|
|
123
|
-
seq += self.escape(sig[i])
|
|
146
|
+
seq += self.escape(sig[i]).encode('utf8')
|
|
124
147
|
continue
|
|
125
148
|
if rng:
|
|
126
149
|
if sig[i] == "]":
|
|
127
|
-
seq += ")"
|
|
150
|
+
seq += b")"
|
|
128
151
|
rng = False
|
|
129
152
|
continue
|
|
130
153
|
if sig[i] != "-" and sig[i] != "'" and ror:
|
|
131
|
-
seq += self.escape(sig[i])
|
|
154
|
+
seq += self.escape(sig[i]).encode('utf8')
|
|
132
155
|
continue
|
|
133
|
-
if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and not byt:
|
|
134
|
-
seq += "\\x" + sig[i].lower()
|
|
156
|
+
if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and sig[i] != ":" and not ror and not byt:
|
|
157
|
+
seq += b"\\x" + sig[i].lower().encode('utf8')
|
|
135
158
|
byt = True
|
|
136
159
|
continue
|
|
137
160
|
if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and byt:
|
|
138
|
-
seq += sig[i].lower()
|
|
161
|
+
seq += sig[i].lower().encode('utf8')
|
|
139
162
|
byt = False
|
|
140
163
|
continue
|
|
141
164
|
if sig[i] == "-" or sig[i] == " ":
|
|
142
|
-
seq += "|"
|
|
165
|
+
seq += b"|"
|
|
143
166
|
continue
|
|
144
167
|
if sig[i] == "'" and not ror:
|
|
145
168
|
ror = True
|
|
@@ -147,12 +170,13 @@ class Fido:
|
|
|
147
170
|
if sig[i] == "'" and ror:
|
|
148
171
|
ror = False
|
|
149
172
|
continue
|
|
150
|
-
|
|
173
|
+
|
|
151
174
|
return seq
|
|
152
|
-
|
|
175
|
+
|
|
153
176
|
def load_container_signature(self, containersignature_file):
|
|
154
|
-
"""
|
|
155
|
-
and convert sequences to
|
|
177
|
+
"""
|
|
178
|
+
Load the PRONOM container-signature file and convert sequences to
|
|
179
|
+
regular expressions.
|
|
156
180
|
"""
|
|
157
181
|
tree = CET.parse(containersignature_file)
|
|
158
182
|
# load and have container signatures converted
|
|
@@ -163,11 +187,6 @@ class Fido:
|
|
|
163
187
|
self.sequenceSignature[signatureId] = []
|
|
164
188
|
for sequence in signatureSequence:
|
|
165
189
|
self.sequenceSignature[signatureId].append(self.convert_container_sequence(sequence[0].text))
|
|
166
|
-
# find PUIDs which trigger container matching
|
|
167
|
-
self.puidTriggers = {}
|
|
168
|
-
triggers = tree.find('TriggerPuids')
|
|
169
|
-
for puid in triggers.findall('TriggerPuid'):
|
|
170
|
-
self.puidTriggers[puid.get('Puid')] = True
|
|
171
190
|
# map PUID to container signatureId
|
|
172
191
|
self.puidMapping = {}
|
|
173
192
|
mappings = tree.find('FileFormatMappings')
|
|
@@ -175,24 +194,72 @@ class Fido:
|
|
|
175
194
|
if mapping.get('signatureId') not in self.puidMapping:
|
|
176
195
|
self.puidMapping[mapping.get('signatureId')] = []
|
|
177
196
|
self.puidMapping[mapping.get('signatureId')].append(mapping.get('Puid'))
|
|
178
|
-
#
|
|
179
|
-
#
|
|
180
|
-
#
|
|
181
|
-
|
|
197
|
+
# print "sequences:\n",self.sequenceSignature
|
|
198
|
+
# print "mapping:\n",self.puidMapping
|
|
199
|
+
# exit()
|
|
200
|
+
|
|
201
|
+
def extract_signatures(self, doc, signature_type="ZIP"):
|
|
202
|
+
"""
|
|
203
|
+
Given an XML container signature file, returns a dictionary of signatures.
|
|
204
|
+
|
|
205
|
+
The format of the dictionary is:
|
|
206
|
+
|
|
207
|
+
{
|
|
208
|
+
path_to_file_inside_zip: {puid: [signatures]}
|
|
209
|
+
}
|
|
210
|
+
"""
|
|
211
|
+
root = doc.getroot()
|
|
212
|
+
format_mappings = root.find("FileFormatMappings")
|
|
213
|
+
|
|
214
|
+
def get_puid(doc, element_id):
|
|
215
|
+
return format_mappings.find('FileFormatMapping[@signatureId="{}"]'.format(element_id)).attrib["Puid"]
|
|
216
|
+
|
|
217
|
+
def format_signature_attributes(element):
|
|
218
|
+
return {
|
|
219
|
+
"path": element.findtext("Files/File/Path"),
|
|
220
|
+
"id": element.attrib["Id"],
|
|
221
|
+
"signature": self.convert_container_sequence(element.findtext("Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence/Sequence"))
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
elements = root.findall("ContainerSignatures/ContainerSignature[@ContainerType=\"{}\"]".format(signature_type))
|
|
225
|
+
signatures = {}
|
|
226
|
+
for el in elements:
|
|
227
|
+
if el.find("Files/File/BinarySignatures") is None:
|
|
228
|
+
continue
|
|
229
|
+
|
|
230
|
+
puid = get_puid(doc, el.attrib["Id"])
|
|
231
|
+
signature = format_signature_attributes(el)
|
|
232
|
+
path = signature["path"]
|
|
233
|
+
if path not in signatures:
|
|
234
|
+
signatures[path] = {}
|
|
235
|
+
if puid not in signatures[path]:
|
|
236
|
+
signatures[path][puid] = []
|
|
237
|
+
signatures[path][puid].append(format_signature_attributes(el))
|
|
238
|
+
return signatures
|
|
239
|
+
|
|
240
|
+
def match_container(self, signature_type, klass, file, signature_file):
|
|
241
|
+
puids = klass(file, self.extract_signatures(signature_file, signature_type=signature_type)).detect_formats()
|
|
242
|
+
results = []
|
|
243
|
+
for puid in puids:
|
|
244
|
+
format = self.puid_format_map[puid]
|
|
245
|
+
signature = format.findtext("name")
|
|
246
|
+
results.append((format, signature))
|
|
247
|
+
return results
|
|
182
248
|
|
|
183
249
|
def load_fido_xml(self, file):
|
|
184
|
-
"""
|
|
185
|
-
|
|
186
|
-
|
|
250
|
+
"""
|
|
251
|
+
Load the fido format information from @param file.
|
|
252
|
+
As a side-effect, set self.formats.
|
|
253
|
+
@return list of ElementTree.Element, one for each format.
|
|
187
254
|
"""
|
|
188
255
|
tree = ET.parse(file)
|
|
189
|
-
#print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
|
|
190
|
-
#TODO: Handle empty regexes properly; perhaps remove from the format list
|
|
256
|
+
# print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
|
|
257
|
+
# TODO: Handle empty regexes properly; perhaps remove from the format list
|
|
191
258
|
for element in tree.getroot().findall('./format'):
|
|
192
259
|
puid = self.get_puid(element)
|
|
193
260
|
# Handle over-writes in multiple file loads
|
|
194
|
-
existing = self.puid_format_map.get(puid, False)
|
|
195
|
-
if
|
|
261
|
+
existing = self.puid_format_map.get(puid, False)
|
|
262
|
+
if existing:
|
|
196
263
|
# Already have one, so replace old with new!
|
|
197
264
|
self.formats[self.formats.index(existing)] = element
|
|
198
265
|
else:
|
|
@@ -205,31 +272,33 @@ class Fido:
|
|
|
205
272
|
# To delete a format: (1) remove from self.formats, (2) remove from puid_format_map, (3) remove from selt.puid_has_priority_over_map
|
|
206
273
|
def get_signatures(self, format):
|
|
207
274
|
return format.findall('signature')
|
|
208
|
-
|
|
275
|
+
|
|
209
276
|
def has_priority_over(self, format, possibly_inferior):
|
|
210
277
|
return self.get_puid(possibly_inferior)in self.puid_has_priority_over_map[self.get_puid(format)]
|
|
211
|
-
|
|
278
|
+
|
|
212
279
|
def get_puid(self, format):
|
|
213
280
|
return format.find('puid').text
|
|
214
|
-
|
|
281
|
+
|
|
215
282
|
def get_patterns(self, signature):
|
|
216
283
|
return signature.findall('pattern')
|
|
217
|
-
|
|
218
|
-
def get_pos(self, pat):
|
|
284
|
+
|
|
285
|
+
def get_pos(self, pat):
|
|
219
286
|
return pat.find('position').text
|
|
220
|
-
|
|
287
|
+
|
|
221
288
|
def get_regex(self, pat):
|
|
222
|
-
|
|
223
|
-
|
|
289
|
+
# The regex is matching bytes from a file so regex must also be bytes
|
|
290
|
+
return pat.find('regex').text.encode('utf8')
|
|
291
|
+
|
|
224
292
|
def get_extension(self, format):
|
|
225
293
|
return format.find('extension').text
|
|
226
|
-
|
|
294
|
+
|
|
227
295
|
def print_matches(self, fullname, matches, delta_t, matchtype=''):
|
|
228
|
-
"""
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
296
|
+
"""
|
|
297
|
+
The default match handler. Prints out information for each match in the list.
|
|
298
|
+
@param fullname is name of the file being matched
|
|
299
|
+
@param matches is a list of (format, signature)
|
|
300
|
+
@param delta_t is the time taken for the match.
|
|
301
|
+
@param matchtype is the type of match (signature, containersignature, extension, fail)
|
|
233
302
|
"""
|
|
234
303
|
class Info:
|
|
235
304
|
pass
|
|
@@ -241,38 +310,60 @@ class Fido:
|
|
|
241
310
|
obj.filesize = self.current_filesize
|
|
242
311
|
obj.matchtype = matchtype
|
|
243
312
|
if len(matches) == 0:
|
|
244
|
-
sys.stdout.write(self.printnomatch % {
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
313
|
+
sys.stdout.write(self.printnomatch % {
|
|
314
|
+
"info.time": obj.time,
|
|
315
|
+
"info.filesize": obj.filesize,
|
|
316
|
+
"info.filename": obj.filename,
|
|
317
|
+
"info.count": obj.count,
|
|
318
|
+
"info.matchtype": "fail"
|
|
319
|
+
})
|
|
320
|
+
return
|
|
321
|
+
i = 0
|
|
322
|
+
for (f, sig_name) in matches:
|
|
323
|
+
i += 1
|
|
324
|
+
obj.group_index = i
|
|
325
|
+
obj.puid = self.get_puid(f)
|
|
326
|
+
obj.formatname = f.find('name').text
|
|
327
|
+
obj.signaturename = sig_name
|
|
328
|
+
mime = f.find('mime')
|
|
329
|
+
obj.mimetype = mime.text if mime is not None else None
|
|
330
|
+
version = f.find('version')
|
|
331
|
+
obj.version = version.text if version is not None else None
|
|
332
|
+
alias = f.find('alias')
|
|
333
|
+
obj.alias = alias.text if alias is not None else None
|
|
334
|
+
apple_uti = f.find('apple_uid')
|
|
335
|
+
obj.apple_uti = apple_uti.text if apple_uti is not None else None
|
|
336
|
+
sys.stdout.write(self.printmatch % {
|
|
337
|
+
"info.time": obj.time,
|
|
338
|
+
"info.puid": obj.puid,
|
|
339
|
+
"info.formatname": obj.formatname,
|
|
340
|
+
"info.signaturename": obj.signaturename,
|
|
341
|
+
"info.filesize": obj.filesize,
|
|
342
|
+
"info.filename": obj.filename,
|
|
343
|
+
"info.mimetype": obj.mimetype,
|
|
344
|
+
"info.matchtype": obj.matchtype,
|
|
345
|
+
"info.version": obj.version,
|
|
346
|
+
"info.alias": obj.alias,
|
|
347
|
+
"info.apple_uti": obj.apple_uti,
|
|
348
|
+
"info.group_size": obj.group_size,
|
|
349
|
+
"info.group_index": obj.group_index,
|
|
350
|
+
"info.count": obj.count
|
|
351
|
+
})
|
|
352
|
+
|
|
264
353
|
def print_summary(self, secs):
|
|
265
|
-
"""
|
|
354
|
+
"""
|
|
355
|
+
Print summary information on the number of matches and time taken.
|
|
266
356
|
"""
|
|
267
357
|
count = self.current_count
|
|
268
358
|
if not self.quiet:
|
|
269
359
|
rate = (int(round(count / secs)) if secs != 0 else 9999)
|
|
270
|
-
#print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
|
|
271
|
-
sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' %
|
|
272
|
-
|
|
360
|
+
# print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
|
|
361
|
+
sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
|
|
362
|
+
|
|
273
363
|
def identify_file(self, filename):
|
|
274
|
-
"""
|
|
275
|
-
|
|
364
|
+
"""
|
|
365
|
+
Identify the type of @param filename.
|
|
366
|
+
Call self.handle_matches instead of returning a value.
|
|
276
367
|
"""
|
|
277
368
|
self.current_file = filename
|
|
278
369
|
self.matchtype = "signature"
|
|
@@ -282,9 +373,19 @@ class Fido:
|
|
|
282
373
|
size = os.stat(filename)[6]
|
|
283
374
|
self.current_filesize = size
|
|
284
375
|
if self.current_filesize == 0:
|
|
285
|
-
sys.stderr.write("FIDO: Zero byte file (empty): Path is:
|
|
286
|
-
bofbuffer, eofbuffer = self.get_buffers(f, size, seekable=True)
|
|
287
|
-
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
376
|
+
sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + filename + "\n")
|
|
377
|
+
bofbuffer, eofbuffer, _ = self.get_buffers(f, size, seekable=True)
|
|
378
|
+
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
379
|
+
container_type = self.container_type(matches)
|
|
380
|
+
if container_type in ("zip", "ole"):
|
|
381
|
+
container_file = ET.parse(os.path.join(os.path.abspath(self.conf_dir), self.containersignature_file))
|
|
382
|
+
if container_type == "zip":
|
|
383
|
+
container_matches = self.match_container("ZIP", ZipPackage, filename, container_file)
|
|
384
|
+
else:
|
|
385
|
+
container_matches = self.match_container("OLE2", OlePackage, filename, container_file)
|
|
386
|
+
if len(container_matches) > 0:
|
|
387
|
+
self.handle_matches(filename, container_matches, time.clock() - t0, "container")
|
|
388
|
+
return
|
|
288
389
|
# from here is also repeated in walk_zip
|
|
289
390
|
# we should make this uniform in a next version!
|
|
290
391
|
#
|
|
@@ -296,30 +397,37 @@ class Fido:
|
|
|
296
397
|
elif len(matches) == 0 or self.current_filesize == 0:
|
|
297
398
|
matches = self.match_extensions(filename)
|
|
298
399
|
self.handle_matches(filename, matches, time.clock() - t0, "extension")
|
|
400
|
+
# only recurse into certain containers, like ZIP or TAR
|
|
401
|
+
container = self.container_type(matches)
|
|
299
402
|
# till here matey!
|
|
300
|
-
if self.zip:
|
|
301
|
-
self.identify_contents(filename, type=
|
|
403
|
+
if self.zip and self.can_recurse_into_container(container):
|
|
404
|
+
self.identify_contents(filename, type=container)
|
|
302
405
|
except IOError:
|
|
303
|
-
#print >> sys.stderr, "FIDO: Error in identify_file: Path is {0}".format(filename)
|
|
406
|
+
# print >> sys.stderr, "FIDO: Error in identify_file: Path is {0}".format(filename)
|
|
304
407
|
sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
|
|
305
408
|
|
|
306
409
|
def identify_contents(self, filename, fileobj=None, type=False):
|
|
307
|
-
"""Identify each item in a container (such as a zip or tar file). Call self.handle_matches on each item.
|
|
308
|
-
@param fileobj could be a file, or a stream.
|
|
309
410
|
"""
|
|
310
|
-
|
|
411
|
+
Identify each item in a container (such as a zip or tar file). Call
|
|
412
|
+
self.handle_matches on each item.
|
|
413
|
+
@param fileobj could be a file, or a stream.
|
|
414
|
+
"""
|
|
415
|
+
if not type:
|
|
311
416
|
return
|
|
312
417
|
elif type == 'zip':
|
|
313
418
|
self.walk_zip(filename, fileobj)
|
|
314
419
|
elif type == 'tar':
|
|
315
420
|
self.walk_tar(filename, fileobj)
|
|
316
|
-
else:
|
|
421
|
+
else: # TODO: ouch!
|
|
317
422
|
raise RuntimeError("Unknown container type: " + repr(type))
|
|
318
|
-
|
|
423
|
+
|
|
319
424
|
def identify_multi_object_stream(self, stream):
|
|
320
|
-
"""
|
|
321
|
-
|
|
322
|
-
|
|
425
|
+
"""
|
|
426
|
+
Does not work!
|
|
427
|
+
Stream may contain one or more objects each with an HTTP style header
|
|
428
|
+
that must include content-length. The headers consist of keyword:value
|
|
429
|
+
pairs terminated by a newline. There must be a newline following the
|
|
430
|
+
headers.
|
|
323
431
|
"""
|
|
324
432
|
offset = 0
|
|
325
433
|
while True:
|
|
@@ -337,10 +445,10 @@ class Fido:
|
|
|
337
445
|
content_length = int(pair[1])
|
|
338
446
|
if content_length == -1:
|
|
339
447
|
return
|
|
340
|
-
# Consume exactly content-length bytes
|
|
448
|
+
# Consume exactly content-length bytes
|
|
341
449
|
self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
|
|
342
450
|
self.current_filesize = content_length
|
|
343
|
-
bofbuffer, eofbuffer = self.get_buffers(stream, content_length)
|
|
451
|
+
bofbuffer, eofbuffer, _ = self.get_buffers(stream, content_length)
|
|
344
452
|
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
345
453
|
# MdR: this needs attention
|
|
346
454
|
if len(matches) > 0:
|
|
@@ -348,11 +456,12 @@ class Fido:
|
|
|
348
456
|
elif len(matches) == 0 or self.current_filesize == 0:
|
|
349
457
|
matches = self.match_extensions(self.current_file)
|
|
350
458
|
self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
|
|
351
|
-
|
|
459
|
+
|
|
352
460
|
def identify_stream(self, stream, filename):
|
|
353
|
-
"""
|
|
354
|
-
|
|
355
|
-
|
|
461
|
+
"""
|
|
462
|
+
Identify the type of @param stream.
|
|
463
|
+
Call self.handle_matches instead of returning a value.
|
|
464
|
+
Does not close stream.
|
|
356
465
|
"""
|
|
357
466
|
t0 = time.clock()
|
|
358
467
|
bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
|
|
@@ -381,20 +490,40 @@ class Fido:
|
|
|
381
490
|
if (os.name != "nt"):
|
|
382
491
|
self.current_file = 'STDIN'
|
|
383
492
|
self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
|
|
384
|
-
|
|
493
|
+
|
|
385
494
|
def container_type(self, matches):
|
|
386
|
-
"""Determine if one of the @param matches is the format of a container that we can look inside of (e.g., zip, tar).
|
|
387
|
-
@return False, zip, or tar.
|
|
388
495
|
"""
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
496
|
+
Determine if one of the @param matches is the format of a container
|
|
497
|
+
that we can look inside of (e.g., zip, tar).
|
|
498
|
+
@return False, zip, or tar.
|
|
499
|
+
"""
|
|
500
|
+
for (format_, unused) in matches:
|
|
501
|
+
container = format_.find('container')
|
|
502
|
+
if container is not None:
|
|
392
503
|
return container.text
|
|
504
|
+
|
|
505
|
+
# aside from checking <container> elements,
|
|
506
|
+
# check for fmt/111, which is OLE
|
|
507
|
+
puid = format_.find('puid')
|
|
508
|
+
if puid is not None and puid.text == 'fmt/111':
|
|
509
|
+
return 'ole'
|
|
393
510
|
return False
|
|
394
|
-
|
|
511
|
+
|
|
512
|
+
def can_recurse_into_container(self, container_type):
|
|
513
|
+
"""
|
|
514
|
+
Determine if the passed container type can:
|
|
515
|
+
a) be extracted, and
|
|
516
|
+
b) contain individual files which can be identified separately.
|
|
517
|
+
|
|
518
|
+
This function is useful for filtering out containers such as OLE,
|
|
519
|
+
which are usually most interesting as compound objects rather than
|
|
520
|
+
for their contents.
|
|
521
|
+
"""
|
|
522
|
+
return container_type in ('zip', 'tar')
|
|
523
|
+
|
|
395
524
|
def blocking_read(self, file, bytes_to_read):
|
|
396
525
|
bytes_read = 0
|
|
397
|
-
buffer = ''
|
|
526
|
+
buffer = b''
|
|
398
527
|
while bytes_read < bytes_to_read:
|
|
399
528
|
readbuffer = file.read(bytes_to_read - bytes_read)
|
|
400
529
|
buffer += readbuffer
|
|
@@ -403,18 +532,19 @@ class Fido:
|
|
|
403
532
|
if readbuffer == '':
|
|
404
533
|
break
|
|
405
534
|
return buffer
|
|
406
|
-
|
|
535
|
+
|
|
407
536
|
def get_buffers(self, stream, length=None, seekable=False):
|
|
408
|
-
"""Return buffers from the beginning and end of stream and the number of bytes read
|
|
409
|
-
if there may be more bytes in the stream.
|
|
410
|
-
|
|
411
|
-
If length is None, return the length as found.
|
|
412
|
-
If seekable is False, the steam does not support a seek operation.
|
|
413
537
|
"""
|
|
414
|
-
|
|
538
|
+
Return buffers from the beginning and end of stream and the number of
|
|
539
|
+
bytes read if there may be more bytes in the stream.
|
|
540
|
+
|
|
541
|
+
If length is None, return the length as found.
|
|
542
|
+
If seekable is False, the steam does not support a seek operation.
|
|
543
|
+
"""
|
|
544
|
+
bytes_to_read = self.bufsize if length is None else min(length, self.bufsize)
|
|
415
545
|
bofbuffer = self.blocking_read(stream, bytes_to_read)
|
|
416
546
|
bytes_read = len(bofbuffer)
|
|
417
|
-
if length
|
|
547
|
+
if length is None:
|
|
418
548
|
# A stream with unknown length; have to keep two buffers around
|
|
419
549
|
prevbuffer = bofbuffer
|
|
420
550
|
while True:
|
|
@@ -439,101 +569,86 @@ class Fido:
|
|
|
439
569
|
stream.seek(length - self.bufsize)
|
|
440
570
|
eofbuffer = self.blocking_read(stream, self.bufsize)
|
|
441
571
|
else:
|
|
442
|
-
# We have more to read and know how much.
|
|
572
|
+
# We have more to read and know how much.
|
|
443
573
|
# n*bufsize + r = length
|
|
444
574
|
(n, r) = divmod(bytes_unread, self.bufsize)
|
|
445
575
|
# skip n-1*bufsize bytes
|
|
446
|
-
for unused_i in
|
|
576
|
+
for unused_i in range(1, n):
|
|
447
577
|
self.blocking_read(stream, self.bufsize)
|
|
448
578
|
# skip r bytes
|
|
449
579
|
self.blocking_read(stream, r)
|
|
450
580
|
# and read the remaining bufsize bytes into the eofbuffer
|
|
451
581
|
eofbuffer = self.blocking_read(stream, self.bufsize)
|
|
452
|
-
return bofbuffer, eofbuffer
|
|
453
|
-
|
|
582
|
+
return bofbuffer, eofbuffer, bytes_to_read
|
|
583
|
+
|
|
454
584
|
def walk_zip(self, filename, fileobj=None):
|
|
455
|
-
"""Identify the type of each item in the zip
|
|
456
|
-
@param fileobj. If fileobj is not provided, open
|
|
457
|
-
@param filename.
|
|
458
|
-
Call self.handle_matches instead of returning a value.
|
|
459
585
|
"""
|
|
460
|
-
|
|
461
|
-
|
|
586
|
+
Identify the type of each item in the zip
|
|
587
|
+
@param fileobj. If fileobj is not provided, open.
|
|
588
|
+
@param filename.
|
|
589
|
+
Call self.handle_matches instead of returning a value.
|
|
590
|
+
"""
|
|
462
591
|
try:
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
if self.current_filesize
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
if len(matches) > 0 and self.current_filesize > 0:
|
|
483
|
-
self.handle_matches(item_name, matches, time.clock() - t0, "signature")
|
|
484
|
-
elif len(matches) == 0 or self.current_filesize == 0:
|
|
485
|
-
matches = self.match_extensions(item_name)
|
|
486
|
-
self.handle_matches(item_name, matches, time.clock() - t0, "extension")
|
|
487
|
-
if self.container_type(matches):
|
|
592
|
+
with zipfile.ZipFile((fileobj if fileobj else filename), 'r') as zipstream:
|
|
593
|
+
for item in zipstream.infolist():
|
|
594
|
+
if item.file_size == 0:
|
|
595
|
+
continue # TODO: Find a better test for isdir
|
|
596
|
+
t0 = time.clock()
|
|
597
|
+
with zipstream.open(item) as f:
|
|
598
|
+
item_name = filename + '!' + item.filename
|
|
599
|
+
self.current_file = item_name
|
|
600
|
+
self.current_filesize = item.file_size
|
|
601
|
+
if self.current_filesize == 0:
|
|
602
|
+
sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + item_name + "\n")
|
|
603
|
+
bofbuffer, eofbuffer, _ = self.get_buffers(f, item.file_size)
|
|
604
|
+
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
605
|
+
if len(matches) > 0 and self.current_filesize > 0:
|
|
606
|
+
self.handle_matches(item_name, matches, time.clock() - t0, "signature")
|
|
607
|
+
elif len(matches) == 0 or self.current_filesize == 0:
|
|
608
|
+
matches = self.match_extensions(item_name)
|
|
609
|
+
self.handle_matches(item_name, matches, time.clock() - t0, "extension")
|
|
610
|
+
if self.container_type(matches):
|
|
488
611
|
target = tempfile.SpooledTemporaryFile(prefix='Fido')
|
|
489
|
-
|
|
490
|
-
try:
|
|
491
|
-
source = zipstream.open(item)
|
|
612
|
+
with zipstream.open(item) as source:
|
|
492
613
|
self.copy_stream(source, target)
|
|
493
|
-
#target.seek(0)
|
|
614
|
+
# target.seek(0)
|
|
494
615
|
self.identify_contents(item_name, target, self.container_type(matches))
|
|
495
|
-
finally:
|
|
496
|
-
source.close()
|
|
497
616
|
except IOError:
|
|
498
617
|
sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
|
|
499
618
|
except zipfile.BadZipfile:
|
|
500
619
|
sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
|
|
501
|
-
|
|
502
|
-
finally:
|
|
503
|
-
if zipstream != None: zipstream.close()
|
|
504
620
|
|
|
505
621
|
def walk_tar(self, filename, fileobj):
|
|
506
|
-
"""Identify the type of each item in the tar
|
|
507
|
-
@param fileobj. If fileobj is not provided, open
|
|
508
|
-
@param filename.
|
|
509
|
-
Call self.handle_matches instead of returning a value.
|
|
510
622
|
"""
|
|
511
|
-
|
|
512
|
-
|
|
623
|
+
Identify the type of each item in the tar.
|
|
624
|
+
@param fileobj. If fileobj is not provided, open.
|
|
625
|
+
@param filename.
|
|
626
|
+
Call self.handle_matches instead of returning a value.
|
|
627
|
+
"""
|
|
513
628
|
try:
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
629
|
+
with tarfile.TarFile(filename, fileobj=fileobj, mode='r') as tarstream:
|
|
630
|
+
for item in tarstream.getmembers():
|
|
631
|
+
if not item.isfile():
|
|
632
|
+
continue
|
|
517
633
|
t0 = time.clock()
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
f.close()
|
|
634
|
+
with closing(tarstream.extractfile(item)) as f:
|
|
635
|
+
tar_item_name = filename + '!' + item.name
|
|
636
|
+
self.current_file = tar_item_name
|
|
637
|
+
self.current_filesize = item.size
|
|
638
|
+
bofbuffer, eofbuffer, _ = self.get_buffers(f, item.size)
|
|
639
|
+
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
640
|
+
self.handle_matches(tar_item_name, matches, time.clock() - t0)
|
|
641
|
+
if self.container_type(matches):
|
|
642
|
+
f.seek(0)
|
|
643
|
+
self.identify_contents(tar_item_name, f, self.container_type(matches))
|
|
529
644
|
except tarfile.TarError:
|
|
530
|
-
|
|
531
|
-
finally:
|
|
532
|
-
if tarstream != None: tarstream.close()
|
|
645
|
+
sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
|
|
533
646
|
|
|
534
647
|
def as_good_as_any(self, f1, match_list):
|
|
535
|
-
"""
|
|
536
|
-
|
|
648
|
+
"""
|
|
649
|
+
Return True if the proposed format is as good as any in the match_list.
|
|
650
|
+
For example, if there is no format in the match_list that has priority over the proposed one
|
|
651
|
+
"""
|
|
537
652
|
if match_list != []:
|
|
538
653
|
f1_puid = self.get_puid(f1)
|
|
539
654
|
for (f2, unused) in match_list:
|
|
@@ -542,9 +657,10 @@ class Fido:
|
|
|
542
657
|
elif f1_puid in self.puid_has_priority_over_map[self.get_puid(f2)]:
|
|
543
658
|
return False
|
|
544
659
|
return True
|
|
545
|
-
|
|
660
|
+
|
|
546
661
|
def buffered_read(self, file_pos, overlap):
|
|
547
|
-
"""
|
|
662
|
+
"""
|
|
663
|
+
Buffered read of data chunks.
|
|
548
664
|
"""
|
|
549
665
|
buf = ""
|
|
550
666
|
if not overlap:
|
|
@@ -552,102 +668,24 @@ class Fido:
|
|
|
552
668
|
else:
|
|
553
669
|
bufsize = self.container_bufsize + self.overlap_range
|
|
554
670
|
file_end = self.current_filesize
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
else:
|
|
560
|
-
file_read = self.bufsize
|
|
561
|
-
buf = file_handle.read(file_read)
|
|
562
|
-
return buf
|
|
563
|
-
|
|
564
|
-
def read_container(self,parent_buffer,parent_result):
|
|
565
|
-
"""Header of compound containers can be further away than default 128 KB buffer
|
|
566
|
-
especially with big files containing binary objects.
|
|
567
|
-
This function reads containers in chunks of 512 KB (defaults['container_bufsize'])
|
|
568
|
-
Each chunk is inspected with the PRONOM container sequences.
|
|
569
|
-
Each chunk smuggles in a piece from the previous chunk to prevent
|
|
570
|
-
cutting off patterns we are looking for in the middle.
|
|
571
|
-
This method is somewhat slower than reading the complete file at once.
|
|
572
|
-
This is to prevent Fido to potentially crash in the midst of scanning a very big file.
|
|
573
|
-
NOTE (MdR): this piece of code is still a bit quirky
|
|
574
|
-
as it does not yet takes byte positions into account which
|
|
575
|
-
are available in the DROID container signature file
|
|
576
|
-
"""
|
|
577
|
-
container_result = []
|
|
578
|
-
nobuffer = False
|
|
579
|
-
overlap = False
|
|
580
|
-
self.overlap_range = 512 # bytes
|
|
581
|
-
container_hit = False
|
|
582
|
-
passes = 1
|
|
583
|
-
container_buffer = ""
|
|
584
|
-
# TODO: find better way to handle zip contents
|
|
585
|
-
# for now: ugly hack, but working
|
|
586
|
-
# this slows down because the zip is re-opened on each item
|
|
587
|
-
# if "!" is in filename, it is a zip item
|
|
588
|
-
# if "!" in self.current_file:
|
|
589
|
-
# import zipfile, tempfile
|
|
590
|
-
# zip, item = self.current_file.split("!")
|
|
591
|
-
# zipitem = tempfile.SpooledTemporaryFile(prefix='Fido')
|
|
592
|
-
#with zipstream.open(item) as source:
|
|
593
|
-
# try:
|
|
594
|
-
# source = zipstream.open(item)
|
|
595
|
-
# self.copy_stream(source, target)
|
|
596
|
-
# target.seek(0)
|
|
597
|
-
# self.identify_contents(item_name, target, self.container_type(matches))
|
|
598
|
-
# finally:
|
|
599
|
-
# source.close()
|
|
600
|
-
#exit()
|
|
601
|
-
# in case argument 'nocontainer' is set
|
|
602
|
-
# read default bofbuffer
|
|
603
|
-
if self.nocontainer or self.current_filesize <= self.bufsize or self.current_file == "STDIN":
|
|
604
|
-
passes = 1
|
|
605
|
-
nobuffer = True
|
|
606
|
-
else:
|
|
607
|
-
passes = int(float(self.current_filesize / self.container_bufsize) + 1)
|
|
608
|
-
pos = 0
|
|
609
|
-
for i in xrange(passes):
|
|
610
|
-
if nobuffer is True:
|
|
611
|
-
container_buffer = parent_buffer
|
|
671
|
+
with open(self.current_file, 'rb') as file_handle:
|
|
672
|
+
file_handle.seek(file_pos)
|
|
673
|
+
if file_end - file_pos < bufsize:
|
|
674
|
+
file_read = file_end - file_pos
|
|
612
675
|
else:
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
pos = ((self.container_bufsize * i) - self.overlap_range)
|
|
617
|
-
overlap = True
|
|
618
|
-
container_buffer = self.buffered_read(pos, overlap)
|
|
619
|
-
for (container_id,container_regexes) in self.sequenceSignature.iteritems():
|
|
620
|
-
# set hitcounter in case a container entry
|
|
621
|
-
# has more than one regex
|
|
622
|
-
hitcounter = 0
|
|
623
|
-
if len(container_regexes) > 0:
|
|
624
|
-
for container_regex in container_regexes:
|
|
625
|
-
if re.search(container_regex, container_buffer):
|
|
626
|
-
hitcounter += 1
|
|
627
|
-
# if the hitcounter matches the number of regexes
|
|
628
|
-
# then it must be a positive hit, else continue
|
|
629
|
-
# to match the rest of the sequences
|
|
630
|
-
if hitcounter < len(container_regexes):
|
|
631
|
-
continue
|
|
632
|
-
self.matchtype = "container"
|
|
633
|
-
for container_puid in self.puidMapping[container_id]:
|
|
634
|
-
for container_format in self.formats:
|
|
635
|
-
if container_format.find('puid').text == container_puid:
|
|
636
|
-
if self.as_good_as_any(container_format, parent_result):
|
|
637
|
-
for container_sig in self.get_signatures(container_format):
|
|
638
|
-
container_result.append((container_format, container_sig))
|
|
639
|
-
break
|
|
640
|
-
return container_result
|
|
676
|
+
file_read = self.bufsize
|
|
677
|
+
buf = file_handle.read(file_read)
|
|
678
|
+
return buf
|
|
641
679
|
|
|
642
680
|
def match_formats(self, bofbuffer, eofbuffer):
|
|
643
|
-
"""
|
|
644
|
-
|
|
645
|
-
|
|
681
|
+
"""
|
|
682
|
+
Apply the patterns for formats to the supplied buffers.
|
|
683
|
+
@return a match list of (format, signature) tuples.
|
|
684
|
+
The list has inferior matches removed.
|
|
646
685
|
"""
|
|
647
686
|
self.current_count += 1
|
|
648
|
-
#t0 = time.clock()
|
|
687
|
+
# t0 = time.clock()
|
|
649
688
|
result = []
|
|
650
|
-
container_result = []
|
|
651
689
|
for format in self.formats:
|
|
652
690
|
try:
|
|
653
691
|
self.current_format = format
|
|
@@ -659,7 +697,7 @@ class Fido:
|
|
|
659
697
|
self.current_pat = pat
|
|
660
698
|
pos = self.get_pos(pat)
|
|
661
699
|
regex = self.get_regex(pat)
|
|
662
|
-
#print 'trying ', regex
|
|
700
|
+
# print 'trying ', regex
|
|
663
701
|
if pos == 'BOF':
|
|
664
702
|
if not re.match(regex, bofbuffer):
|
|
665
703
|
success = False
|
|
@@ -670,60 +708,55 @@ class Fido:
|
|
|
670
708
|
break
|
|
671
709
|
elif pos == 'VAR':
|
|
672
710
|
if not re.search(regex, bofbuffer):
|
|
673
|
-
success = False
|
|
711
|
+
success = False
|
|
674
712
|
break
|
|
675
713
|
elif pos == 'IFB':
|
|
676
714
|
if not re.search(regex, bofbuffer):
|
|
677
|
-
success = False
|
|
715
|
+
success = False
|
|
678
716
|
break
|
|
679
717
|
if success:
|
|
680
|
-
result.append((format, sig))
|
|
681
|
-
# check if file needs to be parsed with container signature
|
|
682
|
-
# we skip files with extension "zip" (x-fmt/263)
|
|
683
|
-
ext = os.path.splitext(self.current_file)[1].lower().lstrip(".")
|
|
684
|
-
if format.find('puid').text in self.puidTriggers and ext != "zip":
|
|
685
|
-
container_result = self.read_container(bofbuffer,result)
|
|
686
|
-
if len(container_result) != 0:
|
|
687
|
-
for (k,v) in container_result:
|
|
688
|
-
result.append((k,v))
|
|
689
|
-
break
|
|
718
|
+
result.append((format, sig.findtext("name")))
|
|
690
719
|
except Exception as e:
|
|
691
|
-
sys.stderr.write(str(e)+"\n")
|
|
720
|
+
sys.stderr.write(str(e) + "\n")
|
|
692
721
|
continue
|
|
693
722
|
# TODO: MdR: needs some <3
|
|
694
|
-
#print "Unexpected error:", sys.exc_info()[0], e
|
|
695
|
-
#sys.stdout.write('***', self.get_puid(format), regex)
|
|
696
|
-
|
|
723
|
+
# print "Unexpected error:", sys.exc_info()[0], e
|
|
724
|
+
# sys.stdout.write('***', self.get_puid(format), regex)
|
|
725
|
+
|
|
697
726
|
# t1 = time.clock()
|
|
698
727
|
# if t1 - t0 > 0.02:
|
|
699
728
|
# print >> sys.stderr, "FIDO: Slow ID", self.current_file
|
|
700
729
|
result = [match for match in result if self.as_good_as_any(match[0], result)]
|
|
701
|
-
result = list(set(result)) # remove duplicate results, this is due to ??? in self.read_container(), needs fix
|
|
702
730
|
return result
|
|
703
|
-
|
|
731
|
+
|
|
704
732
|
def match_extensions(self, filename):
|
|
705
|
-
"
|
|
733
|
+
"""
|
|
734
|
+
Return the list of (format, self.externalsig) for every format whose extension matches the filename.
|
|
735
|
+
"""
|
|
706
736
|
myext = os.path.splitext(filename)[1].lower().lstrip(".")
|
|
707
737
|
result = []
|
|
708
|
-
if
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
738
|
+
if not myext:
|
|
739
|
+
return result
|
|
740
|
+
for element in self.formats:
|
|
741
|
+
for format_ in element.findall('extension'):
|
|
742
|
+
if myext == format_.text:
|
|
743
|
+
result.append((element, self.externalsig.findtext("name")))
|
|
744
|
+
break
|
|
715
745
|
result = [match for match in result if self.as_good_as_any(match[0], result)]
|
|
716
746
|
return result
|
|
717
|
-
|
|
747
|
+
|
|
718
748
|
def copy_stream(self, source, target):
|
|
719
749
|
while True:
|
|
720
750
|
buf = source.read(self.bufsize)
|
|
721
751
|
if len(buf) == 0:
|
|
722
752
|
break
|
|
723
753
|
target.write(buf)
|
|
724
|
-
|
|
754
|
+
|
|
755
|
+
|
|
725
756
|
def list_files(roots, recurse=False):
|
|
726
|
-
"
|
|
757
|
+
"""
|
|
758
|
+
Return the files one at a time. Roots could be a fileobj or a list.
|
|
759
|
+
"""
|
|
727
760
|
for root in roots:
|
|
728
761
|
root = (root if root[-1] != '\n' else root[:-1])
|
|
729
762
|
root = os.path.normpath(root)
|
|
@@ -733,17 +766,14 @@ def list_files(roots, recurse=False):
|
|
|
733
766
|
for path, unused, files in os.walk(root):
|
|
734
767
|
for f in files:
|
|
735
768
|
yield os.path.join(path, f)
|
|
736
|
-
if recurse
|
|
769
|
+
if not recurse:
|
|
737
770
|
break
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
arglist = sys.argv[1:]
|
|
745
|
-
if len(arglist) == False:
|
|
746
|
-
arglist.append("-h")
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
def main(args=None):
|
|
774
|
+
if not args:
|
|
775
|
+
args = sys.argv[1:]
|
|
776
|
+
|
|
747
777
|
parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
|
|
748
778
|
parser.add_argument('-v', default=False, action='store_true', help='show version information')
|
|
749
779
|
parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
|
|
@@ -751,87 +781,86 @@ def main(arglist=None):
|
|
|
751
781
|
parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
|
|
752
782
|
parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
|
|
753
783
|
parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')
|
|
784
|
+
|
|
754
785
|
group = parser.add_mutually_exclusive_group()
|
|
755
786
|
group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
|
|
756
787
|
group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')
|
|
788
|
+
|
|
757
789
|
parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
|
|
758
790
|
parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
|
|
759
791
|
parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
|
|
760
792
|
parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
|
|
761
793
|
parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
|
|
762
|
-
parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['bufsize'])+' bytes)')
|
|
763
|
-
parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['container_bufsize'])+' bytes)')
|
|
764
|
-
|
|
794
|
+
parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['bufsize']) + ' bytes)')
|
|
795
|
+
parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['container_bufsize']) + ' bytes)')
|
|
765
796
|
parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
|
|
766
|
-
parser.add_argument('-confdir', default=
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
try:
|
|
783
|
-
versions = VET.parse(versionsFile)
|
|
784
|
-
except Exception, e:
|
|
785
|
-
sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
|
|
786
|
-
sys.exit()
|
|
787
|
-
defaults['xml_pronomSignature'] = versions.find("pronomSignature").text
|
|
788
|
-
# defaults['xml_pronomContainerSignature'] = versions.find("pronomContainerSignature").text
|
|
789
|
-
defaults['containersignature_file'] = versions.find("pronomContainerSignature").text
|
|
790
|
-
defaults['xml_fidoExtensionSignature'] = versions.find("fidoExtensionSignature").text
|
|
791
|
-
defaults['format_files'] = []
|
|
792
|
-
defaults['format_files'].append(defaults['xml_pronomSignature'])
|
|
797
|
+
parser.add_argument('-confdir', default=CONFIG_DIR, help='configuration directory to load_fido_xml, for example, the format specifications from.')
|
|
798
|
+
|
|
799
|
+
if len(sys.argv) == 1:
|
|
800
|
+
parser.print_help()
|
|
801
|
+
sys.exit(1)
|
|
802
|
+
args = parser.parse_args(args)
|
|
803
|
+
|
|
804
|
+
t0 = time.clock()
|
|
805
|
+
|
|
806
|
+
versions = get_local_pronom_versions(args.confdir)
|
|
807
|
+
|
|
808
|
+
defaults['xml_pronomSignature'] = versions.pronom_signature
|
|
809
|
+
defaults['containersignature_file'] = versions.pronom_container_signature
|
|
810
|
+
defaults['xml_fidoExtensionSignature'] = versions.fido_extension_signature
|
|
811
|
+
defaults['format_files'] = [defaults['xml_pronomSignature']]
|
|
812
|
+
|
|
793
813
|
if args.pronom_only:
|
|
794
|
-
versionHeader = "FIDO v{0} ({1}, {2})\n".format(
|
|
814
|
+
versionHeader = "FIDO v{0} ({1}, {2})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'])
|
|
795
815
|
else:
|
|
796
|
-
versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(
|
|
816
|
+
versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'], defaults['xml_fidoExtensionSignature'])
|
|
797
817
|
defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])
|
|
798
|
-
|
|
799
|
-
if args.v
|
|
818
|
+
|
|
819
|
+
if args.v:
|
|
800
820
|
sys.stdout.write(versionHeader)
|
|
801
821
|
sys.exit(0)
|
|
802
|
-
|
|
822
|
+
|
|
823
|
+
if args.matchprintf:
|
|
803
824
|
args.matchprintf = args.matchprintf.decode('string_escape')
|
|
804
|
-
if args.nomatchprintf
|
|
825
|
+
if args.nomatchprintf:
|
|
805
826
|
args.nomatchprintf = args.nomatchprintf.decode('string_escape')
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
827
|
+
|
|
828
|
+
fido = Fido(
|
|
829
|
+
quiet=args.q,
|
|
830
|
+
bufsize=args.bufsize,
|
|
831
|
+
container_bufsize=args.container_bufsize,
|
|
832
|
+
printmatch=args.matchprintf,
|
|
833
|
+
printnomatch=args.nomatchprintf,
|
|
834
|
+
zip=args.zip,
|
|
835
|
+
nocontainer=args.nocontainer,
|
|
836
|
+
conf_dir=args.confdir)
|
|
837
|
+
|
|
838
|
+
# TODO: Allow conf options to be dis-included
|
|
810
839
|
if args.loadformats:
|
|
811
840
|
for file in args.loadformats.split(','):
|
|
812
841
|
fido.load_fido_xml(file)
|
|
813
|
-
|
|
814
|
-
#TODO: remove from maps
|
|
842
|
+
|
|
843
|
+
# TODO: remove from maps
|
|
815
844
|
if args.useformats:
|
|
816
845
|
args.useformats = args.useformats.split(',')
|
|
817
846
|
fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
|
|
818
847
|
elif args.nouseformats:
|
|
819
848
|
args.nouseformats = args.nouseformats.split(',')
|
|
820
849
|
fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]
|
|
821
|
-
|
|
850
|
+
|
|
822
851
|
# Set up to use stdin, or open input files:
|
|
823
852
|
if args.input == '-':
|
|
824
853
|
args.files = sys.stdin
|
|
825
854
|
elif args.input:
|
|
826
855
|
args.files = open(args.input, 'r')
|
|
827
|
-
|
|
856
|
+
|
|
828
857
|
# RUN
|
|
829
858
|
try:
|
|
830
859
|
if not args.q:
|
|
831
860
|
sys.stderr.write(versionHeader)
|
|
832
861
|
sys.stderr.flush()
|
|
833
862
|
if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
|
|
834
|
-
if fido.zip
|
|
863
|
+
if fido.zip:
|
|
835
864
|
raise RuntimeError("Multiple content read from stdin not yet supported.")
|
|
836
865
|
sys.exit(1)
|
|
837
866
|
fido.identify_multi_object_stream(sys.stdin)
|
|
@@ -844,11 +873,12 @@ def main(arglist=None):
|
|
|
844
873
|
msg = "FIDO: Interrupt while identifying file {0}"
|
|
845
874
|
sys.stderr.write(msg.format(fido.current_file))
|
|
846
875
|
sys.exit(1)
|
|
847
|
-
|
|
876
|
+
|
|
848
877
|
if not args.q:
|
|
849
878
|
sys.stdout.flush()
|
|
850
879
|
fido.print_summary(time.clock() - t0)
|
|
851
880
|
sys.stderr.flush()
|
|
852
881
|
|
|
882
|
+
|
|
853
883
|
if __name__ == '__main__':
|
|
854
884
|
main()
|