libis-format 0.9.32 → 0.9.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data/types.yml +30 -16
- data/lib/libis/format/config.rb +7 -18
- data/lib/libis/format/converter/image_converter.rb +6 -0
- data/lib/libis/format/droid.rb +82 -25
- data/lib/libis/format/extension_identification.rb +55 -0
- data/lib/libis/format/fido.rb +57 -72
- data/lib/libis/format/file_tool.rb +76 -0
- data/lib/libis/format/identification_tool.rb +174 -0
- data/lib/libis/format/identifier.rb +129 -117
- data/lib/libis/format/type_database.rb +36 -5
- data/lib/libis/format/version.rb +1 -1
- data/lib/libis/format.rb +3 -0
- data/libis-format.gemspec +2 -1
- data/spec/converter_spec.rb +6 -4
- data/spec/identifier_spec.rb +125 -34
- metadata +21 -126
- data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
- data/tools/droid/container-signature-20170330.xml +0 -3584
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +0 -152
- data/tools/droid/droid.sh +0 -152
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-help-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -50
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
- data/tools/fido/conf/container-signature-20170330.xml +0 -3584
- data/tools/fido/conf/dc.xsd +0 -119
- data/tools/fido/conf/dcmitype.xsd +0 -53
- data/tools/fido/conf/dcterms.xsd +0 -383
- data/tools/fido/conf/fido-formats.xsd +0 -173
- data/tools/fido/conf/format_extension_template.xml +0 -105
- data/tools/fido/conf/format_extensions.xml +0 -484
- data/tools/fido/conf/formats-v90.xml +0 -48877
- data/tools/fido/conf/pronom-xml-v90.zip +0 -0
- data/tools/fido/conf/versions.xml +0 -8
- data/tools/fido/fido.bat +0 -4
- data/tools/fido/fido.py +0 -884
- data/tools/fido/fido.sh +0 -5
- data/tools/fido/package.py +0 -96
- data/tools/fido/prepare.py +0 -645
- data/tools/fido/pronomutils.py +0 -200
- data/tools/fido/toxml.py +0 -60
- data/tools/fido/update_signatures.py +0 -183
data/tools/fido/fido.py
DELETED
|
@@ -1,884 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
"""
|
|
5
|
-
Format Identification for Digital Objects (FIDO).
|
|
6
|
-
|
|
7
|
-
FIDO is a command-line tool to identify the file formats of digital objects.
|
|
8
|
-
It is designed for simple integration into automated work-flows.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
from __future__ import absolute_import
|
|
12
|
-
|
|
13
|
-
from argparse import ArgumentParser, RawTextHelpFormatter
|
|
14
|
-
from contextlib import closing
|
|
15
|
-
import os
|
|
16
|
-
import re
|
|
17
|
-
import sys
|
|
18
|
-
import tarfile
|
|
19
|
-
import tempfile
|
|
20
|
-
import time
|
|
21
|
-
from xml.etree import cElementTree as ET
|
|
22
|
-
from xml.etree import ElementTree as CET
|
|
23
|
-
import zipfile
|
|
24
|
-
|
|
25
|
-
from six.moves import range
|
|
26
|
-
|
|
27
|
-
from . import __version__, CONFIG_DIR
|
|
28
|
-
from .package import OlePackage, ZipPackage
|
|
29
|
-
from .pronomutils import get_local_pronom_versions
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
defaults = {
|
|
33
|
-
'bufsize': 128 * 1024, # (bytes)
|
|
34
|
-
'regexcachesize': 2084, # (bytes)
|
|
35
|
-
'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
|
|
36
|
-
'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
|
|
37
|
-
'format_files': [
|
|
38
|
-
'formats-v88.xml',
|
|
39
|
-
'format_extensions.xml'
|
|
40
|
-
],
|
|
41
|
-
'containersignature_file': 'container-signature-20170330.xml',
|
|
42
|
-
'container_bufsize': 512 * 1024, # (bytes)
|
|
43
|
-
'description': """Format Identification for Digital Objects (fido).
|
|
44
|
-
FIDO is a command-line tool to identify the file formats of digital objects.
|
|
45
|
-
It is designed for simple integration into automated work-flows.""",
|
|
46
|
-
'epilog': """
|
|
47
|
-
Open Planets Foundation (http://www.openplanetsfoundation.org)
|
|
48
|
-
See License.txt for license information.
|
|
49
|
-
Download from: https://github.com/openplanets/fido/releases
|
|
50
|
-
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
|
|
51
|
-
Author: Adam Farquhar (BL), 2010
|
|
52
|
-
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
|
|
53
|
-
FIDO uses the UK National Archives (TNA) PRONOM File Format
|
|
54
|
-
and Container descriptions.
|
|
55
|
-
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/""",
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
class Fido:
|
|
60
|
-
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=CONFIG_DIR, format_files=None, containersignature_file=None):
|
|
61
|
-
global defaults
|
|
62
|
-
self.quiet = quiet
|
|
63
|
-
self.bufsize = defaults['bufsize'] if bufsize is None else bufsize
|
|
64
|
-
self.container_bufsize = defaults['container_bufsize'] if container_bufsize is None else container_bufsize
|
|
65
|
-
self.printmatch = defaults['printmatch'] if printmatch is None else printmatch
|
|
66
|
-
self.printnomatch = defaults['printnomatch'] if printnomatch is None else printnomatch
|
|
67
|
-
self.handle_matches = self.print_matches if handle_matches is None else handle_matches
|
|
68
|
-
self.zip = zip
|
|
69
|
-
self.nocontainer = nocontainer
|
|
70
|
-
self.conf_dir = conf_dir
|
|
71
|
-
self.format_files = defaults['format_files'] if format_files is None else format_files
|
|
72
|
-
self.containersignature_file = defaults['containersignature_file']
|
|
73
|
-
self.formats = []
|
|
74
|
-
self.puid_format_map = {}
|
|
75
|
-
self.puid_has_priority_over_map = {}
|
|
76
|
-
# load signatures
|
|
77
|
-
for xml_file in self.format_files:
|
|
78
|
-
self.load_fido_xml(os.path.join(os.path.abspath(self.conf_dir), xml_file))
|
|
79
|
-
self.load_container_signature(os.path.join(os.path.abspath(self.conf_dir), self.containersignature_file))
|
|
80
|
-
self.current_file = ''
|
|
81
|
-
self.current_filesize = 0
|
|
82
|
-
self.current_format = None
|
|
83
|
-
self.current_sig = None
|
|
84
|
-
self.current_pat = None
|
|
85
|
-
self.current_count = 0 # Count of calls to match_formats
|
|
86
|
-
re._MAXCACHE = defaults['regexcachesize']
|
|
87
|
-
self.externalsig = ET.XML('<signature><name>External</name></signature>')
|
|
88
|
-
|
|
89
|
-
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
|
90
|
-
_special = '$()*+.?![]^\\{|}' # Before: '$*+.?![]^\\{|}'
|
|
91
|
-
_hex = '0123456789abcdef'
|
|
92
|
-
|
|
93
|
-
def _escape_char(self, c):
|
|
94
|
-
if c in '\n':
|
|
95
|
-
return '\\n'
|
|
96
|
-
elif c == '\r':
|
|
97
|
-
return '\\r'
|
|
98
|
-
elif c in self._special:
|
|
99
|
-
return '\\' + c
|
|
100
|
-
else:
|
|
101
|
-
(high, low) = divmod(ord(c), 16)
|
|
102
|
-
return '\\x' + self._hex[high] + self._hex[low]
|
|
103
|
-
|
|
104
|
-
def escape(self, string):
|
|
105
|
-
"""
|
|
106
|
-
Escape characters in pattern that are non-printable, non-ascii, or
|
|
107
|
-
special for regexes.
|
|
108
|
-
"""
|
|
109
|
-
escaped = ''.join(c if c in self._ordinary else self._escape_char(c) for c in string)
|
|
110
|
-
return escaped
|
|
111
|
-
|
|
112
|
-
def convert_container_sequence(self, sig):
|
|
113
|
-
"""
|
|
114
|
-
Parse the PRONOM container sequences and convert to regular
|
|
115
|
-
expressions.
|
|
116
|
-
"""
|
|
117
|
-
# The sequence is regex matching bytes from a file so the sequence must also be bytes
|
|
118
|
-
seq = b'(?s)'
|
|
119
|
-
inq = False
|
|
120
|
-
byt = False
|
|
121
|
-
rng = False
|
|
122
|
-
ror = False
|
|
123
|
-
for i in range(len(sig)):
|
|
124
|
-
if not inq and not rng:
|
|
125
|
-
if sig[i] == "'":
|
|
126
|
-
inq = True
|
|
127
|
-
continue
|
|
128
|
-
if sig[i] == " ":
|
|
129
|
-
continue
|
|
130
|
-
if sig[i] == "[":
|
|
131
|
-
seq += b"("
|
|
132
|
-
rng = True
|
|
133
|
-
continue
|
|
134
|
-
if not byt:
|
|
135
|
-
seq += b"\\x" + sig[i].lower().encode('utf8')
|
|
136
|
-
byt = True
|
|
137
|
-
continue
|
|
138
|
-
if byt:
|
|
139
|
-
seq += sig[i].lower().encode('utf8')
|
|
140
|
-
byt = False
|
|
141
|
-
continue
|
|
142
|
-
if inq:
|
|
143
|
-
if sig[i] == "'" and not rng:
|
|
144
|
-
inq = False
|
|
145
|
-
continue
|
|
146
|
-
seq += self.escape(sig[i]).encode('utf8')
|
|
147
|
-
continue
|
|
148
|
-
if rng:
|
|
149
|
-
if sig[i] == "]":
|
|
150
|
-
seq += b")"
|
|
151
|
-
rng = False
|
|
152
|
-
continue
|
|
153
|
-
if sig[i] != "-" and sig[i] != "'" and ror:
|
|
154
|
-
seq += self.escape(sig[i]).encode('utf8')
|
|
155
|
-
continue
|
|
156
|
-
if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and sig[i] != ":" and not ror and not byt:
|
|
157
|
-
seq += b"\\x" + sig[i].lower().encode('utf8')
|
|
158
|
-
byt = True
|
|
159
|
-
continue
|
|
160
|
-
if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and byt:
|
|
161
|
-
seq += sig[i].lower().encode('utf8')
|
|
162
|
-
byt = False
|
|
163
|
-
continue
|
|
164
|
-
if sig[i] == "-" or sig[i] == " ":
|
|
165
|
-
seq += b"|"
|
|
166
|
-
continue
|
|
167
|
-
if sig[i] == "'" and not ror:
|
|
168
|
-
ror = True
|
|
169
|
-
continue
|
|
170
|
-
if sig[i] == "'" and ror:
|
|
171
|
-
ror = False
|
|
172
|
-
continue
|
|
173
|
-
|
|
174
|
-
return seq
|
|
175
|
-
|
|
176
|
-
def load_container_signature(self, containersignature_file):
|
|
177
|
-
"""
|
|
178
|
-
Load the PRONOM container-signature file and convert sequences to
|
|
179
|
-
regular expressions.
|
|
180
|
-
"""
|
|
181
|
-
tree = CET.parse(containersignature_file)
|
|
182
|
-
# load and have container signatures converted
|
|
183
|
-
self.sequenceSignature = {}
|
|
184
|
-
for signature in tree.getroot().findall('ContainerSignatures/ContainerSignature'):
|
|
185
|
-
signatureId = signature.get('Id')
|
|
186
|
-
signatureSequence = signature.findall('Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence')
|
|
187
|
-
self.sequenceSignature[signatureId] = []
|
|
188
|
-
for sequence in signatureSequence:
|
|
189
|
-
self.sequenceSignature[signatureId].append(self.convert_container_sequence(sequence[0].text))
|
|
190
|
-
# map PUID to container signatureId
|
|
191
|
-
self.puidMapping = {}
|
|
192
|
-
mappings = tree.find('FileFormatMappings')
|
|
193
|
-
for mapping in mappings.findall('FileFormatMapping'):
|
|
194
|
-
if mapping.get('signatureId') not in self.puidMapping:
|
|
195
|
-
self.puidMapping[mapping.get('signatureId')] = []
|
|
196
|
-
self.puidMapping[mapping.get('signatureId')].append(mapping.get('Puid'))
|
|
197
|
-
# print "sequences:\n",self.sequenceSignature
|
|
198
|
-
# print "mapping:\n",self.puidMapping
|
|
199
|
-
# exit()
|
|
200
|
-
|
|
201
|
-
def extract_signatures(self, doc, signature_type="ZIP"):
|
|
202
|
-
"""
|
|
203
|
-
Given an XML container signature file, returns a dictionary of signatures.
|
|
204
|
-
|
|
205
|
-
The format of the dictionary is:
|
|
206
|
-
|
|
207
|
-
{
|
|
208
|
-
path_to_file_inside_zip: {puid: [signatures]}
|
|
209
|
-
}
|
|
210
|
-
"""
|
|
211
|
-
root = doc.getroot()
|
|
212
|
-
format_mappings = root.find("FileFormatMappings")
|
|
213
|
-
|
|
214
|
-
def get_puid(doc, element_id):
|
|
215
|
-
return format_mappings.find('FileFormatMapping[@signatureId="{}"]'.format(element_id)).attrib["Puid"]
|
|
216
|
-
|
|
217
|
-
def format_signature_attributes(element):
|
|
218
|
-
return {
|
|
219
|
-
"path": element.findtext("Files/File/Path"),
|
|
220
|
-
"id": element.attrib["Id"],
|
|
221
|
-
"signature": self.convert_container_sequence(element.findtext("Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence/Sequence"))
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
elements = root.findall("ContainerSignatures/ContainerSignature[@ContainerType=\"{}\"]".format(signature_type))
|
|
225
|
-
signatures = {}
|
|
226
|
-
for el in elements:
|
|
227
|
-
if el.find("Files/File/BinarySignatures") is None:
|
|
228
|
-
continue
|
|
229
|
-
|
|
230
|
-
puid = get_puid(doc, el.attrib["Id"])
|
|
231
|
-
signature = format_signature_attributes(el)
|
|
232
|
-
path = signature["path"]
|
|
233
|
-
if path not in signatures:
|
|
234
|
-
signatures[path] = {}
|
|
235
|
-
if puid not in signatures[path]:
|
|
236
|
-
signatures[path][puid] = []
|
|
237
|
-
signatures[path][puid].append(format_signature_attributes(el))
|
|
238
|
-
return signatures
|
|
239
|
-
|
|
240
|
-
def match_container(self, signature_type, klass, file, signature_file):
|
|
241
|
-
puids = klass(file, self.extract_signatures(signature_file, signature_type=signature_type)).detect_formats()
|
|
242
|
-
results = []
|
|
243
|
-
for puid in puids:
|
|
244
|
-
format = self.puid_format_map[puid]
|
|
245
|
-
signature = format.findtext("name")
|
|
246
|
-
results.append((format, signature))
|
|
247
|
-
return results
|
|
248
|
-
|
|
249
|
-
def load_fido_xml(self, file):
|
|
250
|
-
"""
|
|
251
|
-
Load the fido format information from @param file.
|
|
252
|
-
As a side-effect, set self.formats.
|
|
253
|
-
@return list of ElementTree.Element, one for each format.
|
|
254
|
-
"""
|
|
255
|
-
tree = ET.parse(file)
|
|
256
|
-
# print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
|
|
257
|
-
# TODO: Handle empty regexes properly; perhaps remove from the format list
|
|
258
|
-
for element in tree.getroot().findall('./format'):
|
|
259
|
-
puid = self.get_puid(element)
|
|
260
|
-
# Handle over-writes in multiple file loads
|
|
261
|
-
existing = self.puid_format_map.get(puid, False)
|
|
262
|
-
if existing:
|
|
263
|
-
# Already have one, so replace old with new!
|
|
264
|
-
self.formats[self.formats.index(existing)] = element
|
|
265
|
-
else:
|
|
266
|
-
self.formats.append(element)
|
|
267
|
-
self.puid_format_map[puid] = element
|
|
268
|
-
# Build some structures to speed things up
|
|
269
|
-
self.puid_has_priority_over_map[puid] = frozenset([puid_element.text for puid_element in element.findall('has_priority_over')])
|
|
270
|
-
return self.formats
|
|
271
|
-
|
|
272
|
-
# To delete a format: (1) remove from self.formats, (2) remove from puid_format_map, (3) remove from selt.puid_has_priority_over_map
|
|
273
|
-
def get_signatures(self, format):
|
|
274
|
-
return format.findall('signature')
|
|
275
|
-
|
|
276
|
-
def has_priority_over(self, format, possibly_inferior):
|
|
277
|
-
return self.get_puid(possibly_inferior)in self.puid_has_priority_over_map[self.get_puid(format)]
|
|
278
|
-
|
|
279
|
-
def get_puid(self, format):
|
|
280
|
-
return format.find('puid').text
|
|
281
|
-
|
|
282
|
-
def get_patterns(self, signature):
|
|
283
|
-
return signature.findall('pattern')
|
|
284
|
-
|
|
285
|
-
def get_pos(self, pat):
|
|
286
|
-
return pat.find('position').text
|
|
287
|
-
|
|
288
|
-
def get_regex(self, pat):
|
|
289
|
-
# The regex is matching bytes from a file so regex must also be bytes
|
|
290
|
-
return pat.find('regex').text.encode('utf8')
|
|
291
|
-
|
|
292
|
-
def get_extension(self, format):
|
|
293
|
-
return format.find('extension').text
|
|
294
|
-
|
|
295
|
-
def print_matches(self, fullname, matches, delta_t, matchtype=''):
|
|
296
|
-
"""
|
|
297
|
-
The default match handler. Prints out information for each match in the list.
|
|
298
|
-
@param fullname is name of the file being matched
|
|
299
|
-
@param matches is a list of (format, signature)
|
|
300
|
-
@param delta_t is the time taken for the match.
|
|
301
|
-
@param matchtype is the type of match (signature, containersignature, extension, fail)
|
|
302
|
-
"""
|
|
303
|
-
class Info:
|
|
304
|
-
pass
|
|
305
|
-
obj = Info()
|
|
306
|
-
obj.count = self.current_count
|
|
307
|
-
obj.group_size = len(matches)
|
|
308
|
-
obj.filename = fullname
|
|
309
|
-
obj.time = int(delta_t * 1000)
|
|
310
|
-
obj.filesize = self.current_filesize
|
|
311
|
-
obj.matchtype = matchtype
|
|
312
|
-
if len(matches) == 0:
|
|
313
|
-
sys.stdout.write(self.printnomatch % {
|
|
314
|
-
"info.time": obj.time,
|
|
315
|
-
"info.filesize": obj.filesize,
|
|
316
|
-
"info.filename": obj.filename,
|
|
317
|
-
"info.count": obj.count,
|
|
318
|
-
"info.matchtype": "fail"
|
|
319
|
-
})
|
|
320
|
-
return
|
|
321
|
-
i = 0
|
|
322
|
-
for (f, sig_name) in matches:
|
|
323
|
-
i += 1
|
|
324
|
-
obj.group_index = i
|
|
325
|
-
obj.puid = self.get_puid(f)
|
|
326
|
-
obj.formatname = f.find('name').text
|
|
327
|
-
obj.signaturename = sig_name
|
|
328
|
-
mime = f.find('mime')
|
|
329
|
-
obj.mimetype = mime.text if mime is not None else None
|
|
330
|
-
version = f.find('version')
|
|
331
|
-
obj.version = version.text if version is not None else None
|
|
332
|
-
alias = f.find('alias')
|
|
333
|
-
obj.alias = alias.text if alias is not None else None
|
|
334
|
-
apple_uti = f.find('apple_uid')
|
|
335
|
-
obj.apple_uti = apple_uti.text if apple_uti is not None else None
|
|
336
|
-
sys.stdout.write(self.printmatch % {
|
|
337
|
-
"info.time": obj.time,
|
|
338
|
-
"info.puid": obj.puid,
|
|
339
|
-
"info.formatname": obj.formatname,
|
|
340
|
-
"info.signaturename": obj.signaturename,
|
|
341
|
-
"info.filesize": obj.filesize,
|
|
342
|
-
"info.filename": obj.filename,
|
|
343
|
-
"info.mimetype": obj.mimetype,
|
|
344
|
-
"info.matchtype": obj.matchtype,
|
|
345
|
-
"info.version": obj.version,
|
|
346
|
-
"info.alias": obj.alias,
|
|
347
|
-
"info.apple_uti": obj.apple_uti,
|
|
348
|
-
"info.group_size": obj.group_size,
|
|
349
|
-
"info.group_index": obj.group_index,
|
|
350
|
-
"info.count": obj.count
|
|
351
|
-
})
|
|
352
|
-
|
|
353
|
-
def print_summary(self, secs):
|
|
354
|
-
"""
|
|
355
|
-
Print summary information on the number of matches and time taken.
|
|
356
|
-
"""
|
|
357
|
-
count = self.current_count
|
|
358
|
-
if not self.quiet:
|
|
359
|
-
rate = (int(round(count / secs)) if secs != 0 else 9999)
|
|
360
|
-
# print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
|
|
361
|
-
sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
|
|
362
|
-
|
|
363
|
-
def identify_file(self, filename):
|
|
364
|
-
"""
|
|
365
|
-
Identify the type of @param filename.
|
|
366
|
-
Call self.handle_matches instead of returning a value.
|
|
367
|
-
"""
|
|
368
|
-
self.current_file = filename
|
|
369
|
-
self.matchtype = "signature"
|
|
370
|
-
try:
|
|
371
|
-
t0 = time.clock()
|
|
372
|
-
f = open(filename, 'rb')
|
|
373
|
-
size = os.stat(filename)[6]
|
|
374
|
-
self.current_filesize = size
|
|
375
|
-
if self.current_filesize == 0:
|
|
376
|
-
sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + filename + "\n")
|
|
377
|
-
bofbuffer, eofbuffer, _ = self.get_buffers(f, size, seekable=True)
|
|
378
|
-
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
379
|
-
container_type = self.container_type(matches)
|
|
380
|
-
if container_type in ("zip", "ole"):
|
|
381
|
-
container_file = ET.parse(os.path.join(os.path.abspath(self.conf_dir), self.containersignature_file))
|
|
382
|
-
if container_type == "zip":
|
|
383
|
-
container_matches = self.match_container("ZIP", ZipPackage, filename, container_file)
|
|
384
|
-
else:
|
|
385
|
-
container_matches = self.match_container("OLE2", OlePackage, filename, container_file)
|
|
386
|
-
if len(container_matches) > 0:
|
|
387
|
-
self.handle_matches(filename, container_matches, time.clock() - t0, "container")
|
|
388
|
-
return
|
|
389
|
-
# from here is also repeated in walk_zip
|
|
390
|
-
# we should make this uniform in a next version!
|
|
391
|
-
#
|
|
392
|
-
# filesize is made conditional because files with 0 bytes
|
|
393
|
-
# are falsely characterised being 'rtf' (due to wacky sig)
|
|
394
|
-
# in these cases we try to match the extension instead
|
|
395
|
-
if len(matches) > 0 and self.current_filesize > 0:
|
|
396
|
-
self.handle_matches(filename, matches, time.clock() - t0, self.matchtype)
|
|
397
|
-
elif len(matches) == 0 or self.current_filesize == 0:
|
|
398
|
-
matches = self.match_extensions(filename)
|
|
399
|
-
self.handle_matches(filename, matches, time.clock() - t0, "extension")
|
|
400
|
-
# only recurse into certain containers, like ZIP or TAR
|
|
401
|
-
container = self.container_type(matches)
|
|
402
|
-
# till here matey!
|
|
403
|
-
if self.zip and self.can_recurse_into_container(container):
|
|
404
|
-
self.identify_contents(filename, type=container)
|
|
405
|
-
except IOError:
|
|
406
|
-
# print >> sys.stderr, "FIDO: Error in identify_file: Path is {0}".format(filename)
|
|
407
|
-
sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
|
|
408
|
-
|
|
409
|
-
def identify_contents(self, filename, fileobj=None, type=False):
|
|
410
|
-
"""
|
|
411
|
-
Identify each item in a container (such as a zip or tar file). Call
|
|
412
|
-
self.handle_matches on each item.
|
|
413
|
-
@param fileobj could be a file, or a stream.
|
|
414
|
-
"""
|
|
415
|
-
if not type:
|
|
416
|
-
return
|
|
417
|
-
elif type == 'zip':
|
|
418
|
-
self.walk_zip(filename, fileobj)
|
|
419
|
-
elif type == 'tar':
|
|
420
|
-
self.walk_tar(filename, fileobj)
|
|
421
|
-
else: # TODO: ouch!
|
|
422
|
-
raise RuntimeError("Unknown container type: " + repr(type))
|
|
423
|
-
|
|
424
|
-
def identify_multi_object_stream(self, stream):
|
|
425
|
-
"""
|
|
426
|
-
Does not work!
|
|
427
|
-
Stream may contain one or more objects each with an HTTP style header
|
|
428
|
-
that must include content-length. The headers consist of keyword:value
|
|
429
|
-
pairs terminated by a newline. There must be a newline following the
|
|
430
|
-
headers.
|
|
431
|
-
"""
|
|
432
|
-
offset = 0
|
|
433
|
-
while True:
|
|
434
|
-
t0 = time.clock()
|
|
435
|
-
content_length = -1
|
|
436
|
-
for line in stream:
|
|
437
|
-
offset += len(line)
|
|
438
|
-
if line == '\n':
|
|
439
|
-
if content_length < 0:
|
|
440
|
-
raise EnvironmentError("No content-length provided.")
|
|
441
|
-
else:
|
|
442
|
-
break
|
|
443
|
-
pair = line.lower().split(':', 2)
|
|
444
|
-
if pair[0] == 'content-length':
|
|
445
|
-
content_length = int(pair[1])
|
|
446
|
-
if content_length == -1:
|
|
447
|
-
return
|
|
448
|
-
# Consume exactly content-length bytes
|
|
449
|
-
self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
|
|
450
|
-
self.current_filesize = content_length
|
|
451
|
-
bofbuffer, eofbuffer, _ = self.get_buffers(stream, content_length)
|
|
452
|
-
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
453
|
-
# MdR: this needs attention
|
|
454
|
-
if len(matches) > 0:
|
|
455
|
-
self.handle_matches(self.current_file, matches, time.clock() - t0, "signature")
|
|
456
|
-
elif len(matches) == 0 or self.current_filesize == 0:
|
|
457
|
-
matches = self.match_extensions(self.current_file)
|
|
458
|
-
self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
|
|
459
|
-
|
|
460
|
-
def identify_stream(self, stream, filename):
|
|
461
|
-
"""
|
|
462
|
-
Identify the type of @param stream.
|
|
463
|
-
Call self.handle_matches instead of returning a value.
|
|
464
|
-
Does not close stream.
|
|
465
|
-
"""
|
|
466
|
-
t0 = time.clock()
|
|
467
|
-
bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
|
|
468
|
-
self.current_filesize = bytes_read
|
|
469
|
-
self.current_file = 'STDIN'
|
|
470
|
-
matches = self.match_formats(bofbuffer, eofbuffer)
|
|
471
|
-
# MdR: this needs attention
|
|
472
|
-
if len(matches) > 0:
|
|
473
|
-
self.handle_matches(self.current_file, matches, time.clock() - t0, "signature")
|
|
474
|
-
elif len(matches) == 0 or self.current_filesize == 0:
|
|
475
|
-
# we can only determine the filename from the STDIN stream
|
|
476
|
-
# on Linux, on Windows there is not a (simple) way to do that
|
|
477
|
-
if (os.name != "nt"):
|
|
478
|
-
try:
|
|
479
|
-
self.current_file = os.readlink("/proc/self/fd/0")
|
|
480
|
-
except:
|
|
481
|
-
if filename is not None:
|
|
482
|
-
self.current_file = filename
|
|
483
|
-
else:
|
|
484
|
-
self.current_file = 'STDIN'
|
|
485
|
-
else:
|
|
486
|
-
if filename is not None:
|
|
487
|
-
self.current_file = filename
|
|
488
|
-
matches = self.match_extensions(self.current_file)
|
|
489
|
-
# we have to reset self.current_file if not on Windows
|
|
490
|
-
if (os.name != "nt"):
|
|
491
|
-
self.current_file = 'STDIN'
|
|
492
|
-
self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
|
|
493
|
-
|
|
494
|
-
def container_type(self, matches):
|
|
495
|
-
"""
|
|
496
|
-
Determine if one of the @param matches is the format of a container
|
|
497
|
-
that we can look inside of (e.g., zip, tar).
|
|
498
|
-
@return False, zip, or tar.
|
|
499
|
-
"""
|
|
500
|
-
for (format_, unused) in matches:
|
|
501
|
-
container = format_.find('container')
|
|
502
|
-
if container is not None:
|
|
503
|
-
return container.text
|
|
504
|
-
|
|
505
|
-
# aside from checking <container> elements,
|
|
506
|
-
# check for fmt/111, which is OLE
|
|
507
|
-
puid = format_.find('puid')
|
|
508
|
-
if puid is not None and puid.text == 'fmt/111':
|
|
509
|
-
return 'ole'
|
|
510
|
-
return False
|
|
511
|
-
|
|
512
|
-
def can_recurse_into_container(self, container_type):
|
|
513
|
-
"""
|
|
514
|
-
Determine if the passed container type can:
|
|
515
|
-
a) be extracted, and
|
|
516
|
-
b) contain individual files which can be identified separately.
|
|
517
|
-
|
|
518
|
-
This function is useful for filtering out containers such as OLE,
|
|
519
|
-
which are usually most interesting as compound objects rather than
|
|
520
|
-
for their contents.
|
|
521
|
-
"""
|
|
522
|
-
return container_type in ('zip', 'tar')
|
|
523
|
-
|
|
524
|
-
def blocking_read(self, file, bytes_to_read):
|
|
525
|
-
bytes_read = 0
|
|
526
|
-
buffer = b''
|
|
527
|
-
while bytes_read < bytes_to_read:
|
|
528
|
-
readbuffer = file.read(bytes_to_read - bytes_read)
|
|
529
|
-
buffer += readbuffer
|
|
530
|
-
bytes_read = len(buffer)
|
|
531
|
-
# break out if EOF is reached.
|
|
532
|
-
if readbuffer == '':
|
|
533
|
-
break
|
|
534
|
-
return buffer
|
|
535
|
-
|
|
536
|
-
def get_buffers(self, stream, length=None, seekable=False):
    """
    Return buffers from the beginning and end of stream and the number of
    bytes read if there may be more bytes in the stream.

    If length is None, return the length as found.
    If seekable is False, the stream does not support a seek operation.
    """
    # Read at most one bufsize chunk from the front of the stream.
    bytes_to_read = self.bufsize if length is None else min(length, self.bufsize)
    bofbuffer = self.blocking_read(stream, bytes_to_read)
    bytes_read = len(bofbuffer)
    if length is None:
        # A stream with unknown length; have to keep two buffers around
        # so that when EOF is hit, the tail of the stream can be rebuilt
        # from the previous full chunk plus the final short chunk.
        prevbuffer = bofbuffer
        while True:
            buffer = self.blocking_read(stream, self.bufsize)
            bytes_read += len(buffer)
            if len(buffer) == self.bufsize:
                prevbuffer = buffer
            else:
                # Short (or empty) read means EOF: stitch the EOF buffer
                # from the end of prevbuffer plus whatever came last.
                eofbuffer = prevbuffer if len(buffer) == 0 else prevbuffer[-(self.bufsize - len(buffer)):] + buffer
                break
        return bofbuffer, eofbuffer, bytes_read
    else:
        # Known length: decide how to obtain the EOF buffer.
        bytes_unread = length - len(bofbuffer)
        if bytes_unread == 0:
            # Whole stream fit in one buffer; BOF and EOF are the same.
            eofbuffer = bofbuffer
        elif bytes_unread < self.bufsize:
            # The buffs overlap: reuse the tail of bofbuffer and read the rest.
            eofbuffer = bofbuffer[bytes_unread:] + self.blocking_read(stream, bytes_unread)
        elif bytes_unread == self.bufsize:
            eofbuffer = self.blocking_read(stream, self.bufsize)
        elif seekable:  # easy case when we can just seek!
            stream.seek(length - self.bufsize)
            eofbuffer = self.blocking_read(stream, self.bufsize)
        else:
            # We have more to read and know how much, but cannot seek:
            # discard chunks until only the last bufsize bytes remain.
            # n*bufsize + r = length
            (n, r) = divmod(bytes_unread, self.bufsize)
            # skip n-1*bufsize bytes
            for unused_i in range(1, n):
                self.blocking_read(stream, self.bufsize)
            # skip r bytes
            self.blocking_read(stream, r)
            # and read the remaining bufsize bytes into the eofbuffer
            eofbuffer = self.blocking_read(stream, self.bufsize)
        return bofbuffer, eofbuffer, bytes_to_read
def walk_zip(self, filename, fileobj=None):
    """
    Identify the type of each item in the zip
    @param fileobj. If fileobj is not provided, open.
    @param filename.
    Call self.handle_matches instead of returning a value.
    """
    try:
        with zipfile.ZipFile((fileobj if fileobj else filename), 'r') as zipstream:
            for item in zipstream.infolist():
                if item.file_size == 0:
                    continue  # TODO: Find a better test for isdir
                t0 = time.clock()
                with zipstream.open(item) as f:
                    # Container members are reported as "archive!member".
                    item_name = filename + '!' + item.filename
                    self.current_file = item_name
                    self.current_filesize = item.file_size
                    # NOTE(review): zero-size items are skipped by the
                    # continue above, so this branch looks unreachable — confirm.
                    if self.current_filesize == 0:
                        sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + item_name + "\n")
                    bofbuffer, eofbuffer, _ = self.get_buffers(f, item.file_size)
                    matches = self.match_formats(bofbuffer, eofbuffer)
                    if len(matches) > 0 and self.current_filesize > 0:
                        self.handle_matches(item_name, matches, time.clock() - t0, "signature")
                    elif len(matches) == 0 or self.current_filesize == 0:
                        # No signature match: fall back to extension matching.
                        matches = self.match_extensions(item_name)
                        self.handle_matches(item_name, matches, time.clock() - t0, "extension")
                    if self.container_type(matches):
                        # Nested container: spool it to a temp file so the
                        # contents can be identified recursively.
                        target = tempfile.SpooledTemporaryFile(prefix='Fido')
                        with zipstream.open(item) as source:
                            self.copy_stream(source, target)
                        # target.seek(0)
                        self.identify_contents(item_name, target, self.container_type(matches))
    # Both error paths deliberately emit the same message.
    except IOError:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
    except zipfile.BadZipfile:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
def walk_tar(self, filename, fileobj):
    """
    Identify the type of each item in the tar.
    @param fileobj. If fileobj is not provided, open.
    @param filename.
    Call self.handle_matches instead of returning a value.
    """
    try:
        with tarfile.TarFile(filename, fileobj=fileobj, mode='r') as tarstream:
            for item in tarstream.getmembers():
                # Skip directories, links and other non-regular members.
                if not item.isfile():
                    continue
                t0 = time.clock()
                with closing(tarstream.extractfile(item)) as f:
                    # Container members are reported as "archive!member".
                    tar_item_name = filename + '!' + item.name
                    self.current_file = tar_item_name
                    self.current_filesize = item.size
                    bofbuffer, eofbuffer, _ = self.get_buffers(f, item.size)
                    matches = self.match_formats(bofbuffer, eofbuffer)
                    # Consistency fix: pass the match type like walk_zip does;
                    # the original omitted it here.
                    self.handle_matches(tar_item_name, matches, time.clock() - t0, "signature")
                    if self.container_type(matches):
                        # Nested container: rewind and identify its contents.
                        f.seek(0)
                        self.identify_contents(tar_item_name, f, self.container_type(matches))
    except tarfile.TarError:
        sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
def as_good_as_any(self, f1, match_list):
    """
    Return True if the proposed format is as good as any in the match_list.
    For example, if there is no format in the match_list that has priority
    over the proposed one.
    """
    if not match_list:
        return True
    proposed_puid = self.get_puid(f1)
    for candidate, _ in match_list:
        # A format never outranks itself.
        if candidate == f1:
            continue
        # Some other matched format has priority over the proposed one.
        if proposed_puid in self.puid_has_priority_over_map[self.get_puid(candidate)]:
            return False
    return True
def buffered_read(self, file_pos, overlap):
    """
    Buffered read of data chunks from self.current_file.

    @param file_pos: offset to start reading at.
    @param overlap: when True, extend the chunk by self.overlap_range so
        signatures spanning a chunk boundary can still match.
    @return the bytes read (shorter than the chunk size near end of file).
    """
    buf = b''  # bug fix: binary-mode read returns bytes, not str
    if not overlap:
        bufsize = self.container_bufsize
    else:
        bufsize = self.container_bufsize + self.overlap_range
    file_end = self.current_filesize
    with open(self.current_file, 'rb') as file_handle:
        file_handle.seek(file_pos)
        if file_end - file_pos < bufsize:
            file_read = file_end - file_pos
        else:
            # Bug fix: use the chunk size computed above; the original read
            # self.bufsize here, ignoring container_bufsize/overlap_range.
            file_read = bufsize
        buf = file_handle.read(file_read)
    return buf
def match_formats(self, bofbuffer, eofbuffer):
    """
    Apply the patterns for formats to the supplied buffers.
    @return a match list of (format, signature) tuples.
    The list has inferior matches removed.
    """
    self.current_count += 1
    # t0 = time.clock()
    result = []
    for format in self.formats:
        try:
            # Track what is being matched so error reports / interrupts
            # can name the offending format, signature and pattern.
            self.current_format = format
            # Skip formats already outranked by something in result.
            if self.as_good_as_any(format, result):
                for sig in self.get_signatures(format):
                    self.current_sig = sig
                    success = True
                    # A signature matches only if every one of its
                    # patterns matches at its declared position.
                    for pat in self.get_patterns(sig):
                        self.current_pat = pat
                        pos = self.get_pos(pat)
                        regex = self.get_regex(pat)
                        # print 'trying ', regex
                        if pos == 'BOF':
                            # Anchored at beginning of file.
                            if not re.match(regex, bofbuffer):
                                success = False
                                break
                        elif pos == 'EOF':
                            # Searched within the end-of-file buffer.
                            if not re.search(regex, eofbuffer):
                                success = False
                                break
                        elif pos == 'VAR':
                            # Variable position: searched in the BOF buffer.
                            if not re.search(regex, bofbuffer):
                                success = False
                                break
                        elif pos == 'IFB':
                            if not re.search(regex, bofbuffer):
                                success = False
                                break
                    if success:
                        result.append((format, sig.findtext("name")))
        except Exception as e:
            # Best-effort: a bad signature/regex must not abort the scan.
            sys.stderr.write(str(e) + "\n")
            continue
            # TODO: MdR: needs some <3
            # print "Unexpected error:", sys.exc_info()[0], e
            # sys.stdout.write('***', self.get_puid(format), regex)

    # t1 = time.clock()
    # if t1 - t0 > 0.02:
    #     print >> sys.stderr, "FIDO: Slow ID", self.current_file
    # Final pruning pass: drop matches outranked by other matches.
    result = [match for match in result if self.as_good_as_any(match[0], result)]
    return result
def match_extensions(self, filename):
    """
    Return the list of (format, self.externalsig) for every format whose
    extension matches the filename.
    """
    extension = os.path.splitext(filename)[1].lower().lstrip(".")
    if not extension:
        return []
    # The external-signature name is the same for every hit; look it up once.
    external_name = self.externalsig.findtext("name")
    matches = []
    for format_element in self.formats:
        for ext_element in format_element.findall('extension'):
            if ext_element.text == extension:
                matches.append((format_element, external_name))
                break
    # Drop matches outranked by other matches.
    return [m for m in matches if self.as_good_as_any(m[0], matches)]
def copy_stream(self, source, target):
    """Copy source to target in self.bufsize chunks until EOF."""
    chunk = source.read(self.bufsize)
    while len(chunk) > 0:
        target.write(chunk)
        chunk = source.read(self.bufsize)
def list_files(roots, recurse=False):
    """
    Return the files one at a time. Roots could be a fileobj or a list.

    @param roots: iterable of paths; one trailing newline per entry is
        tolerated so an open file of paths works too.
    @param recurse: when True, descend into subdirectories.
    """
    for root in roots:
        # Tolerate one trailing newline (entries read from a list file).
        if root.endswith('\n'):
            root = root[:-1]
        # Robustness fix: the original indexed root[-1] and raised
        # IndexError on an empty entry (e.g. a blank line); skip it instead.
        if not root:
            continue
        root = os.path.normpath(root)
        if os.path.isfile(root):
            yield root
        else:
            for path, unused, files in os.walk(root):
                for f in files:
                    yield os.path.join(path, f)
                if not recurse:
                    break
def main(args=None):
    """Command-line entry point: parse options, build a Fido instance and run identification."""
    if not args:
        args = sys.argv[1:]

    parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-v', default=False, action='store_true', help='show version information')
    parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
    parser.add_argument('-recurse', default=False, action='store_true', help='recurse into subdirectories')
    parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
    parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
    parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')

    # A list file (-input) and positional FILEs are mutually exclusive.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
    group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')

    parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
    parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
    parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
    parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
    parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
    parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['bufsize']) + ' bytes)')
    parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['container_bufsize']) + ' bytes)')
    parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
    parser.add_argument('-confdir', default=CONFIG_DIR, help='configuration directory to load_fido_xml, for example, the format specifications from.')

    # No arguments at all: print usage and exit with an error status.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(args)

    # NOTE(review): time.clock() is Python-2 era; it was removed in
    # Python 3.8 — confirm the targeted interpreter version.
    t0 = time.clock()

    versions = get_local_pronom_versions(args.confdir)

    defaults['xml_pronomSignature'] = versions.pronom_signature
    defaults['containersignature_file'] = versions.pronom_container_signature
    defaults['xml_fidoExtensionSignature'] = versions.fido_extension_signature
    defaults['format_files'] = [defaults['xml_pronomSignature']]

    # The extension-signature file is only loaded when -pronom_only is off.
    if args.pronom_only:
        versionHeader = "FIDO v{0} ({1}, {2})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'])
    else:
        versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'], defaults['xml_fidoExtensionSignature'])
        defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])

    if args.v:
        sys.stdout.write(versionHeader)
        sys.exit(0)

    # NOTE(review): str.decode('string_escape') exists only on Python 2;
    # on Python 3 str has no decode() — confirm the targeted version.
    if args.matchprintf:
        args.matchprintf = args.matchprintf.decode('string_escape')
    if args.nomatchprintf:
        args.nomatchprintf = args.nomatchprintf.decode('string_escape')

    fido = Fido(
        quiet=args.q,
        bufsize=args.bufsize,
        container_bufsize=args.container_bufsize,
        printmatch=args.matchprintf,
        printnomatch=args.nomatchprintf,
        zip=args.zip,
        nocontainer=args.nocontainer,
        conf_dir=args.confdir)

    # TODO: Allow conf options to be dis-included
    if args.loadformats:
        for file in args.loadformats.split(','):
            fido.load_fido_xml(file)

    # TODO: remove from maps
    if args.useformats:
        args.useformats = args.useformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
    elif args.nouseformats:
        args.nouseformats = args.nouseformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]

    # Set up to use stdin, or open input files:
    if args.input == '-':
        args.files = sys.stdin
    elif args.input:
        args.files = open(args.input, 'r')

    # RUN
    try:
        if not args.q:
            sys.stderr.write(versionHeader)
            sys.stderr.flush()
        # A single '-' positional argument means "identify stdin".
        if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
            if fido.zip:
                raise RuntimeError("Multiple content read from stdin not yet supported.")
                # NOTE(review): the two statements below are unreachable
                # after the raise above — dead code kept as-is.
                sys.exit(1)
                fido.identify_multi_object_stream(sys.stdin)
            else:
                fido.identify_stream(sys.stdin, args.filename)
        else:
            for file in list_files(args.files, args.recurse):
                fido.identify_file(file)
    except KeyboardInterrupt:
        msg = "FIDO: Interrupt while identifying file {0}"
        sys.stderr.write(msg.format(fido.current_file))
        sys.exit(1)

    if not args.q:
        sys.stdout.flush()
        fido.print_summary(time.clock() - t0)
        sys.stderr.flush()
# Script entry point.
if __name__ == '__main__':
    main()