libis-format 0.9.32 → 0.9.33
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data/types.yml +30 -16
- data/lib/libis/format/config.rb +7 -18
- data/lib/libis/format/converter/image_converter.rb +6 -0
- data/lib/libis/format/droid.rb +82 -25
- data/lib/libis/format/extension_identification.rb +55 -0
- data/lib/libis/format/fido.rb +57 -72
- data/lib/libis/format/file_tool.rb +76 -0
- data/lib/libis/format/identification_tool.rb +174 -0
- data/lib/libis/format/identifier.rb +129 -117
- data/lib/libis/format/type_database.rb +36 -5
- data/lib/libis/format/version.rb +1 -1
- data/lib/libis/format.rb +3 -0
- data/libis-format.gemspec +2 -1
- data/spec/converter_spec.rb +6 -4
- data/spec/identifier_spec.rb +125 -34
- metadata +21 -126
- data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
- data/tools/droid/container-signature-20170330.xml +0 -3584
- data/tools/droid/droid-command-line-6.3.jar +0 -0
- data/tools/droid/droid.bat +0 -152
- data/tools/droid/droid.sh +0 -152
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.10.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/droid-container-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-6.3.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-6.3.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-help-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-6.3.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
- data/tools/droid/lib/droid-results-6.3.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
- data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.13.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -50
- data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
- data/tools/fido/conf/container-signature-20170330.xml +0 -3584
- data/tools/fido/conf/dc.xsd +0 -119
- data/tools/fido/conf/dcmitype.xsd +0 -53
- data/tools/fido/conf/dcterms.xsd +0 -383
- data/tools/fido/conf/fido-formats.xsd +0 -173
- data/tools/fido/conf/format_extension_template.xml +0 -105
- data/tools/fido/conf/format_extensions.xml +0 -484
- data/tools/fido/conf/formats-v90.xml +0 -48877
- data/tools/fido/conf/pronom-xml-v90.zip +0 -0
- data/tools/fido/conf/versions.xml +0 -8
- data/tools/fido/fido.bat +0 -4
- data/tools/fido/fido.py +0 -884
- data/tools/fido/fido.sh +0 -5
- data/tools/fido/package.py +0 -96
- data/tools/fido/prepare.py +0 -645
- data/tools/fido/pronomutils.py +0 -200
- data/tools/fido/toxml.py +0 -60
- data/tools/fido/update_signatures.py +0 -183
data/tools/fido/fido.py
DELETED
@@ -1,884 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
"""
|
5
|
-
Format Identification for Digital Objects (FIDO).
|
6
|
-
|
7
|
-
FIDO is a command-line tool to identify the file formats of digital objects.
|
8
|
-
It is designed for simple integration into automated work-flows.
|
9
|
-
"""
|
10
|
-
|
11
|
-
from __future__ import absolute_import
|
12
|
-
|
13
|
-
from argparse import ArgumentParser, RawTextHelpFormatter
|
14
|
-
from contextlib import closing
|
15
|
-
import os
|
16
|
-
import re
|
17
|
-
import sys
|
18
|
-
import tarfile
|
19
|
-
import tempfile
|
20
|
-
import time
|
21
|
-
from xml.etree import cElementTree as ET
|
22
|
-
from xml.etree import ElementTree as CET
|
23
|
-
import zipfile
|
24
|
-
|
25
|
-
from six.moves import range
|
26
|
-
|
27
|
-
from . import __version__, CONFIG_DIR
|
28
|
-
from .package import OlePackage, ZipPackage
|
29
|
-
from .pronomutils import get_local_pronom_versions
|
30
|
-
|
31
|
-
|
32
|
-
# Module-level fallback settings. Fido.__init__ reads these whenever the
# corresponding constructor argument is left as None.
defaults = {
    # Size of the begin-of-file / end-of-file buffers (bytes).
    'bufsize': 128 * 1024,
    # Value later assigned to re._MAXCACHE (a number of cached patterns).
    'regexcachesize': 2084,
    # %-style templates filled by Fido.print_matches with "info.*" keys.
    'printmatch': (
        'OK,%(info.time)s,%(info.puid)s,"%(info.formatname)s",'
        '"%(info.signaturename)s",%(info.filesize)s,"%(info.filename)s",'
        '"%(info.mimetype)s","%(info.matchtype)s"\n'
    ),
    'printnomatch': (
        'KO,%(info.time)s,,,,%(info.filesize)s,"%(info.filename)s",,'
        '"%(info.matchtype)s"\n'
    ),
    # Signature files, resolved relative to the configuration directory.
    'format_files': [
        'formats-v88.xml',
        'format_extensions.xml',
    ],
    'containersignature_file': 'container-signature-20170330.xml',
    # Buffer size used for container inspection (bytes).
    'container_bufsize': 512 * 1024,
    'description': """Format Identification for Digital Objects (fido).
FIDO is a command-line tool to identify the file formats of digital objects.
It is designed for simple integration into automated work-flows.""",
    'epilog': """
Open Planets Foundation (http://www.openplanetsfoundation.org)
See License.txt for license information.
Download from: https://github.com/openplanets/fido/releases
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
Author: Adam Farquhar (BL), 2010
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
FIDO uses the UK National Archives (TNA) PRONOM File Format
and Container descriptions.
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/""",
}
|
57
|
-
|
58
|
-
|
59
|
-
class Fido:
|
60
|
-
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=CONFIG_DIR, format_files=None, containersignature_file=None):
    """
    Configure the identifier and load every signature file.

    Any argument left as None falls back to the matching entry of the
    module-level ``defaults`` dict.

    @param quiet suppresses the summary written by print_summary.
    @param bufsize size of the begin/end-of-file buffers.
    @param container_bufsize buffer size stored for container handling.
    @param printnomatch / printmatch %-style output templates.
    @param zip when true, identify_file recurses into zip/tar containers.
    @param nocontainer stored on the instance as-is.
    @param handle_matches callback invoked with
           (filename, matches, elapsed, matchtype); defaults to print_matches.
    @param conf_dir directory holding the signature files.
    @param format_files list of format XML file names inside conf_dir.
    @param containersignature_file container signature file name inside
           conf_dir.
    """
    self.quiet = quiet
    self.bufsize = defaults['bufsize'] if bufsize is None else bufsize
    self.container_bufsize = defaults['container_bufsize'] if container_bufsize is None else container_bufsize
    self.printmatch = defaults['printmatch'] if printmatch is None else printmatch
    self.printnomatch = defaults['printnomatch'] if printnomatch is None else printnomatch
    self.handle_matches = self.print_matches if handle_matches is None else handle_matches
    self.zip = zip
    self.nocontainer = nocontainer
    self.conf_dir = conf_dir
    self.format_files = defaults['format_files'] if format_files is None else format_files
    # Honour the caller's choice; previously this argument was accepted but
    # the default file name was always used.
    self.containersignature_file = (
        defaults['containersignature_file']
        if containersignature_file is None
        else containersignature_file
    )
    self.formats = []
    self.puid_format_map = {}
    self.puid_has_priority_over_map = {}

    # Load signatures from the configuration directory.
    conf_root = os.path.abspath(self.conf_dir)
    for xml_file in self.format_files:
        self.load_fido_xml(os.path.join(conf_root, xml_file))
    self.load_container_signature(os.path.join(conf_root, self.containersignature_file))

    # Per-file state, updated while identifying.
    self.current_file = ''
    self.current_filesize = 0
    self.current_format = None
    self.current_sig = None
    self.current_pat = None
    self.current_count = 0  # Count of calls to match_formats
    # NOTE(review): _MAXCACHE is a private attribute of the re module;
    # presumably this enlarges its compiled-pattern cache -- confirm it still
    # has an effect on the targeted Python versions.
    re._MAXCACHE = defaults['regexcachesize']
    self.externalsig = ET.XML('<signature><name>External</name></signature>')
|
88
|
-
|
89
|
-
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
90
|
-
_special = '$()*+.?![]^\\{|}' # Before: '$*+.?![]^\\{|}'
|
91
|
-
_hex = '0123456789abcdef'
|
92
|
-
|
93
|
-
def _escape_char(self, c):
|
94
|
-
if c in '\n':
|
95
|
-
return '\\n'
|
96
|
-
elif c == '\r':
|
97
|
-
return '\\r'
|
98
|
-
elif c in self._special:
|
99
|
-
return '\\' + c
|
100
|
-
else:
|
101
|
-
(high, low) = divmod(ord(c), 16)
|
102
|
-
return '\\x' + self._hex[high] + self._hex[low]
|
103
|
-
|
104
|
-
def escape(self, string):
|
105
|
-
"""
|
106
|
-
Escape characters in pattern that are non-printable, non-ascii, or
|
107
|
-
special for regexes.
|
108
|
-
"""
|
109
|
-
escaped = ''.join(c if c in self._ordinary else self._escape_char(c) for c in string)
|
110
|
-
return escaped
|
111
|
-
|
112
|
-
def convert_container_sequence(self, sig):
    """
    Translate a PRONOM container byte sequence into a bytes regular
    expression.

    The text is scanned one character at a time with four flags:
      in_quote     -- inside a '...' literal outside of brackets
      half_byte    -- the first hex digit of a byte has been emitted
      in_range     -- inside a [...] alternative group
      range_quote  -- inside a '...' literal within a [...] group

    @param sig the sequence text.
    @return bytes pattern, always prefixed with (?s).
    """
    # The pattern is matched against file bytes, so it is built as bytes.
    parts = [b'(?s)']
    in_quote = False
    half_byte = False
    in_range = False
    range_quote = False

    for ch in sig:
        if not in_quote and not in_range:
            # Plain hex mode: pairs of hex digits separated by blanks.
            if ch == "'":
                in_quote = True
            elif ch == " ":
                pass
            elif ch == "[":
                parts.append(b"(")
                in_range = True
            elif not half_byte:
                parts.append(b"\\x" + ch.lower().encode('utf8'))
                half_byte = True
            else:
                parts.append(ch.lower().encode('utf8'))
                half_byte = False
            continue

        if in_quote:
            # Quoted text is taken literally until the closing quote.
            if ch == "'" and not in_range:
                in_quote = False
            else:
                parts.append(self.escape(ch).encode('utf8'))
            continue

        # Remaining case: inside a [...] group.
        if ch == "]":
            parts.append(b")")
            in_range = False
        elif ch not in ("-", "'") and range_quote:
            parts.append(self.escape(ch).encode('utf8'))
        elif ch not in ("-", "'", " ", ":") and not range_quote and not half_byte:
            parts.append(b"\\x" + ch.lower().encode('utf8'))
            half_byte = True
        elif ch not in ("-", "'", " ") and not range_quote and half_byte:
            parts.append(ch.lower().encode('utf8'))
            half_byte = False
        elif ch in ("-", " "):
            # Both separators become a regex alternation.
            parts.append(b"|")
        elif ch == "'":
            range_quote = not range_quote

    return b''.join(parts)
|
175
|
-
|
176
|
-
def load_container_signature(self, containersignature_file):
|
177
|
-
"""
|
178
|
-
Load the PRONOM container-signature file and convert sequences to
|
179
|
-
regular expressions.
|
180
|
-
"""
|
181
|
-
tree = CET.parse(containersignature_file)
|
182
|
-
# load and have container signatures converted
|
183
|
-
self.sequenceSignature = {}
|
184
|
-
for signature in tree.getroot().findall('ContainerSignatures/ContainerSignature'):
|
185
|
-
signatureId = signature.get('Id')
|
186
|
-
signatureSequence = signature.findall('Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence')
|
187
|
-
self.sequenceSignature[signatureId] = []
|
188
|
-
for sequence in signatureSequence:
|
189
|
-
self.sequenceSignature[signatureId].append(self.convert_container_sequence(sequence[0].text))
|
190
|
-
# map PUID to container signatureId
|
191
|
-
self.puidMapping = {}
|
192
|
-
mappings = tree.find('FileFormatMappings')
|
193
|
-
for mapping in mappings.findall('FileFormatMapping'):
|
194
|
-
if mapping.get('signatureId') not in self.puidMapping:
|
195
|
-
self.puidMapping[mapping.get('signatureId')] = []
|
196
|
-
self.puidMapping[mapping.get('signatureId')].append(mapping.get('Puid'))
|
197
|
-
# print "sequences:\n",self.sequenceSignature
|
198
|
-
# print "mapping:\n",self.puidMapping
|
199
|
-
# exit()
|
200
|
-
|
201
|
-
def extract_signatures(self, doc, signature_type="ZIP"):
|
202
|
-
"""
|
203
|
-
Given an XML container signature file, returns a dictionary of signatures.
|
204
|
-
|
205
|
-
The format of the dictionary is:
|
206
|
-
|
207
|
-
{
|
208
|
-
path_to_file_inside_zip: {puid: [signatures]}
|
209
|
-
}
|
210
|
-
"""
|
211
|
-
root = doc.getroot()
|
212
|
-
format_mappings = root.find("FileFormatMappings")
|
213
|
-
|
214
|
-
def get_puid(doc, element_id):
|
215
|
-
return format_mappings.find('FileFormatMapping[@signatureId="{}"]'.format(element_id)).attrib["Puid"]
|
216
|
-
|
217
|
-
def format_signature_attributes(element):
|
218
|
-
return {
|
219
|
-
"path": element.findtext("Files/File/Path"),
|
220
|
-
"id": element.attrib["Id"],
|
221
|
-
"signature": self.convert_container_sequence(element.findtext("Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence/Sequence"))
|
222
|
-
}
|
223
|
-
|
224
|
-
elements = root.findall("ContainerSignatures/ContainerSignature[@ContainerType=\"{}\"]".format(signature_type))
|
225
|
-
signatures = {}
|
226
|
-
for el in elements:
|
227
|
-
if el.find("Files/File/BinarySignatures") is None:
|
228
|
-
continue
|
229
|
-
|
230
|
-
puid = get_puid(doc, el.attrib["Id"])
|
231
|
-
signature = format_signature_attributes(el)
|
232
|
-
path = signature["path"]
|
233
|
-
if path not in signatures:
|
234
|
-
signatures[path] = {}
|
235
|
-
if puid not in signatures[path]:
|
236
|
-
signatures[path][puid] = []
|
237
|
-
signatures[path][puid].append(format_signature_attributes(el))
|
238
|
-
return signatures
|
239
|
-
|
240
|
-
def match_container(self, signature_type, klass, file, signature_file):
    """
    Run a container detector over @param file.

    @param signature_type container type passed on to extract_signatures.
    @param klass detector class; it is instantiated with (file, signatures)
           and its detect_formats() result is iterated as PUIDs.
    @param signature_file parsed container signature document.
    @return list of (format element, format name) tuples, one per PUID.
    """
    container_signatures = self.extract_signatures(signature_file, signature_type=signature_type)
    detected_puids = klass(file, container_signatures).detect_formats()

    results = []
    for puid in detected_puids:
        format_element = self.puid_format_map[puid]
        results.append((format_element, format_element.findtext("name")))
    return results
|
248
|
-
|
249
|
-
def load_fido_xml(self, file):
|
250
|
-
"""
|
251
|
-
Load the fido format information from @param file.
|
252
|
-
As a side-effect, set self.formats.
|
253
|
-
@return list of ElementTree.Element, one for each format.
|
254
|
-
"""
|
255
|
-
tree = ET.parse(file)
|
256
|
-
# print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
|
257
|
-
# TODO: Handle empty regexes properly; perhaps remove from the format list
|
258
|
-
for element in tree.getroot().findall('./format'):
|
259
|
-
puid = self.get_puid(element)
|
260
|
-
# Handle over-writes in multiple file loads
|
261
|
-
existing = self.puid_format_map.get(puid, False)
|
262
|
-
if existing:
|
263
|
-
# Already have one, so replace old with new!
|
264
|
-
self.formats[self.formats.index(existing)] = element
|
265
|
-
else:
|
266
|
-
self.formats.append(element)
|
267
|
-
self.puid_format_map[puid] = element
|
268
|
-
# Build some structures to speed things up
|
269
|
-
self.puid_has_priority_over_map[puid] = frozenset([puid_element.text for puid_element in element.findall('has_priority_over')])
|
270
|
-
return self.formats
|
271
|
-
|
272
|
-
# To delete a format: (1) remove from self.formats, (2) remove from self.puid_format_map, (3) remove from self.puid_has_priority_over_map
|
273
|
-
def get_signatures(self, format):
    """Return the list of <signature> children of @param format."""
    signature_elements = format.findall('signature')
    return signature_elements
|
275
|
-
|
276
|
-
def has_priority_over(self, format, possibly_inferior):
|
277
|
-
return self.get_puid(possibly_inferior)in self.puid_has_priority_over_map[self.get_puid(format)]
|
278
|
-
|
279
|
-
def get_puid(self, format):
    """Return the text of the <puid> child of @param format."""
    puid_element = format.find('puid')
    return puid_element.text
|
281
|
-
|
282
|
-
def get_patterns(self, signature):
    """Return the list of <pattern> children of @param signature."""
    pattern_elements = signature.findall('pattern')
    return pattern_elements
|
284
|
-
|
285
|
-
def get_pos(self, pat):
    """Return the text of the <position> child of @param pat."""
    position_element = pat.find('position')
    return position_element.text
|
287
|
-
|
288
|
-
def get_regex(self, pat):
    """
    Return the <regex> child text of @param pat, UTF-8 encoded.

    The pattern is applied to bytes read from a file, so it is returned as
    bytes as well.
    """
    regex_text = pat.find('regex').text
    return regex_text.encode('utf8')
|
291
|
-
|
292
|
-
def get_extension(self, format):
    """Return the text of the first <extension> child of @param format."""
    extension_element = format.find('extension')
    return extension_element.text
|
294
|
-
|
295
|
-
def print_matches(self, fullname, matches, delta_t, matchtype=''):
|
296
|
-
"""
|
297
|
-
The default match handler. Prints out information for each match in the list.
|
298
|
-
@param fullname is name of the file being matched
|
299
|
-
@param matches is a list of (format, signature)
|
300
|
-
@param delta_t is the time taken for the match.
|
301
|
-
@param matchtype is the type of match (signature, containersignature, extension, fail)
|
302
|
-
"""
|
303
|
-
class Info:
|
304
|
-
pass
|
305
|
-
obj = Info()
|
306
|
-
obj.count = self.current_count
|
307
|
-
obj.group_size = len(matches)
|
308
|
-
obj.filename = fullname
|
309
|
-
obj.time = int(delta_t * 1000)
|
310
|
-
obj.filesize = self.current_filesize
|
311
|
-
obj.matchtype = matchtype
|
312
|
-
if len(matches) == 0:
|
313
|
-
sys.stdout.write(self.printnomatch % {
|
314
|
-
"info.time": obj.time,
|
315
|
-
"info.filesize": obj.filesize,
|
316
|
-
"info.filename": obj.filename,
|
317
|
-
"info.count": obj.count,
|
318
|
-
"info.matchtype": "fail"
|
319
|
-
})
|
320
|
-
return
|
321
|
-
i = 0
|
322
|
-
for (f, sig_name) in matches:
|
323
|
-
i += 1
|
324
|
-
obj.group_index = i
|
325
|
-
obj.puid = self.get_puid(f)
|
326
|
-
obj.formatname = f.find('name').text
|
327
|
-
obj.signaturename = sig_name
|
328
|
-
mime = f.find('mime')
|
329
|
-
obj.mimetype = mime.text if mime is not None else None
|
330
|
-
version = f.find('version')
|
331
|
-
obj.version = version.text if version is not None else None
|
332
|
-
alias = f.find('alias')
|
333
|
-
obj.alias = alias.text if alias is not None else None
|
334
|
-
apple_uti = f.find('apple_uid')
|
335
|
-
obj.apple_uti = apple_uti.text if apple_uti is not None else None
|
336
|
-
sys.stdout.write(self.printmatch % {
|
337
|
-
"info.time": obj.time,
|
338
|
-
"info.puid": obj.puid,
|
339
|
-
"info.formatname": obj.formatname,
|
340
|
-
"info.signaturename": obj.signaturename,
|
341
|
-
"info.filesize": obj.filesize,
|
342
|
-
"info.filename": obj.filename,
|
343
|
-
"info.mimetype": obj.mimetype,
|
344
|
-
"info.matchtype": obj.matchtype,
|
345
|
-
"info.version": obj.version,
|
346
|
-
"info.alias": obj.alias,
|
347
|
-
"info.apple_uti": obj.apple_uti,
|
348
|
-
"info.group_size": obj.group_size,
|
349
|
-
"info.group_index": obj.group_index,
|
350
|
-
"info.count": obj.count
|
351
|
-
})
|
352
|
-
|
353
|
-
def print_summary(self, secs):
|
354
|
-
"""
|
355
|
-
Print summary information on the number of matches and time taken.
|
356
|
-
"""
|
357
|
-
count = self.current_count
|
358
|
-
if not self.quiet:
|
359
|
-
rate = (int(round(count / secs)) if secs != 0 else 9999)
|
360
|
-
# print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
|
361
|
-
sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
|
362
|
-
|
363
|
-
def identify_file(self, filename):
    """
    Identify the type of @param filename.
    Call self.handle_matches instead of returning a value.

    Order of attempts: byte signatures on the begin/end buffers; if those
    point at a zip or OLE container, the container signatures (a container
    hit is reported as "container" and ends the call); otherwise the
    signature matches, or an extension match when nothing matched or the
    file is empty. When self.zip is set and the result is a zip/tar
    container, its members are identified as well.

    An IOError is reported on stderr and not re-raised.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter when present.
    clock = getattr(time, 'perf_counter', None) or time.clock
    self.current_file = filename
    self.matchtype = "signature"
    try:
        t0 = clock()
        # The handle is only needed to fill the two buffers, so release it
        # as soon as they are read instead of leaking it.
        with open(filename, 'rb') as f:
            size = os.stat(filename)[6]
            self.current_filesize = size
            if self.current_filesize == 0:
                sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + filename + "\n")
            bofbuffer, eofbuffer, _ = self.get_buffers(f, size, seekable=True)
        matches = self.match_formats(bofbuffer, eofbuffer)
        container_type = self.container_type(matches)
        if container_type in ("zip", "ole"):
            container_file = ET.parse(os.path.join(os.path.abspath(self.conf_dir), self.containersignature_file))
            if container_type == "zip":
                container_matches = self.match_container("ZIP", ZipPackage, filename, container_file)
            else:
                container_matches = self.match_container("OLE2", OlePackage, filename, container_file)
            if len(container_matches) > 0:
                self.handle_matches(filename, container_matches, clock() - t0, "container")
                return
        # from here is also repeated in walk_zip
        # we should make this uniform in a next version!
        #
        # filesize is made conditional because files with 0 bytes
        # are falsely characterised being 'rtf' (due to wacky sig)
        # in these cases we try to match the extension instead
        if len(matches) > 0 and self.current_filesize > 0:
            self.handle_matches(filename, matches, clock() - t0, self.matchtype)
        elif len(matches) == 0 or self.current_filesize == 0:
            matches = self.match_extensions(filename)
            self.handle_matches(filename, matches, clock() - t0, "extension")
        # only recurse into certain containers, like ZIP or TAR
        container = self.container_type(matches)
        # till here matey!
        if self.zip and self.can_recurse_into_container(container):
            self.identify_contents(filename, type=container)
    except IOError:
        sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
|
408
|
-
|
409
|
-
def identify_contents(self, filename, fileobj=None, type=False):
    """
    Identify each item in a container (such as a zip or tar file) by
    dispatching to walk_zip or walk_tar.

    @param filename name of the container.
    @param fileobj could be a file, or a stream.
    @param type container kind: 'zip', 'tar', or a false value to do nothing.
    Raises RuntimeError for any other kind.
    """
    if not type:
        return
    if type == 'zip':
        self.walk_zip(filename, fileobj)
        return
    if type == 'tar':
        self.walk_tar(filename, fileobj)
        return
    # TODO: ouch!
    raise RuntimeError("Unknown container type: " + repr(type))
|
423
|
-
|
424
|
-
def identify_multi_object_stream(self, stream):
    """
    Does not work!
    Stream may contain one or more objects each with an HTTP style header
    that must include content-length. The headers consist of keyword:value
    pairs terminated by a newline. There must be a newline following the
    headers.

    For each object the headers are read line by line, then content-length
    bytes are buffered, matched and passed to self.handle_matches. The method
    returns when no content-length header is seen before the stream runs
    out, and raises EnvironmentError when the header block ends without one.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter when present.
    clock = getattr(time, 'perf_counter', None) or time.clock
    offset = 0
    while True:
        t0 = clock()
        content_length = -1
        for line in stream:
            offset += len(line)
            if line == '\n':
                # Blank line: end of this object's headers.
                if content_length < 0:
                    raise EnvironmentError("No content-length provided.")
                else:
                    break
            pair = line.lower().split(':', 2)
            if pair[0] == 'content-length':
                content_length = int(pair[1])
        if content_length == -1:
            return
        # Consume exactly content-length bytes
        self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
        self.current_filesize = content_length
        bofbuffer, eofbuffer, _ = self.get_buffers(stream, content_length)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # MdR: this needs attention
        if len(matches) > 0:
            self.handle_matches(self.current_file, matches, clock() - t0, "signature")
        elif len(matches) == 0 or self.current_filesize == 0:
            matches = self.match_extensions(self.current_file)
            self.handle_matches(self.current_file, matches, clock() - t0, "extension")
|
459
|
-
|
460
|
-
def identify_stream(self, stream, filename):
    """
    Identify the type of @param stream.
    Call self.handle_matches instead of returning a value.
    Does not close stream.

    Signature matches are reported under the name 'STDIN'. When nothing
    matches, an extension match is attempted using a file name taken from
    /proc/self/fd/0 (non-Windows) or from @param filename; on non-Windows
    systems the result is still reported as 'STDIN'.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter when present.
    clock = getattr(time, 'perf_counter', None) or time.clock
    t0 = clock()
    bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
    self.current_filesize = bytes_read
    self.current_file = 'STDIN'
    matches = self.match_formats(bofbuffer, eofbuffer)
    # MdR: this needs attention
    if len(matches) > 0:
        self.handle_matches(self.current_file, matches, clock() - t0, "signature")
    elif len(matches) == 0 or self.current_filesize == 0:
        # we can only determine the filename from the STDIN stream
        # on Linux, on Windows there is not a (simple) way to do that
        if (os.name != "nt"):
            try:
                self.current_file = os.readlink("/proc/self/fd/0")
            except (OSError, AttributeError, NotImplementedError):
                # No /proc entry (or no readlink on this platform): fall
                # back to the supplied name. Narrowed from a bare except so
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                if filename is not None:
                    self.current_file = filename
                else:
                    self.current_file = 'STDIN'
        else:
            if filename is not None:
                self.current_file = filename
        matches = self.match_extensions(self.current_file)
        # we have to reset self.current_file if not on Windows
        if (os.name != "nt"):
            self.current_file = 'STDIN'
        self.handle_matches(self.current_file, matches, clock() - t0, "extension")
|
493
|
-
|
494
|
-
def container_type(self, matches):
    """
    Report the container kind of the first match that is a container.

    A match counts as a container when its format element has a
    <container> child (whose text is returned) or when its PUID is fmt/111,
    which is OLE (returns 'ole').

    @param matches list of (format, signature name) tuples.
    @return the container kind, or False when no match is a container.
    """
    for format_, _unused in matches:
        container_element = format_.find('container')
        if container_element is not None:
            return container_element.text

        # fmt/111 carries no <container> element but is an OLE container.
        puid_element = format_.find('puid')
        if puid_element is None:
            continue
        if puid_element.text == 'fmt/111':
            return 'ole'
    return False
|
511
|
-
|
512
|
-
def can_recurse_into_container(self, container_type):
    """
    Tell whether @param container_type names a container whose members can
    be extracted and identified one by one.

    Only 'zip' and 'tar' qualify. Other kinds, such as OLE, are usually most
    interesting as compound objects rather than for their contents.
    """
    recursable = ('zip', 'tar')
    return container_type in recursable
|
523
|
-
|
524
|
-
def blocking_read(self, file, bytes_to_read):
    """
    Read up to @param bytes_to_read bytes from @param file, repeating short
    reads until the requested amount is collected or the stream ends.

    @return the bytes read; shorter than requested only at end of stream.
    """
    chunks = []
    remaining = bytes_to_read
    while remaining > 0:
        chunk = file.read(remaining)
        # An empty read means EOF. Test emptiness rather than comparing with
        # '' so this also holds for bytes, which never equal a text string;
        # the old comparison looped forever on a short binary stream.
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)
|
535
|
-
|
536
|
-
def get_buffers(self, stream, length=None, seekable=False):
|
537
|
-
"""
|
538
|
-
Return buffers from the beginning and end of stream and the number of
|
539
|
-
bytes read if there may be more bytes in the stream.
|
540
|
-
|
541
|
-
If length is None, return the length as found.
|
542
|
-
If seekable is False, the steam does not support a seek operation.
|
543
|
-
"""
|
544
|
-
bytes_to_read = self.bufsize if length is None else min(length, self.bufsize)
|
545
|
-
bofbuffer = self.blocking_read(stream, bytes_to_read)
|
546
|
-
bytes_read = len(bofbuffer)
|
547
|
-
if length is None:
|
548
|
-
# A stream with unknown length; have to keep two buffers around
|
549
|
-
prevbuffer = bofbuffer
|
550
|
-
while True:
|
551
|
-
buffer = self.blocking_read(stream, self.bufsize)
|
552
|
-
bytes_read += len(buffer)
|
553
|
-
if len(buffer) == self.bufsize:
|
554
|
-
prevbuffer = buffer
|
555
|
-
else:
|
556
|
-
eofbuffer = prevbuffer if len(buffer) == 0 else prevbuffer[-(self.bufsize - len(buffer)):] + buffer
|
557
|
-
break
|
558
|
-
return bofbuffer, eofbuffer, bytes_read
|
559
|
-
else:
|
560
|
-
bytes_unread = length - len(bofbuffer)
|
561
|
-
if bytes_unread == 0:
|
562
|
-
eofbuffer = bofbuffer
|
563
|
-
elif bytes_unread < self.bufsize:
|
564
|
-
# The buffs overlap
|
565
|
-
eofbuffer = bofbuffer[bytes_unread:] + self.blocking_read(stream, bytes_unread)
|
566
|
-
elif bytes_unread == self.bufsize:
|
567
|
-
eofbuffer = self.blocking_read(stream, self.bufsize)
|
568
|
-
elif seekable: # easy case when we can just seek!
|
569
|
-
stream.seek(length - self.bufsize)
|
570
|
-
eofbuffer = self.blocking_read(stream, self.bufsize)
|
571
|
-
else:
|
572
|
-
# We have more to read and know how much.
|
573
|
-
# n*bufsize + r = length
|
574
|
-
(n, r) = divmod(bytes_unread, self.bufsize)
|
575
|
-
# skip n-1*bufsize bytes
|
576
|
-
for unused_i in range(1, n):
|
577
|
-
self.blocking_read(stream, self.bufsize)
|
578
|
-
# skip r bytes
|
579
|
-
self.blocking_read(stream, r)
|
580
|
-
# and read the remaining bufsize bytes into the eofbuffer
|
581
|
-
eofbuffer = self.blocking_read(stream, self.bufsize)
|
582
|
-
return bofbuffer, eofbuffer, bytes_to_read
|
583
|
-
|
584
|
-
def walk_zip(self, filename, fileobj=None):
    """
    Identify the type of each non-empty item in a zip archive.

    @param filename path of the archive; also used as the prefix of the
                    'archive!member' name reported for every item.
    @param fileobj  optional file object holding the archive; when it is
                    falsy the archive is opened from filename instead.
    Results are reported through self.handle_matches, nothing is returned.
    An IOError or zipfile.BadZipfile is reported on stderr and swallowed.
    """
    try:
        with zipfile.ZipFile((fileobj if fileobj else filename), 'r') as zipstream:
            for item in zipstream.infolist():
                if item.file_size == 0:
                    continue  # TODO: Find a better test for isdir
                t0 = time.clock()
                item_name = filename + '!' + item.filename
                self.current_file = item_name
                # Always > 0 here: empty members were skipped above.
                self.current_filesize = item.file_size
                with zipstream.open(item) as f:
                    bofbuffer, eofbuffer, _ = self.get_buffers(f, item.file_size)
                matches = self.match_formats(bofbuffer, eofbuffer)
                if matches:
                    self.handle_matches(item_name, matches, time.clock() - t0, "signature")
                else:
                    # No signature hit: fall back to the file extension.
                    matches = self.match_extensions(item_name)
                    self.handle_matches(item_name, matches, time.clock() - t0, "extension")
                container = self.container_type(matches)
                if container:
                    # Nested container: copy the member into a temporary file
                    # and identify its contents. The 'with' guarantees the
                    # temporary file is released even if identification fails.
                    with tempfile.SpooledTemporaryFile(prefix='Fido') as target:
                        with zipstream.open(item) as source:
                            self.copy_stream(source, target)
                        self.identify_contents(item_name, target, container)
    except (IOError, zipfile.BadZipfile):
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
|
620
|
-
|
621
|
-
def walk_tar(self, filename, fileobj):
    """
    Identify the type of every regular file stored in a tar archive.

    @param filename name of the archive; passed to tarfile.TarFile and used
                    as the prefix of the 'archive!member' name of each item.
    @param fileobj  file object passed to tarfile.TarFile as its fileobj.
    Results are reported through self.handle_matches, nothing is returned.
    A tarfile.TarError is reported on stderr and swallowed.
    """
    try:
        with tarfile.TarFile(filename, fileobj=fileobj, mode='r') as archive:
            for member in archive.getmembers():
                if member.isfile():
                    started = time.clock()
                    with closing(archive.extractfile(member)) as member_stream:
                        member_name = filename + '!' + member.name
                        self.current_file = member_name
                        self.current_filesize = member.size
                        head, tail, _ = self.get_buffers(member_stream, member.size)
                        matches = self.match_formats(head, tail)
                        elapsed = time.clock() - started
                        self.handle_matches(member_name, matches, elapsed)
                        if self.container_type(matches):
                            # Nested container: rewind and look inside it.
                            member_stream.seek(0)
                            self.identify_contents(member_name, member_stream, self.container_type(matches))
    except tarfile.TarError:
        sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
|
646
|
-
|
647
|
-
def as_good_as_any(self, f1, match_list):
    """
    Tell whether format f1 is at least as good as every match in match_list.

    @param f1         the proposed format.
    @param match_list list of (format, signature name) tuples.
    @return False as soon as some other format in match_list has priority
            over f1 according to self.puid_has_priority_over_map,
            True otherwise (including for an empty list).
    """
    if match_list == []:
        return True
    proposed_puid = self.get_puid(f1)
    for (f2, _signature) in match_list:
        if f1 == f2:
            # A format never outranks itself.
            continue
        outranked_by_f2 = self.puid_has_priority_over_map[self.get_puid(f2)]
        if proposed_puid in outranked_by_f2:
            return False
    return True
|
660
|
-
|
661
|
-
def buffered_read(self, file_pos, overlap):
    """
    Read one chunk of self.current_file starting at file_pos.

    @param file_pos byte offset at which the chunk starts.
    @param overlap  when true the chunk is extended by self.overlap_range
                    bytes beyond self.container_bufsize.
    @return the bytes read: a full chunk, or whatever remains before
            self.current_filesize when less than a full chunk is left.
    """
    bufsize = self.container_bufsize
    if overlap:
        bufsize += self.overlap_range
    remaining = self.current_filesize - file_pos
    # Read the chunk size computed above. The original read self.bufsize
    # here, a different setting from the one used for the threshold, so the
    # returned chunk could be shorter or longer than the requested one.
    file_read = remaining if remaining < bufsize else bufsize
    with open(self.current_file, 'rb') as file_handle:
        file_handle.seek(file_pos)
        return file_handle.read(file_read)
|
679
|
-
|
680
|
-
def match_formats(self, bofbuffer, eofbuffer):
    """
    Run the signature patterns of every known format against the buffers.

    @param bofbuffer buffer holding the beginning of the file.
    @param eofbuffer buffer holding the end of the file.
    @return a list of (format, signature name) tuples from which the
            matches outranked by another match have been removed.
    An exception raised while testing a format is written to stderr and
    that format is skipped.
    """
    self.current_count += 1
    # Pattern position -> (regex function, buffer it is applied to).
    # Patterns with any other position are not tested at all.
    probes = {
        'BOF': (re.match, bofbuffer),
        'EOF': (re.search, eofbuffer),
        'VAR': (re.search, bofbuffer),
        'IFB': (re.search, bofbuffer),
    }
    found = []
    for fmt in self.formats:
        try:
            self.current_format = fmt
            if not self.as_good_as_any(fmt, found):
                continue
            for sig in self.get_signatures(fmt):
                self.current_sig = sig
                all_patterns_hit = True
                for pat in self.get_patterns(sig):
                    self.current_pat = pat
                    pos = self.get_pos(pat)
                    regex = self.get_regex(pat)
                    probe = probes.get(pos)
                    if probe is None:
                        continue
                    matcher, buf = probe
                    if not matcher(regex, buf):
                        all_patterns_hit = False
                        break
                if all_patterns_hit:
                    found.append((fmt, sig.findtext("name")))
        except Exception as e:
            # TODO: MdR: needs some <3
            sys.stderr.write(str(e) + "\n")
            continue
    return [match for match in found if self.as_good_as_any(match[0], found)]
|
731
|
-
|
732
|
-
def match_extensions(self, filename):
    """
    Collect the formats that declare the extension of filename.

    @param filename name whose extension (lower-cased, without the dot) is
                    compared with the 'extension' elements of each format.
    @return a list of (format, name of self.externalsig) tuples with the
            matches outranked by another match removed; an empty list when
            filename has no extension.
    """
    suffix = os.path.splitext(filename)[1].lower().lstrip(".")
    if not suffix:
        return []
    candidates = []
    for element in self.formats:
        declared = element.findall('extension')
        if any(suffix == ext.text for ext in declared):
            candidates.append((element, self.externalsig.findtext("name")))
    return [match for match in candidates if self.as_good_as_any(match[0], candidates)]
|
747
|
-
|
748
|
-
def copy_stream(self, source, target):
    """
    Copy everything from source to target in chunks of self.bufsize.

    @param source object with a read(size) method.
    @param target object with a write(data) method.
    Copying stops at the first empty read.
    """
    chunk = source.read(self.bufsize)
    while len(chunk) != 0:
        target.write(chunk)
        chunk = source.read(self.bufsize)
|
754
|
-
|
755
|
-
|
756
|
-
def list_files(roots, recurse=False):
    """
    Yield the path of every file found under the given roots, one at a time.

    @param roots   iterable of path strings, for example a list or an open
                   file yielding one path per line; a single trailing
                   newline on an entry is dropped.
    @param recurse when False only the files directly inside a directory
                   root are yielded; when True its subdirectories are
                   walked as well.
    Blank entries are skipped. An empty string used to raise IndexError, and
    a blank line was normalised to '.', which silently scanned the current
    directory.
    """
    for root in roots:
        if root.endswith('\n'):
            root = root[:-1]
        if not root:
            continue
        root = os.path.normpath(root)
        if os.path.isfile(root):
            yield root
            continue
        for path, unused, files in os.walk(root):
            for f in files:
                yield os.path.join(path, f)
            if not recurse:
                # Only the top level of the directory was requested.
                break
|
771
|
-
|
772
|
-
|
773
|
-
def main(args=None):
    """
    Command-line entry point: parse the options, build a Fido instance and
    identify the requested files or stdin content.

    @param args list of command-line arguments without the program name;
                when it is empty or None, sys.argv[1:] is used instead.
    Prints the help text and exits with status 1 when there are no arguments
    at all; exits with status 0 after printing the version for -v; exits
    with status 1 on a keyboard interrupt.
    """
    if not args:
        args = sys.argv[1:]

    parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-v', default=False, action='store_true', help='show version information')
    parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
    parser.add_argument('-recurse', default=False, action='store_true', help='recurse into subdirectories')
    parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
    parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
    parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
    group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')

    parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
    parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
    parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
    parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
    parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
    parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['bufsize']) + ' bytes)')
    parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['container_bufsize']) + ' bytes)')
    parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
    parser.add_argument('-confdir', default=CONFIG_DIR, help='configuration directory to load_fido_xml, for example, the format specifications from.')

    # Decide on the effective argument list, not on sys.argv: the original
    # test (len(sys.argv) == 1) printed the help and exited even when the
    # caller had passed arguments explicitly to main().
    if not args:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(args)

    t0 = time.clock()

    versions = get_local_pronom_versions(args.confdir)

    defaults['xml_pronomSignature'] = versions.pronom_signature
    defaults['containersignature_file'] = versions.pronom_container_signature
    defaults['xml_fidoExtensionSignature'] = versions.fido_extension_signature
    defaults['format_files'] = [defaults['xml_pronomSignature']]

    if args.pronom_only:
        versionHeader = "FIDO v{0} ({1}, {2})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'])
    else:
        versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'], defaults['xml_fidoExtensionSignature'])
        defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])

    if args.v:
        sys.stdout.write(versionHeader)
        sys.exit(0)

    # Turn escape sequences typed on the command line (e.g. \n) into the
    # characters they stand for.
    if args.matchprintf:
        args.matchprintf = args.matchprintf.decode('string_escape')
    if args.nomatchprintf:
        args.nomatchprintf = args.nomatchprintf.decode('string_escape')

    fido = Fido(
        quiet=args.q,
        bufsize=args.bufsize,
        container_bufsize=args.container_bufsize,
        printmatch=args.matchprintf,
        printnomatch=args.nomatchprintf,
        zip=args.zip,
        nocontainer=args.nocontainer,
        conf_dir=args.confdir)

    # TODO: Allow conf options to be dis-included
    if args.loadformats:
        for format_file in args.loadformats.split(','):
            fido.load_fido_xml(format_file)

    # TODO: remove from maps
    if args.useformats:
        args.useformats = args.useformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
    elif args.nouseformats:
        args.nouseformats = args.nouseformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]

    # Set up to use stdin, or open input files:
    input_handle = None
    if args.input == '-':
        args.files = sys.stdin
    elif args.input:
        input_handle = open(args.input, 'r')
        args.files = input_handle

    # RUN
    try:
        if not args.q:
            sys.stderr.write(versionHeader)
            sys.stderr.flush()
        if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
            if fido.zip:
                # Reading several objects from stdin is not wired up yet;
                # the statements that followed this raise were unreachable.
                raise RuntimeError("Multiple content read from stdin not yet supported.")
            else:
                fido.identify_stream(sys.stdin, args.filename)
        else:
            for path in list_files(args.files, args.recurse):
                fido.identify_file(path)
    except KeyboardInterrupt:
        msg = "FIDO: Interrupt while identifying file {0}"
        sys.stderr.write(msg.format(fido.current_file))
        sys.exit(1)
    finally:
        # Release the list file opened for -input, if any.
        if input_handle is not None:
            input_handle.close()

    if not args.q:
        sys.stdout.flush()
        fido.print_summary(time.clock() - t0)
        sys.stderr.flush()
|
881
|
-
|
882
|
-
|
883
|
-
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
|