libis-format 0.9.30 → 0.9.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/lib/libis/format/converter/image_converter.rb +2 -2
  3. data/lib/libis/format/office_to_pdf.rb +1 -1
  4. data/lib/libis/format/version.rb +1 -1
  5. data/spec/converter_spec.rb +43 -27
  6. data/spec/data/test-options.png +0 -0
  7. data/spec/data/test.pdf.tif +0 -0
  8. data/tools/droid/{DROID_SignatureFile_V82.xml → DROID_SignatureFile_V90.xml} +8202 -701
  9. data/tools/droid/{container-signature-20150307.xml → container-signature-20170330.xml} +3584 -2235
  10. data/tools/droid/droid-command-line-6.3.jar +0 -0
  11. data/tools/droid/droid.bat +152 -154
  12. data/tools/droid/droid.sh +30 -16
  13. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  14. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  15. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  16. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  17. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  18. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  19. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  20. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  21. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  22. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  23. data/tools/droid/lib/{droid-help-6.1.5.jar → droid-help-6.3.jar} +0 -0
  24. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  25. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  26. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  27. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  28. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  29. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  30. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  31. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  32. data/tools/droid/lib/poi-3.13.jar +0 -0
  33. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  34. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  35. data/tools/fido/__init__.py +50 -0
  36. data/tools/fido/conf/DROID_SignatureFile-v90.xml +2 -0
  37. data/tools/fido/conf/{container-signature-20150307.xml → container-signature-20170330.xml} +1487 -141
  38. data/tools/fido/conf/format_extensions.xml +0 -14
  39. data/tools/fido/conf/{formats-v81.xml → formats-v90.xml} +11409 -887
  40. data/tools/fido/conf/{pronom-xml-v81.zip → pronom-xml-v90.zip} +0 -0
  41. data/tools/fido/conf/versions.xml +6 -6
  42. data/tools/fido/fido.py +437 -407
  43. data/tools/fido/package.py +96 -0
  44. data/tools/fido/prepare.py +217 -188
  45. data/tools/fido/pronomutils.py +143 -58
  46. data/tools/fido/toxml.py +54 -46
  47. data/tools/fido/update_signatures.py +139 -127
  48. metadata +34 -40
  49. data/tools/droid/droid-command-line-6.1.5.jar +0 -0
  50. data/tools/droid/lib/antlr-2.7.7.jar +0 -0
  51. data/tools/droid/lib/antlr-3.2.jar +0 -0
  52. data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
  53. data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
  54. data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
  55. data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
  56. data/tools/droid/lib/commons-codec-1.4.jar +0 -0
  57. data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
  58. data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
  59. data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
  60. data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
  61. data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
  62. data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
  63. data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
  64. data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
  65. data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
  66. data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
  67. data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
  68. data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
  69. data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
  70. data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
  71. data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
  72. data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
  73. data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
  74. data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
  75. data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
  76. data/tools/droid/lib/poi-3.7.jar +0 -0
  77. data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
  78. data/tools/fido/argparselocal.py +0 -2355
  79. data/tools/fido/conf/DROID_SignatureFile-v81.xml +0 -2
@@ -1,8 +1,8 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
1
+ <?xml version='1.0' encoding='utf-8'?>
2
2
  <versions>
3
- <pronomVersion>81</pronomVersion>
4
- <pronomSignature>formats-v81.xml</pronomSignature>
5
- <pronomContainerSignature>container-signature-20150307.xml</pronomContainerSignature>
3
+ <pronomVersion>90</pronomVersion>
4
+ <pronomSignature>formats-v90.xml</pronomSignature>
5
+ <pronomContainerSignature>container-signature-20170330.xml</pronomContainerSignature>
6
6
  <fidoExtensionSignature>format_extensions.xml</fidoExtensionSignature>
7
- <updateScript>1.2.2</updateScript>
8
- </versions>
7
+ <updateScript>1.3.6</updateScript>
8
+ </versions>
data/tools/fido/fido.py CHANGED
@@ -1,57 +1,75 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
- import sys, re, os, time, math
4
- import hashlib, urllib, urlparse, csv, getopt
3
+
4
+ """
5
+ Format Identification for Digital Objects (FIDO).
6
+
7
+ FIDO is a command-line tool to identify the file formats of digital objects.
8
+ It is designed for simple integration into automated work-flows.
9
+ """
10
+
11
+ from __future__ import absolute_import
12
+
13
+ from argparse import ArgumentParser, RawTextHelpFormatter
14
+ from contextlib import closing
15
+ import os
16
+ import re
17
+ import sys
18
+ import tarfile
19
+ import tempfile
20
+ import time
5
21
  from xml.etree import cElementTree as ET
6
22
  from xml.etree import ElementTree as CET
7
- from xml.etree import ElementTree as VET # versions.xml
8
-
9
- version = '1.3.1'
10
- defaults = {'bufsize': 128 * 1024, # (bytes)
11
- 'regexcachesize' :2084, # (bytes)
12
- 'conf_dir' : os.path.join(os.path.dirname(__file__), 'conf'),
13
- 'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
14
- 'printnomatch' : "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
15
- 'format_files': ['formats-v75.xml', 'format_extensions.xml'],
16
- 'containersignature_file' : 'container-signature-20150307.xml',
17
- # versions.xml is where fido.py reads version information
18
- # about which xml to load
19
- 'versions_file' : 'versions.xml',
20
- 'container_bufsize' : 512 * 1024, # (bytes)
21
- 'description' : """
22
- Format Identification for Digital Objects (fido).
23
- FIDO is a command-line tool to identify the file formats of digital objects.
24
- It is designed for simple integration into automated work-flows.
25
- """,
26
- 'epilog' : """
27
- Open Planets Foundation (http://www.openplanetsfoundation.org)
28
- See License.txt for license information.
29
- Download from: https://github.com/openplanets/fido/releases
30
- Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
31
- Author: Adam Farquhar (BL), 2010
32
- Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
33
- FIDO uses the UK National Archives (TNA) PRONOM File Format
34
- and Container descriptions.
35
- PRONOM is available from http://www.nationalarchives.gov.uk/pronom/"""
23
+ import zipfile
24
+
25
+ from six.moves import range
26
+
27
+ from . import __version__, CONFIG_DIR
28
+ from .package import OlePackage, ZipPackage
29
+ from .pronomutils import get_local_pronom_versions
30
+
31
+
32
+ defaults = {
33
+ 'bufsize': 128 * 1024, # (bytes)
34
+ 'regexcachesize': 2084, # (bytes)
35
+ 'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
36
+ 'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
37
+ 'format_files': [
38
+ 'formats-v88.xml',
39
+ 'format_extensions.xml'
40
+ ],
41
+ 'containersignature_file': 'container-signature-20170330.xml',
42
+ 'container_bufsize': 512 * 1024, # (bytes)
43
+ 'description': """Format Identification for Digital Objects (fido).
44
+ FIDO is a command-line tool to identify the file formats of digital objects.
45
+ It is designed for simple integration into automated work-flows.""",
46
+ 'epilog': """
47
+ Open Planets Foundation (http://www.openplanetsfoundation.org)
48
+ See License.txt for license information.
49
+ Download from: https://github.com/openplanets/fido/releases
50
+ Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
51
+ Author: Adam Farquhar (BL), 2010
52
+ Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
53
+ FIDO uses the UK National Archives (TNA) PRONOM File Format
54
+ and Container descriptions.
55
+ PRONOM is available from http://www.nationalarchives.gov.uk/pronom/""",
36
56
  }
37
57
 
58
+
38
59
  class Fido:
39
- def __init__(self, quiet=False, bufsize=None, container_bufsize = None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=None, format_files=None, containersignature_file=None):
60
+ def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=CONFIG_DIR, format_files=None, containersignature_file=None):
40
61
  global defaults
41
62
  self.quiet = quiet
42
- self.bufsize = (defaults['bufsize'] if bufsize == None else bufsize)
43
- self.container_bufsize = (defaults['container_bufsize'] if container_bufsize == None else container_bufsize)
44
- self.printmatch = (defaults['printmatch'] if printmatch == None else printmatch)
45
- self.printnomatch = (defaults['printnomatch'] if printnomatch == None else printnomatch)
46
- self.handle_matches = (self.print_matches if handle_matches == None else handle_matches)
63
+ self.bufsize = defaults['bufsize'] if bufsize is None else bufsize
64
+ self.container_bufsize = defaults['container_bufsize'] if container_bufsize is None else container_bufsize
65
+ self.printmatch = defaults['printmatch'] if printmatch is None else printmatch
66
+ self.printnomatch = defaults['printnomatch'] if printnomatch is None else printnomatch
67
+ self.handle_matches = self.print_matches if handle_matches is None else handle_matches
47
68
  self.zip = zip
48
- self.nocontainer = (defaults['nocontainer'] if nocontainer == None else nocontainer)
49
- self.conf_dir = defaults['conf_dir'] if conf_dir == None else conf_dir
50
- # print defaults
51
- # sys.exit()
52
- self.format_files = defaults['format_files'] if format_files == None else format_files
53
- #self.containersignature_file = defaults['containersignature_file'] if containersignature_file == None else containersignature_file
54
- self.containersignature_file = defaults['containersignature_file'] #if containersignature_file == None else containersignature_file
69
+ self.nocontainer = nocontainer
70
+ self.conf_dir = conf_dir
71
+ self.format_files = defaults['format_files'] if format_files is None else format_files
72
+ self.containersignature_file = defaults['containersignature_file']
55
73
  self.formats = []
56
74
  self.puid_format_map = {}
57
75
  self.puid_has_priority_over_map = {}
@@ -69,10 +87,10 @@ class Fido:
69
87
  self.externalsig = ET.XML('<signature><name>External</name></signature>')
70
88
 
71
89
  _ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
72
- #_special = '$*+.?![]^\\{|}'
73
- _special = '$()*+.?![]^\\{|}'
90
+ _special = '$()*+.?![]^\\{|}' # Before: '$*+.?![]^\\{|}'
74
91
  _hex = '0123456789abcdef'
75
- def _escape_char(self,c):
92
+
93
+ def _escape_char(self, c):
76
94
  if c in '\n':
77
95
  return '\\n'
78
96
  elif c == '\r':
@@ -82,17 +100,22 @@ class Fido:
82
100
  else:
83
101
  (high, low) = divmod(ord(c), 16)
84
102
  return '\\x' + self._hex[high] + self._hex[low]
85
-
86
- def escape(self,string):
87
- "Escape characters in pattern that are non-printable, non-ascii, or special for regexes."
103
+
104
+ def escape(self, string):
105
+ """
106
+ Escape characters in pattern that are non-printable, non-ascii, or
107
+ special for regexes.
108
+ """
88
109
  escaped = ''.join(c if c in self._ordinary else self._escape_char(c) for c in string)
89
110
  return escaped
90
111
 
91
- def convert_container_sequence(self,sig):
92
- """Parse the PRONOM container sequences
93
- and convert to regular expressions
112
+ def convert_container_sequence(self, sig):
113
+ """
114
+ Parse the PRONOM container sequences and convert to regular
115
+ expressions.
94
116
  """
95
- seq = '(?s)'
117
+ # The sequence is regex matching bytes from a file so the sequence must also be bytes
118
+ seq = b'(?s)'
96
119
  inq = False
97
120
  byt = False
98
121
  rng = False
@@ -105,41 +128,41 @@ class Fido:
105
128
  if sig[i] == " ":
106
129
  continue
107
130
  if sig[i] == "[":
108
- seq += "("
131
+ seq += b"("
109
132
  rng = True
110
133
  continue
111
134
  if not byt:
112
- seq += "\\x" + sig[i].lower()
135
+ seq += b"\\x" + sig[i].lower().encode('utf8')
113
136
  byt = True
114
137
  continue
115
138
  if byt:
116
- seq += sig[i].lower()
139
+ seq += sig[i].lower().encode('utf8')
117
140
  byt = False
118
141
  continue
119
142
  if inq:
120
143
  if sig[i] == "'" and not rng:
121
144
  inq = False
122
145
  continue
123
- seq += self.escape(sig[i])
146
+ seq += self.escape(sig[i]).encode('utf8')
124
147
  continue
125
148
  if rng:
126
149
  if sig[i] == "]":
127
- seq += ")"
150
+ seq += b")"
128
151
  rng = False
129
152
  continue
130
153
  if sig[i] != "-" and sig[i] != "'" and ror:
131
- seq += self.escape(sig[i])
154
+ seq += self.escape(sig[i]).encode('utf8')
132
155
  continue
133
- if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and not byt:
134
- seq += "\\x" + sig[i].lower()
156
+ if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and sig[i] != ":" and not ror and not byt:
157
+ seq += b"\\x" + sig[i].lower().encode('utf8')
135
158
  byt = True
136
159
  continue
137
160
  if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and byt:
138
- seq += sig[i].lower()
161
+ seq += sig[i].lower().encode('utf8')
139
162
  byt = False
140
163
  continue
141
164
  if sig[i] == "-" or sig[i] == " ":
142
- seq += "|"
165
+ seq += b"|"
143
166
  continue
144
167
  if sig[i] == "'" and not ror:
145
168
  ror = True
@@ -147,12 +170,13 @@ class Fido:
147
170
  if sig[i] == "'" and ror:
148
171
  ror = False
149
172
  continue
150
- #print seq
173
+
151
174
  return seq
152
-
175
+
153
176
  def load_container_signature(self, containersignature_file):
154
- """Load the PRONOM container-signature file
155
- and convert sequences to regular expressions
177
+ """
178
+ Load the PRONOM container-signature file and convert sequences to
179
+ regular expressions.
156
180
  """
157
181
  tree = CET.parse(containersignature_file)
158
182
  # load and have container signatures converted
@@ -163,11 +187,6 @@ class Fido:
163
187
  self.sequenceSignature[signatureId] = []
164
188
  for sequence in signatureSequence:
165
189
  self.sequenceSignature[signatureId].append(self.convert_container_sequence(sequence[0].text))
166
- # find PUIDs which trigger container matching
167
- self.puidTriggers = {}
168
- triggers = tree.find('TriggerPuids')
169
- for puid in triggers.findall('TriggerPuid'):
170
- self.puidTriggers[puid.get('Puid')] = True
171
190
  # map PUID to container signatureId
172
191
  self.puidMapping = {}
173
192
  mappings = tree.find('FileFormatMappings')
@@ -175,24 +194,72 @@ class Fido:
175
194
  if mapping.get('signatureId') not in self.puidMapping:
176
195
  self.puidMapping[mapping.get('signatureId')] = []
177
196
  self.puidMapping[mapping.get('signatureId')].append(mapping.get('Puid'))
178
- # print "sequences:\n",self.sequenceSignature
179
- # print "trigger:\n",self.puidTriggers
180
- # print "mapping:\n",self.puidMapping
181
- # exit()
197
+ # print "sequences:\n",self.sequenceSignature
198
+ # print "mapping:\n",self.puidMapping
199
+ # exit()
200
+
201
+ def extract_signatures(self, doc, signature_type="ZIP"):
202
+ """
203
+ Given an XML container signature file, returns a dictionary of signatures.
204
+
205
+ The format of the dictionary is:
206
+
207
+ {
208
+ path_to_file_inside_zip: {puid: [signatures]}
209
+ }
210
+ """
211
+ root = doc.getroot()
212
+ format_mappings = root.find("FileFormatMappings")
213
+
214
+ def get_puid(doc, element_id):
215
+ return format_mappings.find('FileFormatMapping[@signatureId="{}"]'.format(element_id)).attrib["Puid"]
216
+
217
+ def format_signature_attributes(element):
218
+ return {
219
+ "path": element.findtext("Files/File/Path"),
220
+ "id": element.attrib["Id"],
221
+ "signature": self.convert_container_sequence(element.findtext("Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence/Sequence"))
222
+ }
223
+
224
+ elements = root.findall("ContainerSignatures/ContainerSignature[@ContainerType=\"{}\"]".format(signature_type))
225
+ signatures = {}
226
+ for el in elements:
227
+ if el.find("Files/File/BinarySignatures") is None:
228
+ continue
229
+
230
+ puid = get_puid(doc, el.attrib["Id"])
231
+ signature = format_signature_attributes(el)
232
+ path = signature["path"]
233
+ if path not in signatures:
234
+ signatures[path] = {}
235
+ if puid not in signatures[path]:
236
+ signatures[path][puid] = []
237
+ signatures[path][puid].append(format_signature_attributes(el))
238
+ return signatures
239
+
240
+ def match_container(self, signature_type, klass, file, signature_file):
241
+ puids = klass(file, self.extract_signatures(signature_file, signature_type=signature_type)).detect_formats()
242
+ results = []
243
+ for puid in puids:
244
+ format = self.puid_format_map[puid]
245
+ signature = format.findtext("name")
246
+ results.append((format, signature))
247
+ return results
182
248
 
183
249
  def load_fido_xml(self, file):
184
- """Load the fido format information from @param file.
185
- As a side-effect, set self.formats
186
- @return list of ElementTree.Element, one for each format.
250
+ """
251
+ Load the fido format information from @param file.
252
+ As a side-effect, set self.formats.
253
+ @return list of ElementTree.Element, one for each format.
187
254
  """
188
255
  tree = ET.parse(file)
189
- #print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
190
- #TODO: Handle empty regexes properly; perhaps remove from the format list
256
+ # print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
257
+ # TODO: Handle empty regexes properly; perhaps remove from the format list
191
258
  for element in tree.getroot().findall('./format'):
192
259
  puid = self.get_puid(element)
193
260
  # Handle over-writes in multiple file loads
194
- existing = self.puid_format_map.get(puid, False)
195
- if existing:
261
+ existing = self.puid_format_map.get(puid, False)
262
+ if existing:
196
263
  # Already have one, so replace old with new!
197
264
  self.formats[self.formats.index(existing)] = element
198
265
  else:
@@ -205,31 +272,33 @@ class Fido:
205
272
  # To delete a format: (1) remove from self.formats, (2) remove from puid_format_map, (3) remove from selt.puid_has_priority_over_map
206
273
  def get_signatures(self, format):
207
274
  return format.findall('signature')
208
-
275
+
209
276
  def has_priority_over(self, format, possibly_inferior):
210
277
  return self.get_puid(possibly_inferior)in self.puid_has_priority_over_map[self.get_puid(format)]
211
-
278
+
212
279
  def get_puid(self, format):
213
280
  return format.find('puid').text
214
-
281
+
215
282
  def get_patterns(self, signature):
216
283
  return signature.findall('pattern')
217
-
218
- def get_pos(self, pat):
284
+
285
+ def get_pos(self, pat):
219
286
  return pat.find('position').text
220
-
287
+
221
288
  def get_regex(self, pat):
222
- return pat.find('regex').text
223
-
289
+ # The regex is matching bytes from a file so regex must also be bytes
290
+ return pat.find('regex').text.encode('utf8')
291
+
224
292
  def get_extension(self, format):
225
293
  return format.find('extension').text
226
-
294
+
227
295
  def print_matches(self, fullname, matches, delta_t, matchtype=''):
228
- """The default match handler. Prints out information for each match in the list.
229
- @param fullname is name of the file being matched
230
- @param matches is a list of (format, signature)
231
- @param delta_t is the time taken for the match.
232
- @param matchtype is the type of match (signature, containersignature, extension, fail)
296
+ """
297
+ The default match handler. Prints out information for each match in the list.
298
+ @param fullname is name of the file being matched
299
+ @param matches is a list of (format, signature)
300
+ @param delta_t is the time taken for the match.
301
+ @param matchtype is the type of match (signature, containersignature, extension, fail)
233
302
  """
234
303
  class Info:
235
304
  pass
@@ -241,38 +310,60 @@ class Fido:
241
310
  obj.filesize = self.current_filesize
242
311
  obj.matchtype = matchtype
243
312
  if len(matches) == 0:
244
- sys.stdout.write(self.printnomatch % { "info.time" : obj.time, "info.filesize" : obj.filesize, "info.filename" : obj.filename, "info.count"
245
- : obj.count, "info.matchtype" : "fail" } )
246
- else:
247
- i = 0
248
- for (f, s) in matches:
249
- i += 1
250
- obj.group_index = i
251
- obj.puid = self.get_puid(f)
252
- obj.formatname = f.find('name').text
253
- obj.signaturename = s.find('name').text
254
- mime = f.find('mime')
255
- obj.mimetype = mime.text if mime != None else None
256
- version = f.find('version')
257
- obj.version = version.text if version != None else None
258
- alias = f.find('alias')
259
- obj.alias = alias.text if alias != None else None
260
- apple_uti = f.find('apple_uid')
261
- obj.apple_uti = apple_uti.text if apple_uti != None else None
262
- sys.stdout.write(self.printmatch % { "info.time" : obj.time, "info.puid" : obj.puid, "info.formatname" : obj.formatname, "info.signaturename" : obj.signaturename, "info.filesize" : obj.filesize, "info.filename" : obj.filename, "info.mimetype" : obj.mimetype, "info.matchtype" : obj.matchtype, "info.version" : obj.version, "info.alias" : obj.alias, "info.apple_uti" : obj.apple_uti, "info.group_size" : obj.group_size, "info.group_index" : obj.group_index, "info.count" : obj.count })
263
-
313
+ sys.stdout.write(self.printnomatch % {
314
+ "info.time": obj.time,
315
+ "info.filesize": obj.filesize,
316
+ "info.filename": obj.filename,
317
+ "info.count": obj.count,
318
+ "info.matchtype": "fail"
319
+ })
320
+ return
321
+ i = 0
322
+ for (f, sig_name) in matches:
323
+ i += 1
324
+ obj.group_index = i
325
+ obj.puid = self.get_puid(f)
326
+ obj.formatname = f.find('name').text
327
+ obj.signaturename = sig_name
328
+ mime = f.find('mime')
329
+ obj.mimetype = mime.text if mime is not None else None
330
+ version = f.find('version')
331
+ obj.version = version.text if version is not None else None
332
+ alias = f.find('alias')
333
+ obj.alias = alias.text if alias is not None else None
334
+ apple_uti = f.find('apple_uid')
335
+ obj.apple_uti = apple_uti.text if apple_uti is not None else None
336
+ sys.stdout.write(self.printmatch % {
337
+ "info.time": obj.time,
338
+ "info.puid": obj.puid,
339
+ "info.formatname": obj.formatname,
340
+ "info.signaturename": obj.signaturename,
341
+ "info.filesize": obj.filesize,
342
+ "info.filename": obj.filename,
343
+ "info.mimetype": obj.mimetype,
344
+ "info.matchtype": obj.matchtype,
345
+ "info.version": obj.version,
346
+ "info.alias": obj.alias,
347
+ "info.apple_uti": obj.apple_uti,
348
+ "info.group_size": obj.group_size,
349
+ "info.group_index": obj.group_index,
350
+ "info.count": obj.count
351
+ })
352
+
264
353
  def print_summary(self, secs):
265
- """Print summary information on the number of matches and time taken.
354
+ """
355
+ Print summary information on the number of matches and time taken.
266
356
  """
267
357
  count = self.current_count
268
358
  if not self.quiet:
269
359
  rate = (int(round(count / secs)) if secs != 0 else 9999)
270
- #print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
271
- sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
272
-
360
+ # print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
361
+ sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
362
+
273
363
  def identify_file(self, filename):
274
- """Identify the type of @param filename.
275
- Call self.handle_matches instead of returning a value.
364
+ """
365
+ Identify the type of @param filename.
366
+ Call self.handle_matches instead of returning a value.
276
367
  """
277
368
  self.current_file = filename
278
369
  self.matchtype = "signature"
@@ -282,9 +373,19 @@ class Fido:
282
373
  size = os.stat(filename)[6]
283
374
  self.current_filesize = size
284
375
  if self.current_filesize == 0:
285
- sys.stderr.write("FIDO: Zero byte file (empty): Path is: {0}\n".format(filename))
286
- bofbuffer, eofbuffer = self.get_buffers(f, size, seekable=True)
287
- matches = self.match_formats(bofbuffer, eofbuffer)
376
+ sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + filename + "\n")
377
+ bofbuffer, eofbuffer, _ = self.get_buffers(f, size, seekable=True)
378
+ matches = self.match_formats(bofbuffer, eofbuffer)
379
+ container_type = self.container_type(matches)
380
+ if container_type in ("zip", "ole"):
381
+ container_file = ET.parse(os.path.join(os.path.abspath(self.conf_dir), self.containersignature_file))
382
+ if container_type == "zip":
383
+ container_matches = self.match_container("ZIP", ZipPackage, filename, container_file)
384
+ else:
385
+ container_matches = self.match_container("OLE2", OlePackage, filename, container_file)
386
+ if len(container_matches) > 0:
387
+ self.handle_matches(filename, container_matches, time.clock() - t0, "container")
388
+ return
288
389
  # from here is also repeated in walk_zip
289
390
  # we should make this uniform in a next version!
290
391
  #
@@ -296,30 +397,37 @@ class Fido:
296
397
  elif len(matches) == 0 or self.current_filesize == 0:
297
398
  matches = self.match_extensions(filename)
298
399
  self.handle_matches(filename, matches, time.clock() - t0, "extension")
400
+ # only recurse into certain containers, like ZIP or TAR
401
+ container = self.container_type(matches)
299
402
  # till here matey!
300
- if self.zip:
301
- self.identify_contents(filename, type=self.container_type(matches))
403
+ if self.zip and self.can_recurse_into_container(container):
404
+ self.identify_contents(filename, type=container)
302
405
  except IOError:
303
- #print >> sys.stderr, "FIDO: Error in identify_file: Path is {0}".format(filename)
406
+ # print >> sys.stderr, "FIDO: Error in identify_file: Path is {0}".format(filename)
304
407
  sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
305
408
 
306
409
  def identify_contents(self, filename, fileobj=None, type=False):
307
- """Identify each item in a container (such as a zip or tar file). Call self.handle_matches on each item.
308
- @param fileobj could be a file, or a stream.
309
410
  """
310
- if type == False:
411
+ Identify each item in a container (such as a zip or tar file). Call
412
+ self.handle_matches on each item.
413
+ @param fileobj could be a file, or a stream.
414
+ """
415
+ if not type:
311
416
  return
312
417
  elif type == 'zip':
313
418
  self.walk_zip(filename, fileobj)
314
419
  elif type == 'tar':
315
420
  self.walk_tar(filename, fileobj)
316
- else: # TODO: ouch!
421
+ else: # TODO: ouch!
317
422
  raise RuntimeError("Unknown container type: " + repr(type))
318
-
423
+
319
424
  def identify_multi_object_stream(self, stream):
320
- """Does not work!
321
- Stream may contain one or more objects each with an HTTP style header that must include content-length.
322
- The headers consist of keyword:value pairs terminated by a newline. There must be a newline following the headers.
425
+ """
426
+ Does not work!
427
+ Stream may contain one or more objects each with an HTTP style header
428
+ that must include content-length. The headers consist of keyword:value
429
+ pairs terminated by a newline. There must be a newline following the
430
+ headers.
323
431
  """
324
432
  offset = 0
325
433
  while True:
@@ -337,10 +445,10 @@ class Fido:
337
445
  content_length = int(pair[1])
338
446
  if content_length == -1:
339
447
  return
340
- # Consume exactly content-length bytes
448
+ # Consume exactly content-length bytes
341
449
  self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
342
450
  self.current_filesize = content_length
343
- bofbuffer, eofbuffer = self.get_buffers(stream, content_length)
451
+ bofbuffer, eofbuffer, _ = self.get_buffers(stream, content_length)
344
452
  matches = self.match_formats(bofbuffer, eofbuffer)
345
453
  # MdR: this needs attention
346
454
  if len(matches) > 0:
@@ -348,11 +456,12 @@ class Fido:
348
456
  elif len(matches) == 0 or self.current_filesize == 0:
349
457
  matches = self.match_extensions(self.current_file)
350
458
  self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
351
-
459
+
352
460
  def identify_stream(self, stream, filename):
353
- """Identify the type of @param stream.
354
- Call self.handle_matches instead of returning a value.
355
- Does not close stream.
461
+ """
462
+ Identify the type of @param stream.
463
+ Call self.handle_matches instead of returning a value.
464
+ Does not close stream.
356
465
  """
357
466
  t0 = time.clock()
358
467
  bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
@@ -381,20 +490,40 @@ class Fido:
381
490
  if (os.name != "nt"):
382
491
  self.current_file = 'STDIN'
383
492
  self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
384
-
493
+
385
494
  def container_type(self, matches):
386
- """Determine if one of the @param matches is the format of a container that we can look inside of (e.g., zip, tar).
387
- @return False, zip, or tar.
388
495
  """
389
- for (format, unused) in matches:
390
- container = format.find('container')
391
- if container != None:
496
+ Determine if one of the @param matches is the format of a container
497
+ that we can look inside of (e.g., zip, tar).
498
+ @return False, zip, or tar.
499
+ """
500
+ for (format_, unused) in matches:
501
+ container = format_.find('container')
502
+ if container is not None:
392
503
  return container.text
504
+
505
+ # aside from checking <container> elements,
506
+ # check for fmt/111, which is OLE
507
+ puid = format_.find('puid')
508
+ if puid is not None and puid.text == 'fmt/111':
509
+ return 'ole'
393
510
  return False
394
-
511
+
512
+ def can_recurse_into_container(self, container_type):
513
+ """
514
+ Determine if the passed container type can:
515
+ a) be extracted, and
516
+ b) contain individual files which can be identified separately.
517
+
518
+ This function is useful for filtering out containers such as OLE,
519
+ which are usually most interesting as compound objects rather than
520
+ for their contents.
521
+ """
522
+ return container_type in ('zip', 'tar')
523
+
395
524
  def blocking_read(self, file, bytes_to_read):
396
525
  bytes_read = 0
397
- buffer = ''
526
+ buffer = b''
398
527
  while bytes_read < bytes_to_read:
399
528
  readbuffer = file.read(bytes_to_read - bytes_read)
400
529
  buffer += readbuffer
@@ -403,18 +532,19 @@ class Fido:
403
532
  if readbuffer == '':
404
533
  break
405
534
  return buffer
406
-
535
+
407
536
  def get_buffers(self, stream, length=None, seekable=False):
408
- """Return buffers from the beginning and end of stream and the number of bytes read
409
- if there may be more bytes in the stream.
410
-
411
- If length is None, return the length as found.
412
- If seekable is False, the steam does not support a seek operation.
413
537
  """
414
- bytes_to_read = self.bufsize if length == None else min(length, self.bufsize)
538
+ Return buffers from the beginning and end of stream and the number of
539
+ bytes read if there may be more bytes in the stream.
540
+
541
+ If length is None, return the length as found.
542
+ If seekable is False, the steam does not support a seek operation.
543
+ """
544
+ bytes_to_read = self.bufsize if length is None else min(length, self.bufsize)
415
545
  bofbuffer = self.blocking_read(stream, bytes_to_read)
416
546
  bytes_read = len(bofbuffer)
417
- if length == None:
547
+ if length is None:
418
548
  # A stream with unknown length; have to keep two buffers around
419
549
  prevbuffer = bofbuffer
420
550
  while True:
@@ -439,101 +569,86 @@ class Fido:
439
569
  stream.seek(length - self.bufsize)
440
570
  eofbuffer = self.blocking_read(stream, self.bufsize)
441
571
  else:
442
- # We have more to read and know how much.
572
+ # We have more to read and know how much.
443
573
  # n*bufsize + r = length
444
574
  (n, r) = divmod(bytes_unread, self.bufsize)
445
575
  # skip n-1*bufsize bytes
446
- for unused_i in xrange(1, n):
576
+ for unused_i in range(1, n):
447
577
  self.blocking_read(stream, self.bufsize)
448
578
  # skip r bytes
449
579
  self.blocking_read(stream, r)
450
580
  # and read the remaining bufsize bytes into the eofbuffer
451
581
  eofbuffer = self.blocking_read(stream, self.bufsize)
452
- return bofbuffer, eofbuffer
453
-
582
+ return bofbuffer, eofbuffer, bytes_to_read
583
+
454
584
  def walk_zip(self, filename, fileobj=None):
455
- """Identify the type of each item in the zip
456
- @param fileobj. If fileobj is not provided, open
457
- @param filename.
458
- Call self.handle_matches instead of returning a value.
459
585
  """
460
- # IN 2.7+: with zipfile.ZipFile((fileobj if fileobj != None else filename), 'r') as stream:
461
- import zipfile, tempfile
586
+ Identify the type of each item in the zip
587
+ @param fileobj. If fileobj is not provided, open.
588
+ @param filename.
589
+ Call self.handle_matches instead of returning a value.
590
+ """
462
591
  try:
463
- zipstream = None
464
- zipstream = zipfile.ZipFile((fileobj if fileobj != None else filename), 'r')
465
- for item in zipstream.infolist():
466
- if item.file_size == 0:
467
- continue #TODO: Find a better test for isdir
468
- t0 = time.clock()
469
- # with zipstream.open(item) as f:
470
- f = None
471
- try:
472
- f = zipstream.open(item)
473
- item_name = filename + '!' + item.filename
474
- self.current_file = item_name
475
- self.current_filesize = item.file_size
476
- if self.current_filesize == 0:
477
- sys.stderr.write("FIDO: Zero byte file (empty): Path is: {0}\n".format(item_name))
478
- bofbuffer, eofbuffer = self.get_buffers(f, item.file_size)
479
- finally:
480
- if f != None: f.close()
481
- matches = self.match_formats(bofbuffer, eofbuffer)
482
- if len(matches) > 0 and self.current_filesize > 0:
483
- self.handle_matches(item_name, matches, time.clock() - t0, "signature")
484
- elif len(matches) == 0 or self.current_filesize == 0:
485
- matches = self.match_extensions(item_name)
486
- self.handle_matches(item_name, matches, time.clock() - t0, "extension")
487
- if self.container_type(matches):
592
+ with zipfile.ZipFile((fileobj if fileobj else filename), 'r') as zipstream:
593
+ for item in zipstream.infolist():
594
+ if item.file_size == 0:
595
+ continue # TODO: Find a better test for isdir
596
+ t0 = time.clock()
597
+ with zipstream.open(item) as f:
598
+ item_name = filename + '!' + item.filename
599
+ self.current_file = item_name
600
+ self.current_filesize = item.file_size
601
+ if self.current_filesize == 0:
602
+ sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + item_name + "\n")
603
+ bofbuffer, eofbuffer, _ = self.get_buffers(f, item.file_size)
604
+ matches = self.match_formats(bofbuffer, eofbuffer)
605
+ if len(matches) > 0 and self.current_filesize > 0:
606
+ self.handle_matches(item_name, matches, time.clock() - t0, "signature")
607
+ elif len(matches) == 0 or self.current_filesize == 0:
608
+ matches = self.match_extensions(item_name)
609
+ self.handle_matches(item_name, matches, time.clock() - t0, "extension")
610
+ if self.container_type(matches):
488
611
  target = tempfile.SpooledTemporaryFile(prefix='Fido')
489
- #with zipstream.open(item) as source:
490
- try:
491
- source = zipstream.open(item)
612
+ with zipstream.open(item) as source:
492
613
  self.copy_stream(source, target)
493
- #target.seek(0)
614
+ # target.seek(0)
494
615
  self.identify_contents(item_name, target, self.container_type(matches))
495
- finally:
496
- source.close()
497
616
  except IOError:
498
617
  sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
499
618
  except zipfile.BadZipfile:
500
619
  sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
501
-
502
- finally:
503
- if zipstream != None: zipstream.close()
504
620
 
505
621
  def walk_tar(self, filename, fileobj):
506
- """Identify the type of each item in the tar
507
- @param fileobj. If fileobj is not provided, open
508
- @param filename.
509
- Call self.handle_matches instead of returning a value.
510
622
  """
511
- import tarfile
512
- tarstream = None
623
+ Identify the type of each item in the tar.
624
+ @param fileobj. If fileobj is not provided, open.
625
+ @param filename.
626
+ Call self.handle_matches instead of returning a value.
627
+ """
513
628
  try:
514
- tarstream = tarfile.TarFile(filename, fileobj=fileobj, mode='r')
515
- for item in tarstream.getmembers():
516
- if item.isfile():
629
+ with tarfile.TarFile(filename, fileobj=fileobj, mode='r') as tarstream:
630
+ for item in tarstream.getmembers():
631
+ if not item.isfile():
632
+ continue
517
633
  t0 = time.clock()
518
- f = tarstream.extractfile(item)
519
- tar_item_name = filename + '!' + item.name
520
- self.current_file = tar_item_name
521
- self.current_filesize = item.size
522
- bofbuffer, eofbuffer = self.get_buffers(f, item.size)
523
- matches = self.match_formats(bofbuffer, eofbuffer)
524
- self.handle_matches(tar_item_name, matches, time.clock() - t0)
525
- if self.container_type(matches):
526
- f.seek(0)
527
- self.identify_contents(tar_item_name, f, self.container_type(matches))
528
- f.close()
634
+ with closing(tarstream.extractfile(item)) as f:
635
+ tar_item_name = filename + '!' + item.name
636
+ self.current_file = tar_item_name
637
+ self.current_filesize = item.size
638
+ bofbuffer, eofbuffer, _ = self.get_buffers(f, item.size)
639
+ matches = self.match_formats(bofbuffer, eofbuffer)
640
+ self.handle_matches(tar_item_name, matches, time.clock() - t0)
641
+ if self.container_type(matches):
642
+ f.seek(0)
643
+ self.identify_contents(tar_item_name, f, self.container_type(matches))
529
644
  except tarfile.TarError:
530
- sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
531
- finally:
532
- if tarstream != None: tarstream.close()
645
+ sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
533
646
 
534
647
  def as_good_as_any(self, f1, match_list):
535
- """Return True if the proposed format is as good as any in the match_list.
536
- For example, if there is no format in the match_list that has priority over the proposed one"""
648
+ """
649
+ Return True if the proposed format is as good as any in the match_list.
650
+ For example, if there is no format in the match_list that has priority over the proposed one
651
+ """
537
652
  if match_list != []:
538
653
  f1_puid = self.get_puid(f1)
539
654
  for (f2, unused) in match_list:
@@ -542,9 +657,10 @@ class Fido:
542
657
  elif f1_puid in self.puid_has_priority_over_map[self.get_puid(f2)]:
543
658
  return False
544
659
  return True
545
-
660
+
546
661
  def buffered_read(self, file_pos, overlap):
547
- """Buffered read of data chunks
662
+ """
663
+ Buffered read of data chunks.
548
664
  """
549
665
  buf = ""
550
666
  if not overlap:
@@ -552,102 +668,24 @@ class Fido:
552
668
  else:
553
669
  bufsize = self.container_bufsize + self.overlap_range
554
670
  file_end = self.current_filesize
555
- file_handle = file(self.current_file, 'rb')
556
- file_handle.seek(file_pos)
557
- if file_end - file_pos < bufsize:
558
- file_read = file_end - file_pos
559
- else:
560
- file_read = self.bufsize
561
- buf = file_handle.read(file_read)
562
- return buf
563
-
564
- def read_container(self,parent_buffer,parent_result):
565
- """Header of compound containers can be further away than default 128 KB buffer
566
- especially with big files containing binary objects.
567
- This function reads containers in chunks of 512 KB (defaults['container_bufsize'])
568
- Each chunk is inspected with the PRONOM container sequences.
569
- Each chunk smuggles in a piece from the previous chunk to prevent
570
- cutting off patterns we are looking for in the middle.
571
- This method is somewhat slower than reading the complete file at once.
572
- This is to prevent Fido to potentially crash in the midst of scanning a very big file.
573
- NOTE (MdR): this piece of code is still a bit quirky
574
- as it does not yet takes byte positions into account which
575
- are available in the DROID container signature file
576
- """
577
- container_result = []
578
- nobuffer = False
579
- overlap = False
580
- self.overlap_range = 512 # bytes
581
- container_hit = False
582
- passes = 1
583
- container_buffer = ""
584
- # TODO: find better way to handle zip contents
585
- # for now: ugly hack, but working
586
- # this slows down because the zip is re-opened on each item
587
- # if "!" is in filename, it is a zip item
588
- # if "!" in self.current_file:
589
- # import zipfile, tempfile
590
- # zip, item = self.current_file.split("!")
591
- # zipitem = tempfile.SpooledTemporaryFile(prefix='Fido')
592
- #with zipstream.open(item) as source:
593
- # try:
594
- # source = zipstream.open(item)
595
- # self.copy_stream(source, target)
596
- # target.seek(0)
597
- # self.identify_contents(item_name, target, self.container_type(matches))
598
- # finally:
599
- # source.close()
600
- #exit()
601
- # in case argument 'nocontainer' is set
602
- # read default bofbuffer
603
- if self.nocontainer or self.current_filesize <= self.bufsize or self.current_file == "STDIN":
604
- passes = 1
605
- nobuffer = True
606
- else:
607
- passes = int(float(self.current_filesize / self.container_bufsize) + 1)
608
- pos = 0
609
- for i in xrange(passes):
610
- if nobuffer is True:
611
- container_buffer = parent_buffer
671
+ with open(self.current_file, 'rb') as file_handle:
672
+ file_handle.seek(file_pos)
673
+ if file_end - file_pos < bufsize:
674
+ file_read = file_end - file_pos
612
675
  else:
613
- if i == 0:
614
- pos = 0
615
- else:
616
- pos = ((self.container_bufsize * i) - self.overlap_range)
617
- overlap = True
618
- container_buffer = self.buffered_read(pos, overlap)
619
- for (container_id,container_regexes) in self.sequenceSignature.iteritems():
620
- # set hitcounter in case a container entry
621
- # has more than one regex
622
- hitcounter = 0
623
- if len(container_regexes) > 0:
624
- for container_regex in container_regexes:
625
- if re.search(container_regex, container_buffer):
626
- hitcounter += 1
627
- # if the hitcounter matches the number of regexes
628
- # then it must be a positive hit, else continue
629
- # to match the rest of the sequences
630
- if hitcounter < len(container_regexes):
631
- continue
632
- self.matchtype = "container"
633
- for container_puid in self.puidMapping[container_id]:
634
- for container_format in self.formats:
635
- if container_format.find('puid').text == container_puid:
636
- if self.as_good_as_any(container_format, parent_result):
637
- for container_sig in self.get_signatures(container_format):
638
- container_result.append((container_format, container_sig))
639
- break
640
- return container_result
676
+ file_read = self.bufsize
677
+ buf = file_handle.read(file_read)
678
+ return buf
641
679
 
642
680
  def match_formats(self, bofbuffer, eofbuffer):
643
- """Apply the patterns for formats to the supplied buffers.
644
- @return a match list of (format, signature) tuples.
645
- The list has inferior matches removed.
681
+ """
682
+ Apply the patterns for formats to the supplied buffers.
683
+ @return a match list of (format, signature) tuples.
684
+ The list has inferior matches removed.
646
685
  """
647
686
  self.current_count += 1
648
- #t0 = time.clock()
687
+ # t0 = time.clock()
649
688
  result = []
650
- container_result = []
651
689
  for format in self.formats:
652
690
  try:
653
691
  self.current_format = format
@@ -659,7 +697,7 @@ class Fido:
659
697
  self.current_pat = pat
660
698
  pos = self.get_pos(pat)
661
699
  regex = self.get_regex(pat)
662
- #print 'trying ', regex
700
+ # print 'trying ', regex
663
701
  if pos == 'BOF':
664
702
  if not re.match(regex, bofbuffer):
665
703
  success = False
@@ -670,60 +708,55 @@ class Fido:
670
708
  break
671
709
  elif pos == 'VAR':
672
710
  if not re.search(regex, bofbuffer):
673
- success = False
711
+ success = False
674
712
  break
675
713
  elif pos == 'IFB':
676
714
  if not re.search(regex, bofbuffer):
677
- success = False
715
+ success = False
678
716
  break
679
717
  if success:
680
- result.append((format, sig))
681
- # check if file needs to be parsed with container signature
682
- # we skip files with extension "zip" (x-fmt/263)
683
- ext = os.path.splitext(self.current_file)[1].lower().lstrip(".")
684
- if format.find('puid').text in self.puidTriggers and ext != "zip":
685
- container_result = self.read_container(bofbuffer,result)
686
- if len(container_result) != 0:
687
- for (k,v) in container_result:
688
- result.append((k,v))
689
- break
718
+ result.append((format, sig.findtext("name")))
690
719
  except Exception as e:
691
- sys.stderr.write(str(e)+"\n")
720
+ sys.stderr.write(str(e) + "\n")
692
721
  continue
693
722
  # TODO: MdR: needs some <3
694
- #print "Unexpected error:", sys.exc_info()[0], e
695
- #sys.stdout.write('***', self.get_puid(format), regex)
696
-
723
+ # print "Unexpected error:", sys.exc_info()[0], e
724
+ # sys.stdout.write('***', self.get_puid(format), regex)
725
+
697
726
  # t1 = time.clock()
698
727
  # if t1 - t0 > 0.02:
699
728
  # print >> sys.stderr, "FIDO: Slow ID", self.current_file
700
729
  result = [match for match in result if self.as_good_as_any(match[0], result)]
701
- result = list(set(result)) # remove duplicate results, this is due to ??? in self.read_container(), needs fix
702
730
  return result
703
-
731
+
704
732
  def match_extensions(self, filename):
705
- "Return the list of (format, self.externalsig) for every format whose extension matches the filename."
733
+ """
734
+ Return the list of (format, self.externalsig) for every format whose extension matches the filename.
735
+ """
706
736
  myext = os.path.splitext(filename)[1].lower().lstrip(".")
707
737
  result = []
708
- if len(myext) > 0:
709
- for element in self.formats:
710
- if element.findall('extension') != None:
711
- for format in element.findall('extension'):
712
- if myext == format.text:
713
- result.append((element, self.externalsig))
714
- break
738
+ if not myext:
739
+ return result
740
+ for element in self.formats:
741
+ for format_ in element.findall('extension'):
742
+ if myext == format_.text:
743
+ result.append((element, self.externalsig.findtext("name")))
744
+ break
715
745
  result = [match for match in result if self.as_good_as_any(match[0], result)]
716
746
  return result
717
-
747
+
718
748
  def copy_stream(self, source, target):
719
749
  while True:
720
750
  buf = source.read(self.bufsize)
721
751
  if len(buf) == 0:
722
752
  break
723
753
  target.write(buf)
724
-
754
+
755
+
725
756
  def list_files(roots, recurse=False):
726
- "Return the files one at a time. Roots could be a fileobj or a list."
757
+ """
758
+ Return the files one at a time. Roots could be a fileobj or a list.
759
+ """
727
760
  for root in roots:
728
761
  root = (root if root[-1] != '\n' else root[:-1])
729
762
  root = os.path.normpath(root)
@@ -733,17 +766,14 @@ def list_files(roots, recurse=False):
733
766
  for path, unused, files in os.walk(root):
734
767
  for f in files:
735
768
  yield os.path.join(path, f)
736
- if recurse == False:
769
+ if not recurse:
737
770
  break
738
-
739
- def main(arglist=None):
740
- # The argparse package was introduced in 2.7
741
- t0 = time.clock()
742
- from argparselocal import ArgumentParser, RawTextHelpFormatter
743
- if arglist == None:
744
- arglist = sys.argv[1:]
745
- if len(arglist) == False:
746
- arglist.append("-h")
771
+
772
+
773
+ def main(args=None):
774
+ if not args:
775
+ args = sys.argv[1:]
776
+
747
777
  parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
748
778
  parser.add_argument('-v', default=False, action='store_true', help='show version information')
749
779
  parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
@@ -751,87 +781,86 @@ def main(arglist=None):
751
781
  parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
752
782
  parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
753
783
  parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')
784
+
754
785
  group = parser.add_mutually_exclusive_group()
755
786
  group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
756
787
  group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')
788
+
757
789
  parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
758
790
  parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
759
791
  parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
760
792
  parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
761
793
  parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
762
- parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['bufsize'])+' bytes)')
763
- parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['container_bufsize'])+' bytes)')
764
-
794
+ parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['bufsize']) + ' bytes)')
795
+ parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['container_bufsize']) + ' bytes)')
765
796
  parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
766
- parser.add_argument('-confdir', default=None, help='configuration directory to load_fido_xml, for example, the format specifications from.')
767
-
768
- # what is this doing here only once?
769
- #mydir = os.path.abspath(os.path.dirname(__file__))
770
-
771
- # PROCESS ARGUMENTS
772
- args = parser.parse_args(arglist)
773
- # print args
774
- # sys.exit()
775
- # process confdir
776
- # load versions.xml
777
- # and stick it in defaults
778
- if args.confdir:
779
- versionsFile = os.path.join(os.path.abspath(args.confdir), defaults['versions_file'])
780
- else:
781
- versionsFile = os.path.join(os.path.abspath(defaults['conf_dir']), defaults['versions_file'])
782
- try:
783
- versions = VET.parse(versionsFile)
784
- except Exception, e:
785
- sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
786
- sys.exit()
787
- defaults['xml_pronomSignature'] = versions.find("pronomSignature").text
788
- # defaults['xml_pronomContainerSignature'] = versions.find("pronomContainerSignature").text
789
- defaults['containersignature_file'] = versions.find("pronomContainerSignature").text
790
- defaults['xml_fidoExtensionSignature'] = versions.find("fidoExtensionSignature").text
791
- defaults['format_files'] = []
792
- defaults['format_files'].append(defaults['xml_pronomSignature'])
797
+ parser.add_argument('-confdir', default=CONFIG_DIR, help='configuration directory to load_fido_xml, for example, the format specifications from.')
798
+
799
+ if len(sys.argv) == 1:
800
+ parser.print_help()
801
+ sys.exit(1)
802
+ args = parser.parse_args(args)
803
+
804
+ t0 = time.clock()
805
+
806
+ versions = get_local_pronom_versions(args.confdir)
807
+
808
+ defaults['xml_pronomSignature'] = versions.pronom_signature
809
+ defaults['containersignature_file'] = versions.pronom_container_signature
810
+ defaults['xml_fidoExtensionSignature'] = versions.fido_extension_signature
811
+ defaults['format_files'] = [defaults['xml_pronomSignature']]
812
+
793
813
  if args.pronom_only:
794
- versionHeader = "FIDO v{0} ({1}, {2})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'])
814
+ versionHeader = "FIDO v{0} ({1}, {2})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'])
795
815
  else:
796
- versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'],defaults['xml_fidoExtensionSignature'])
816
+ versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'], defaults['xml_fidoExtensionSignature'])
797
817
  defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])
798
-
799
- if args.v :
818
+
819
+ if args.v:
800
820
  sys.stdout.write(versionHeader)
801
821
  sys.exit(0)
802
- if args.matchprintf != None:
822
+
823
+ if args.matchprintf:
803
824
  args.matchprintf = args.matchprintf.decode('string_escape')
804
- if args.nomatchprintf != None:
825
+ if args.nomatchprintf:
805
826
  args.nomatchprintf = args.nomatchprintf.decode('string_escape')
806
- fido = Fido(quiet=args.q, bufsize=args.bufsize, container_bufsize=args.container_bufsize,
807
- printmatch=args.matchprintf, printnomatch=args.nomatchprintf, zip=args.zip, nocontainer = args.nocontainer, conf_dir=args.confdir)
808
-
809
- #TODO: Allow conf options to be dis-included
827
+
828
+ fido = Fido(
829
+ quiet=args.q,
830
+ bufsize=args.bufsize,
831
+ container_bufsize=args.container_bufsize,
832
+ printmatch=args.matchprintf,
833
+ printnomatch=args.nomatchprintf,
834
+ zip=args.zip,
835
+ nocontainer=args.nocontainer,
836
+ conf_dir=args.confdir)
837
+
838
+ # TODO: Allow conf options to be dis-included
810
839
  if args.loadformats:
811
840
  for file in args.loadformats.split(','):
812
841
  fido.load_fido_xml(file)
813
-
814
- #TODO: remove from maps
842
+
843
+ # TODO: remove from maps
815
844
  if args.useformats:
816
845
  args.useformats = args.useformats.split(',')
817
846
  fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
818
847
  elif args.nouseformats:
819
848
  args.nouseformats = args.nouseformats.split(',')
820
849
  fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]
821
-
850
+
822
851
  # Set up to use stdin, or open input files:
823
852
  if args.input == '-':
824
853
  args.files = sys.stdin
825
854
  elif args.input:
826
855
  args.files = open(args.input, 'r')
827
-
856
+
828
857
  # RUN
829
858
  try:
830
859
  if not args.q:
831
860
  sys.stderr.write(versionHeader)
832
861
  sys.stderr.flush()
833
862
  if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
834
- if fido.zip == True:
863
+ if fido.zip:
835
864
  raise RuntimeError("Multiple content read from stdin not yet supported.")
836
865
  sys.exit(1)
837
866
  fido.identify_multi_object_stream(sys.stdin)
@@ -844,11 +873,12 @@ def main(arglist=None):
844
873
  msg = "FIDO: Interrupt while identifying file {0}"
845
874
  sys.stderr.write(msg.format(fido.current_file))
846
875
  sys.exit(1)
847
-
876
+
848
877
  if not args.q:
849
878
  sys.stdout.flush()
850
879
  fido.print_summary(time.clock() - t0)
851
880
  sys.stderr.flush()
852
881
 
882
+
853
883
  if __name__ == '__main__':
854
884
  main()