libis-format 0.9.32 → 0.9.33

Sign up to get free protection for your applications and to get access to all the features.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/data/types.yml +30 -16
  3. data/lib/libis/format/config.rb +7 -18
  4. data/lib/libis/format/converter/image_converter.rb +6 -0
  5. data/lib/libis/format/droid.rb +82 -25
  6. data/lib/libis/format/extension_identification.rb +55 -0
  7. data/lib/libis/format/fido.rb +57 -72
  8. data/lib/libis/format/file_tool.rb +76 -0
  9. data/lib/libis/format/identification_tool.rb +174 -0
  10. data/lib/libis/format/identifier.rb +129 -117
  11. data/lib/libis/format/type_database.rb +36 -5
  12. data/lib/libis/format/version.rb +1 -1
  13. data/lib/libis/format.rb +3 -0
  14. data/libis-format.gemspec +2 -1
  15. data/spec/converter_spec.rb +6 -4
  16. data/spec/identifier_spec.rb +125 -34
  17. metadata +21 -126
  18. data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
  19. data/tools/droid/container-signature-20170330.xml +0 -3584
  20. data/tools/droid/droid-command-line-6.3.jar +0 -0
  21. data/tools/droid/droid.bat +0 -152
  22. data/tools/droid/droid.sh +0 -152
  23. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  24. data/tools/droid/lib/activation-1.1.jar +0 -0
  25. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  26. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  27. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  28. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  29. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  30. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  31. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  32. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  33. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  34. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  35. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  36. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  37. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  38. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  39. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  40. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  41. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  42. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  43. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  44. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  45. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  46. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  47. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  48. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  49. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  50. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  51. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  52. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  53. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  54. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  55. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  56. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  57. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  58. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  59. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  60. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  61. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  62. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  63. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  64. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  65. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  66. data/tools/droid/lib/droid-help-6.3.jar +0 -0
  67. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  68. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  69. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  70. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  71. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  72. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  73. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  74. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  75. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  76. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  77. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  78. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  79. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  80. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  81. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  82. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  83. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  84. data/tools/droid/lib/jta-1.1.jar +0 -0
  85. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  86. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  87. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  88. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  89. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  90. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  91. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  92. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  93. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  94. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  95. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  96. data/tools/droid/lib/poi-3.13.jar +0 -0
  97. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  98. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  99. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  100. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  101. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  102. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  103. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  104. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  105. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  106. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  107. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  108. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  109. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  110. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  111. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  112. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  113. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  114. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  115. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  116. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  117. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  118. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  119. data/tools/droid/lib/xz-1.0.jar +0 -0
  120. data/tools/fido/__init__.py +0 -50
  121. data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
  122. data/tools/fido/conf/container-signature-20170330.xml +0 -3584
  123. data/tools/fido/conf/dc.xsd +0 -119
  124. data/tools/fido/conf/dcmitype.xsd +0 -53
  125. data/tools/fido/conf/dcterms.xsd +0 -383
  126. data/tools/fido/conf/fido-formats.xsd +0 -173
  127. data/tools/fido/conf/format_extension_template.xml +0 -105
  128. data/tools/fido/conf/format_extensions.xml +0 -484
  129. data/tools/fido/conf/formats-v90.xml +0 -48877
  130. data/tools/fido/conf/pronom-xml-v90.zip +0 -0
  131. data/tools/fido/conf/versions.xml +0 -8
  132. data/tools/fido/fido.bat +0 -4
  133. data/tools/fido/fido.py +0 -884
  134. data/tools/fido/fido.sh +0 -5
  135. data/tools/fido/package.py +0 -96
  136. data/tools/fido/prepare.py +0 -645
  137. data/tools/fido/pronomutils.py +0 -200
  138. data/tools/fido/toxml.py +0 -60
  139. data/tools/fido/update_signatures.py +0 -183
data/tools/fido/fido.py DELETED
@@ -1,884 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- Format Identification for Digital Objects (FIDO).
6
-
7
- FIDO is a command-line tool to identify the file formats of digital objects.
8
- It is designed for simple integration into automated work-flows.
9
- """
10
-
11
- from __future__ import absolute_import
12
-
13
- from argparse import ArgumentParser, RawTextHelpFormatter
14
- from contextlib import closing
15
- import os
16
- import re
17
- import sys
18
- import tarfile
19
- import tempfile
20
- import time
21
- from xml.etree import cElementTree as ET
22
- from xml.etree import ElementTree as CET
23
- import zipfile
24
-
25
- from six.moves import range
26
-
27
- from . import __version__, CONFIG_DIR
28
- from .package import OlePackage, ZipPackage
29
- from .pronomutils import get_local_pronom_versions
30
-
31
-
32
# Module-wide fallback settings. Fido.__init__ uses an entry from this table
# whenever the corresponding constructor argument is left as None.
defaults = {
    'bufsize': 128 * 1024,  # (bytes)
    # Assigned to re._MAXCACHE by Fido.__init__; this is a number of cached
    # patterns, not a byte count.
    'regexcachesize': 2084,
    # %-style templates written to stdout by Fido.print_matches.
    'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
    'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
    # File names resolved against the configuration directory. The bundled
    # conf/ directory ships the v90 PRONOM format file, so the default must
    # name that file or a Fido() built from defaults cannot load it.
    'format_files': [
        'formats-v90.xml',
        'format_extensions.xml'
    ],
    'containersignature_file': 'container-signature-20170330.xml',
    'container_bufsize': 512 * 1024,  # (bytes)
    'description': """Format Identification for Digital Objects (fido).
FIDO is a command-line tool to identify the file formats of digital objects.
It is designed for simple integration into automated work-flows.""",
    'epilog': """
Open Planets Foundation (http://www.openplanetsfoundation.org)
See License.txt for license information.
Download from: https://github.com/openplanets/fido/releases
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
Author: Adam Farquhar (BL), 2010
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
FIDO uses the UK National Archives (TNA) PRONOM File Format
and Container descriptions.
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/""",
}
57
-
58
-
59
- class Fido:
60
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=CONFIG_DIR, format_files=None, containersignature_file=None):
    """
    Configure the identifier and load all signature files.

    Every argument left as None falls back to the matching entry of the
    module-level ``defaults`` table.

    @param quiet suppresses the summary written by print_summary.
    @param bufsize is the size in bytes of the begin/end-of-file buffers.
    @param container_bufsize is stored on the instance as given (or defaulted).
    @param printnomatch, printmatch are the %-style output templates.
    @param zip enables recursion into containers in identify_file.
    @param nocontainer is stored on the instance as given.
    @param handle_matches is the callback receiving results; defaults to
           self.print_matches.
    @param conf_dir is the directory holding the signature files.
    @param format_files is the list of format XML file names in conf_dir.
    @param containersignature_file is the container signature file name in
           conf_dir.
    """
    self.quiet = quiet
    self.bufsize = defaults['bufsize'] if bufsize is None else bufsize
    self.container_bufsize = defaults['container_bufsize'] if container_bufsize is None else container_bufsize
    self.printmatch = defaults['printmatch'] if printmatch is None else printmatch
    self.printnomatch = defaults['printnomatch'] if printnomatch is None else printnomatch
    self.handle_matches = self.print_matches if handle_matches is None else handle_matches
    self.zip = zip
    self.nocontainer = nocontainer
    self.conf_dir = conf_dir
    self.format_files = defaults['format_files'] if format_files is None else format_files
    # Honour the caller's choice; previously the argument was silently
    # discarded in favour of the default file name.
    self.containersignature_file = defaults['containersignature_file'] if containersignature_file is None else containersignature_file
    self.formats = []
    self.puid_format_map = {}
    self.puid_has_priority_over_map = {}
    # load signatures
    conf_path = os.path.abspath(self.conf_dir)
    for xml_file in self.format_files:
        self.load_fido_xml(os.path.join(conf_path, xml_file))
    self.load_container_signature(os.path.join(conf_path, self.containersignature_file))
    self.current_file = ''
    self.current_filesize = 0
    self.current_format = None
    self.current_sig = None
    self.current_pat = None
    self.current_count = 0  # Count of calls to match_formats
    # Overrides the regex module's private pattern-cache limit.
    re._MAXCACHE = defaults['regexcachesize']
    self.externalsig = ET.XML('<signature><name>External</name></signature>')
88
-
89
# Characters that escape() copies into a pattern unchanged.
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
# Characters that _escape_char() prefixes with a backslash.
_special = '$()*+.?![]^\\{|}'  # Before: '$*+.?![]^\\{|}'
# Digit table used by _escape_char() to spell out \xNN escapes.
_hex = '0123456789abcdef'
92
-
93
- def _escape_char(self, c):
94
- if c in '\n':
95
- return '\\n'
96
- elif c == '\r':
97
- return '\\r'
98
- elif c in self._special:
99
- return '\\' + c
100
- else:
101
- (high, low) = divmod(ord(c), 16)
102
- return '\\x' + self._hex[high] + self._hex[low]
103
-
104
def escape(self, string):
    """
    Return @param string with every character outside ``_ordinary``
    replaced by the result of _escape_char; ordinary characters are
    copied through untouched.
    """
    passthrough = self._ordinary
    pieces = []
    for ch in string:
        pieces.append(ch if ch in passthrough else self._escape_char(ch))
    return ''.join(pieces)
111
-
112
def convert_container_sequence(self, sig):
    """
    Translate a PRONOM container sequence @param sig into a regular
    expression and return it as bytes (the pattern is applied to raw file
    content, so it has to be bytes as well).

    Outside quotes and brackets, characters are consumed in pairs and
    emitted as ``\\xNN``; text between single quotes is passed through
    self.escape; a ``[...]`` group becomes a parenthesised alternation.
    """
    parts = [b'(?s)']
    in_quote = False      # inside '...' at the top level
    half_byte = False     # the first digit of a \xNN pair has been emitted
    in_range = False      # inside [...]
    range_quote = False   # inside '...' within [...]
    for ch in sig:
        if not in_quote and not in_range:
            if ch == "'":
                in_quote = True
            elif ch == " ":
                pass
            elif ch == "[":
                parts.append(b"(")
                in_range = True
            elif not half_byte:
                parts.append(b"\\x" + ch.lower().encode('utf8'))
                half_byte = True
            else:
                parts.append(ch.lower().encode('utf8'))
                half_byte = False
        elif in_quote:
            if ch == "'" and not in_range:
                in_quote = False
            else:
                parts.append(self.escape(ch).encode('utf8'))
        else:
            # Only reachable while inside a [...] group.
            is_payload = ch != "-" and ch != "'"
            if ch == "]":
                parts.append(b")")
                in_range = False
            elif is_payload and range_quote:
                parts.append(self.escape(ch).encode('utf8'))
            elif is_payload and ch != " " and ch != ":" and not range_quote and not half_byte:
                parts.append(b"\\x" + ch.lower().encode('utf8'))
                half_byte = True
            elif is_payload and ch != " " and not range_quote and half_byte:
                parts.append(ch.lower().encode('utf8'))
                half_byte = False
            elif ch == "-" or ch == " ":
                parts.append(b"|")
            elif ch == "'":
                range_quote = not range_quote
    return b''.join(parts)
175
-
176
def load_container_signature(self, containersignature_file):
    """
    Parse the PRONOM container-signature file @param containersignature_file.

    As a side-effect, set self.sequenceSignature (container signature Id
    -> list of converted byte-sequence regexes) and self.puidMapping
    (signatureId -> list of PUIDs).
    """
    tree = CET.parse(containersignature_file)
    sub_sequence_path = 'Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence'
    # container signature Id -> regexes, one per SubSequence (taken from
    # the text of the SubSequence's first child)
    self.sequenceSignature = {}
    for container_signature in tree.getroot().findall('ContainerSignatures/ContainerSignature'):
        self.sequenceSignature[container_signature.get('Id')] = [
            self.convert_container_sequence(sub_sequence[0].text)
            for sub_sequence in container_signature.findall(sub_sequence_path)
        ]
    # signatureId -> PUIDs, in document order
    self.puidMapping = {}
    for mapping in tree.find('FileFormatMappings').findall('FileFormatMapping'):
        self.puidMapping.setdefault(mapping.get('signatureId'), []).append(mapping.get('Puid'))
200
-
201
def extract_signatures(self, doc, signature_type="ZIP"):
    """
    Given a parsed XML container signature file, returns a dictionary of
    signatures for containers of @param signature_type.

    The format of the dictionary is:

    {
        path_to_file_inside_container: {puid: [signatures]}
    }

    where each signature is a dict with the keys "path", "id" and
    "signature" (the byte sequence converted to a regex). Container
    signatures without a BinarySignatures element are skipped.
    """
    root = doc.getroot()
    format_mappings = root.find("FileFormatMappings")
    sequence_path = "Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence/Sequence"

    def get_puid(element_id):
        # First mapping whose signatureId matches the container signature.
        return format_mappings.find('FileFormatMapping[@signatureId="{}"]'.format(element_id)).attrib["Puid"]

    signatures = {}
    for el in root.findall("ContainerSignatures/ContainerSignature[@ContainerType=\"{}\"]".format(signature_type)):
        if el.find("Files/File/BinarySignatures") is None:
            continue

        puid = get_puid(el.attrib["Id"])
        # Build the record once and reuse it; converting the byte sequence
        # a second time only repeats work already done.
        signature = {
            "path": el.findtext("Files/File/Path"),
            "id": el.attrib["Id"],
            "signature": self.convert_container_sequence(el.findtext(sequence_path)),
        }
        signatures.setdefault(signature["path"], {}).setdefault(puid, []).append(signature)
    return signatures
239
-
240
def match_container(self, signature_type, klass, file, signature_file):
    """
    Run the container detector @param klass over @param file using the
    signatures of @param signature_type extracted from @param signature_file.
    @return list of (format element, format name), one per detected PUID.
    """
    signatures = self.extract_signatures(signature_file, signature_type=signature_type)
    detected_puids = klass(file, signatures).detect_formats()
    matched_formats = (self.puid_format_map[puid] for puid in detected_puids)
    return [(fmt, fmt.findtext("name")) for fmt in matched_formats]
248
-
249
def load_fido_xml(self, file):
    """
    Load the fido format information from @param file.
    As a side-effect, update self.formats, self.puid_format_map and
    self.puid_has_priority_over_map. A format whose PUID was loaded from
    an earlier file is replaced in place by the new definition.
    @return list of ElementTree.Element, one for each format.
    """
    tree = ET.parse(file)
    # TODO: Handle empty regexes properly; perhaps remove from the format list
    for element in tree.getroot().findall('./format'):
        puid = self.get_puid(element)
        # Handle over-writes in multiple file loads. Compare against None
        # explicitly: the truth value of an Element depends on whether it
        # has children, not on whether the lookup succeeded.
        existing = self.puid_format_map.get(puid)
        if existing is not None:
            # Already have one, so replace old with new!
            self.formats[self.formats.index(existing)] = element
        else:
            self.formats.append(element)
        self.puid_format_map[puid] = element
        # Build some structures to speed things up
        self.puid_has_priority_over_map[puid] = frozenset(
            inferior.text for inferior in element.findall('has_priority_over')
        )
    return self.formats
271
-
272
- # To delete a format: (1) remove from self.formats, (2) remove from puid_format_map, (3) remove from self.puid_has_priority_over_map
273
def get_signatures(self, format):
    """Return the list of direct 'signature' child elements of @param format."""
    signature_elements = format.findall('signature')
    return signature_elements
275
-
276
def has_priority_over(self, format, possibly_inferior):
    """
    Return True if the PUID of @param possibly_inferior is listed in the
    has-priority-over set recorded for @param format.
    """
    inferior_puid = self.get_puid(possibly_inferior)
    outranked = self.puid_has_priority_over_map[self.get_puid(format)]
    return inferior_puid in outranked
278
-
279
def get_puid(self, format):
    """Return the text of the 'puid' child element of @param format."""
    puid_element = format.find('puid')
    return puid_element.text
281
-
282
def get_patterns(self, signature):
    """Return the list of direct 'pattern' child elements of @param signature."""
    pattern_elements = signature.findall('pattern')
    return pattern_elements
284
-
285
def get_pos(self, pat):
    """Return the text of the 'position' child element of @param pat."""
    position_element = pat.find('position')
    return position_element.text
287
-
288
def get_regex(self, pat):
    """
    Return the text of the 'regex' child element of @param pat, encoded
    as UTF-8 bytes (the regex is matched against bytes read from a file).
    """
    regex_text = pat.find('regex').text
    return regex_text.encode('utf8')
291
-
292
def get_extension(self, format):
    """Return the text of the first 'extension' child element of @param format."""
    extension_element = format.find('extension')
    return extension_element.text
294
-
295
def print_matches(self, fullname, matches, delta_t, matchtype=''):
    """
    The default match handler. Writes one line per match to stdout using
    self.printmatch, or a single self.printnomatch line (with match type
    "fail") when @param matches is empty.
    @param fullname is name of the file being matched
    @param matches is a list of (format, signature)
    @param delta_t is the time taken for the match, reported as whole milliseconds.
    @param matchtype is the type of match (signature, containersignature, extension, fail)
    """
    def child_text(element, tag):
        # Text of an optional child element, or None when it is absent.
        node = element.find(tag)
        return node.text if node is not None else None

    count = self.current_count
    group_size = len(matches)
    elapsed_ms = int(delta_t * 1000)
    filesize = self.current_filesize
    if group_size == 0:
        sys.stdout.write(self.printnomatch % {
            "info.time": elapsed_ms,
            "info.filesize": filesize,
            "info.filename": fullname,
            "info.count": count,
            "info.matchtype": "fail"
        })
        return
    for group_index, (fmt, sig_name) in enumerate(matches, 1):
        puid = self.get_puid(fmt)
        formatname = fmt.find('name').text
        mimetype = child_text(fmt, 'mime')
        version = child_text(fmt, 'version')
        alias = child_text(fmt, 'alias')
        apple_uti = child_text(fmt, 'apple_uid')
        sys.stdout.write(self.printmatch % {
            "info.time": elapsed_ms,
            "info.puid": puid,
            "info.formatname": formatname,
            "info.signaturename": sig_name,
            "info.filesize": filesize,
            "info.filename": fullname,
            "info.mimetype": mimetype,
            "info.matchtype": matchtype,
            "info.version": version,
            "info.alias": alias,
            "info.apple_uti": apple_uti,
            "info.group_size": group_size,
            "info.group_index": group_index,
            "info.count": count
        })
352
-
353
def print_summary(self, secs):
    """
    Write a one-line summary to stderr (file count, elapsed time in msec
    and files per second) unless self.quiet is set. When @param secs is 0
    the rate is reported as 9999.
    """
    count = self.current_count
    if self.quiet:
        return
    if secs != 0:
        rate = int(round(count / secs))
    else:
        rate = 9999
    summary = 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate)
    sys.stderr.write(summary)
362
-
363
def identify_file(self, filename):
    """
    Identify the type of @param filename.
    Call self.handle_matches instead of returning a value.

    Signature matches are tried first; if they point at a zip or ole
    container, the container signatures are consulted and reported on
    their own when any match. Empty files and files without a signature
    match are identified by extension. An IOError is reported on stderr.
    """
    # time.clock() no longer exists on Python 3.8+; prefer perf_counter
    # and fall back to clock on interpreters that predate it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    self.current_file = filename
    self.matchtype = "signature"
    try:
        t0 = timer()
        # The handle is only needed to fill the buffers, so release it as
        # soon as they have been read instead of leaking it.
        with open(filename, 'rb') as f:
            size = os.stat(filename)[6]
            self.current_filesize = size
            if self.current_filesize == 0:
                sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + filename + "\n")
            bofbuffer, eofbuffer, _ = self.get_buffers(f, size, seekable=True)
        matches = self.match_formats(bofbuffer, eofbuffer)
        container_type = self.container_type(matches)
        if container_type in ("zip", "ole"):
            container_file = ET.parse(os.path.join(os.path.abspath(self.conf_dir), self.containersignature_file))
            if container_type == "zip":
                container_matches = self.match_container("ZIP", ZipPackage, filename, container_file)
            else:
                container_matches = self.match_container("OLE2", OlePackage, filename, container_file)
            if len(container_matches) > 0:
                self.handle_matches(filename, container_matches, timer() - t0, "container")
                return
        # from here is also repeated in walk_zip
        # we should make this uniform in a next version!
        #
        # filesize is made conditional because files with 0 bytes
        # are falsely characterised being 'rtf' (due to wacky sig)
        # in these cases we try to match the extension instead
        if len(matches) > 0 and self.current_filesize > 0:
            self.handle_matches(filename, matches, timer() - t0, self.matchtype)
        else:
            matches = self.match_extensions(filename)
            self.handle_matches(filename, matches, timer() - t0, "extension")
        # only recurse into certain containers, like ZIP or TAR
        container = self.container_type(matches)
        if self.zip and self.can_recurse_into_container(container):
            self.identify_contents(filename, type=container)
    except IOError:
        sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
408
-
409
def identify_contents(self, filename, fileobj=None, type=False):
    """
    Identify each item in a container by delegating to walk_zip or
    walk_tar according to @param type ('zip' or 'tar'). Does nothing when
    type is falsy; raises RuntimeError for any other container type.
    @param fileobj could be a file, or a stream.
    """
    if type:
        if type == 'zip':
            walker = self.walk_zip
        elif type == 'tar':
            walker = self.walk_tar
        else:  # TODO: ouch!
            raise RuntimeError("Unknown container type: " + repr(type))
        walker(filename, fileobj)
423
-
424
def identify_multi_object_stream(self, stream):
    """
    Does not work! (warning kept from the original author)

    Stream may contain one or more objects each with an HTTP style header
    that must include content-length. The headers consist of keyword:value
    pairs terminated by a newline. There must be a newline following the
    headers.

    For each object the header lines are consumed, content-length bytes
    are handed to get_buffers, and the result is passed to
    self.handle_matches. Raises EnvironmentError when the blank line is
    reached without a content-length header.
    """
    # time.clock() no longer exists on Python 3.8+; prefer perf_counter
    # and fall back to clock on interpreters that predate it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    offset = 0
    while True:
        t0 = timer()
        content_length = -1
        for line in stream:
            offset += len(line)
            if line == '\n':
                if content_length < 0:
                    raise EnvironmentError("No content-length provided.")
                break
            pair = line.lower().split(':', 2)
            if pair[0] == 'content-length':
                content_length = int(pair[1])
        if content_length == -1:
            # Stream exhausted without another header block.
            return
        # Consume exactly content-length bytes
        self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
        self.current_filesize = content_length
        bofbuffer, eofbuffer, _ = self.get_buffers(stream, content_length)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # MdR: this needs attention
        if len(matches) > 0:
            self.handle_matches(self.current_file, matches, timer() - t0, "signature")
        else:
            matches = self.match_extensions(self.current_file)
            self.handle_matches(self.current_file, matches, timer() - t0, "extension")
459
-
460
def identify_stream(self, stream, filename):
    """
    Identify the type of @param stream.
    Call self.handle_matches instead of returning a value.
    Does not close stream.

    Results are always reported under the name 'STDIN' except on Windows,
    where @param filename (when given) is reported for extension matches.
    """
    # time.clock() no longer exists on Python 3.8+; prefer perf_counter
    # and fall back to clock on interpreters that predate it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    t0 = timer()
    bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
    self.current_filesize = bytes_read
    self.current_file = 'STDIN'
    matches = self.match_formats(bofbuffer, eofbuffer)
    # MdR: this needs attention
    if len(matches) > 0:
        self.handle_matches(self.current_file, matches, timer() - t0, "signature")
        return
    # No signature match: fall back to the extension of whatever name we
    # can find. We can only determine the filename from the STDIN stream
    # on Linux, on Windows there is not a (simple) way to do that.
    on_windows = os.name == "nt"
    if not on_windows:
        try:
            self.current_file = os.readlink("/proc/self/fd/0")
        except (OSError, AttributeError):
            # The link is missing or unreadable, or readlink is not
            # available: use the caller-supplied name if there is one.
            self.current_file = filename if filename is not None else 'STDIN'
    elif filename is not None:
        self.current_file = filename
    matches = self.match_extensions(self.current_file)
    # we have to reset self.current_file if not on Windows
    if not on_windows:
        self.current_file = 'STDIN'
    self.handle_matches(self.current_file, matches, timer() - t0, "extension")
493
-
494
def container_type(self, matches):
    """
    Look through @param matches, a list of (format, signature) pairs, for
    the first format that is a container.
    @return the text of the format's 'container' element if it has one,
    'ole' if its puid is fmt/111, and False if no match qualifies.
    """
    for candidate, _signature in matches:
        container_node = candidate.find('container')
        if container_node is not None:
            return container_node.text

        # aside from checking <container> elements,
        # check for fmt/111, which is OLE
        puid_node = candidate.find('puid')
        is_ole = puid_node is not None and puid_node.text == 'fmt/111'
        if is_ole:
            return 'ole'
    return False
511
-
512
def can_recurse_into_container(self, container_type):
    """
    Tell whether the contents of a container of @param container_type
    should be identified item by item.

    Only 'zip' and 'tar' qualify. This filters out containers such as
    OLE, which are usually most interesting as compound objects rather
    than for their contents.
    """
    return container_type == 'zip' or container_type == 'tar'
523
-
524
def blocking_read(self, file, bytes_to_read):
    """
    Read from @param file until @param bytes_to_read bytes have been
    collected or the end of the stream is reached, whichever comes first.
    @return the bytes read (possibly fewer than requested, or empty).
    """
    chunks = []
    remaining = bytes_to_read
    while remaining > 0:
        chunk = file.read(remaining)
        # An empty read means EOF. Test emptiness rather than comparing
        # with '' so that a bytes result is recognised on Python 3;
        # otherwise a short stream would keep this loop spinning forever.
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)
535
-
536
def get_buffers(self, stream, length=None, seekable=False):
    """
    Return a buffer from the beginning of @param stream, a buffer from its
    end, and a byte count; each buffer holds at most self.bufsize bytes.

    If @param length is None the stream is read to its end and the count
    is the total number of bytes read. Otherwise the count is the size of
    the initial read request, min(length, self.bufsize).
    If @param seekable is False, the stream does not support a seek
    operation and the bytes in between are read and discarded.
    """
    window = self.bufsize
    bytes_to_read = window if length is None else min(length, window)
    bofbuffer = self.blocking_read(stream, bytes_to_read)
    bytes_read = len(bofbuffer)
    if length is None:
        # A stream with unknown length; keep the last full chunk around so
        # the tail can be stitched together once a short read shows up.
        previous = bofbuffer
        while True:
            chunk = self.blocking_read(stream, window)
            bytes_read += len(chunk)
            if len(chunk) != window:
                break
            previous = chunk
        if len(chunk) == 0:
            eofbuffer = previous
        else:
            eofbuffer = previous[-(window - len(chunk)):] + chunk
        return bofbuffer, eofbuffer, bytes_read

    bytes_unread = length - len(bofbuffer)
    if bytes_unread == 0:
        eofbuffer = bofbuffer
    elif bytes_unread < window:
        # The buffers overlap
        eofbuffer = bofbuffer[bytes_unread:] + self.blocking_read(stream, bytes_unread)
    elif bytes_unread == window:
        eofbuffer = self.blocking_read(stream, window)
    elif seekable:
        # easy case when we can just seek!
        stream.seek(length - window)
        eofbuffer = self.blocking_read(stream, window)
    else:
        # We have more to read and know how much:
        # whole_chunks * bufsize + remainder == bytes_unread
        whole_chunks, remainder = divmod(bytes_unread, window)
        # skip all but the last whole chunk
        for _ in range(whole_chunks - 1):
            self.blocking_read(stream, window)
        # skip the remainder
        self.blocking_read(stream, remainder)
        # and read the final bufsize bytes into the eofbuffer
        eofbuffer = self.blocking_read(stream, window)
    return bofbuffer, eofbuffer, bytes_to_read
583
-
584
def walk_zip(self, filename, fileobj=None):
    """
    Identify the type of each item in the zip
    @param fileobj. If fileobj is not provided, open
    @param filename.
    Call self.handle_matches instead of returning a value.

    Zero-size entries are skipped. An item that is itself a container is
    copied to a temporary file and handed to self.identify_contents.
    Unreadable or invalid archives are reported on stderr.
    """
    # time.clock() no longer exists on Python 3.8+; prefer perf_counter
    # and fall back to clock on interpreters that predate it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    try:
        with zipfile.ZipFile((fileobj if fileobj else filename), 'r') as zipstream:
            for item in zipstream.infolist():
                if item.file_size == 0:
                    continue  # TODO: Find a better test for isdir
                t0 = timer()
                item_name = filename + '!' + item.filename
                self.current_file = item_name
                self.current_filesize = item.file_size
                with zipstream.open(item) as f:
                    bofbuffer, eofbuffer, _ = self.get_buffers(f, item.file_size)
                matches = self.match_formats(bofbuffer, eofbuffer)
                if len(matches) > 0 and self.current_filesize > 0:
                    self.handle_matches(item_name, matches, timer() - t0, "signature")
                else:
                    matches = self.match_extensions(item_name)
                    self.handle_matches(item_name, matches, timer() - t0, "extension")
                container = self.container_type(matches)
                if container:
                    # Close the spooled copy once the nested walk is done;
                    # it used to be left open for every nested container.
                    with tempfile.SpooledTemporaryFile(prefix='Fido') as target:
                        with zipstream.open(item) as source:
                            self.copy_stream(source, target)
                        # NOTE(review): target is not rewound here (the
                        # original had seek(0) commented out) — confirm
                        # whether copy_stream leaves it positioned at 0.
                        self.identify_contents(item_name, target, container)
    except (IOError, zipfile.BadZipfile):
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
620
-
621
def walk_tar(self, filename, fileobj):
    """
    Identify the type of each item in the tar.
    @param filename. Name of the archive; used as the prefix
        ("<filename>!<member>") of every item name.
    @param fileobj. File-like object passed to tarfile.TarFile together
        with filename.
    Call self.handle_matches instead of returning a value.
    Tar errors are reported on stderr and otherwise ignored.
    """
    # time.clock() was removed in Python 3.8; fall back to it only on
    # interpreters that do not have perf_counter.
    timer = getattr(time, 'perf_counter', None) or time.clock
    try:
        with tarfile.TarFile(filename, fileobj=fileobj, mode='r') as tarstream:
            for item in tarstream.getmembers():
                if not item.isfile():
                    continue  # only regular files carry content to identify
                t0 = timer()
                with closing(tarstream.extractfile(item)) as f:
                    tar_item_name = filename + '!' + item.name
                    self.current_file = tar_item_name
                    self.current_filesize = item.size
                    bofbuffer, eofbuffer, _ = self.get_buffers(f, item.size)
                    matches = self.match_formats(bofbuffer, eofbuffer)
                    self.handle_matches(tar_item_name, matches, timer() - t0)
                    container = self.container_type(matches)
                    if container:
                        # get_buffers consumed the member; rewind before
                        # handing it on as a nested container.
                        f.seek(0)
                        self.identify_contents(tar_item_name, f, container)
    except tarfile.TarError:
        sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
646
-
647
def as_good_as_any(self, f1, match_list):
    """
    Return True if the proposed format is as good as any in the match_list.
    For example, if there is no format in the match_list that has priority over the proposed one.
    @param f1. The proposed format.
    @param match_list. List of (format, signature name) tuples.
    @return False when the PUID of f1 appears in
        self.puid_has_priority_over_map for the PUID of another format in
        match_list; True otherwise (including for an empty match_list).
    """
    if not match_list:
        return True
    f1_puid = self.get_puid(f1)
    for f2, _unused in match_list:
        if f1 == f2:
            continue  # a format never outranks itself
        # A format without an entry in the map has priority over nothing.
        if f1_puid in self.puid_has_priority_over_map.get(self.get_puid(f2), ()):
            return False
    return True
660
-
661
def buffered_read(self, file_pos, overlap):
    """
    Buffered read of data chunks.
    @param file_pos. Offset in self.current_file at which to start reading.
    @param overlap. When true, the chunk is extended by self.overlap_range
        beyond self.container_bufsize.
    @return the data read from self.current_file, limited to the bytes that
        remain before self.current_filesize.
    """
    bufsize = self.container_bufsize
    if overlap:
        bufsize += self.overlap_range
    remaining = self.current_filesize - file_pos
    with open(self.current_file, 'rb') as file_handle:
        file_handle.seek(file_pos)
        # Read the chunk size computed above; the previous code compared
        # against it but then read self.bufsize instead.
        return file_handle.read(min(bufsize, remaining))
679
-
680
def match_formats(self, bofbuffer, eofbuffer):
    """
    Apply the patterns for formats to the supplied buffers.
    @param bofbuffer. Buffer searched by BOF, VAR and IFB patterns.
    @param eofbuffer. Buffer searched by EOF patterns.
    @return a match list of (format, signature) tuples.
    The list has inferior matches removed.
    A signature matches when all of its patterns match; patterns whose
    position is not BOF, EOF, VAR or IFB are ignored. Any exception raised
    while testing a format is written to stderr and that format is skipped.
    """
    # Position keyword -> (matching function, buffer to apply it to).
    # BOF patterns are anchored at the start; the others may match anywhere.
    probes = {
        'BOF': (re.match, bofbuffer),
        'EOF': (re.search, eofbuffer),
        'VAR': (re.search, bofbuffer),
        'IFB': (re.search, bofbuffer),
    }
    self.current_count += 1
    result = []
    for fmt in self.formats:
        try:
            self.current_format = fmt
            # Skip formats already outranked by an earlier match.
            if not self.as_good_as_any(fmt, result):
                continue
            for sig in self.get_signatures(fmt):
                self.current_sig = sig
                success = True
                for pat in self.get_patterns(sig):
                    self.current_pat = pat
                    pos = self.get_pos(pat)
                    regex = self.get_regex(pat)
                    probe = probes.get(pos)
                    if probe is None:
                        continue  # unknown position: does not affect the result
                    matcher, buf = probe
                    if not matcher(regex, buf):
                        success = False
                        break
                if success:
                    result.append((fmt, sig.findtext("name")))
        except Exception as e:
            # TODO: report errors with more context (format PUID, regex).
            sys.stderr.write(str(e) + "\n")
            continue
    # Later matches may outrank earlier ones, so filter once more at the end.
    return [match for match in result if self.as_good_as_any(match[0], result)]
731
-
732
def match_extensions(self, filename):
    """
    Return the list of (format, self.externalsig) for every format whose extension matches the filename.
    @param filename. Name whose extension (text after the last dot) is looked up.
    @return list of (format element, name text of self.externalsig) tuples,
        with inferior matches removed; empty when filename has no extension.
    The comparison ignores case on both sides.
    """
    myext = os.path.splitext(filename)[1].lower().lstrip(".")
    if not myext:
        return []
    result = []
    for element in self.formats:
        # The filename side is lower-cased above, so lower-case the declared
        # extensions too; an empty <extension/> has text None.
        declared = ((ext.text or '').lower() for ext in element.findall('extension'))
        if myext in declared:
            result.append((element, self.externalsig.findtext("name")))
    return [match for match in result if self.as_good_as_any(match[0], result)]
747
-
748
def copy_stream(self, source, target):
    """
    Copy everything that can be read from source into target.
    @param source. Readable file-like object.
    @param target. Writable file-like object.
    Data is transferred in chunks of self.bufsize.
    """
    import shutil  # local import: not referenced elsewhere in this block's view of the file

    shutil.copyfileobj(source, target, self.bufsize)
754
-
755
-
756
def list_files(roots, recurse=False):
    """
    Return the files one at a time. Roots could be a fileobj or a list.
    @param roots. Iterable of paths; each may carry a trailing line
        terminator (as when iterating over a file object). Empty entries
        are skipped.
    @param recurse. When false, only files directly inside a directory
        root are yielded; when true, sub-directories are walked as well.
    """
    for root in roots:
        # Handles "\n" and "\r\n" endings as well as entries without any.
        root = root.rstrip('\r\n')
        if not root:
            # A blank line would otherwise normalise to "." and list the
            # current working directory.
            continue
        root = os.path.normpath(root)
        if os.path.isfile(root):
            yield root
            continue
        for path, _unused, files in os.walk(root):
            for name in files:
                yield os.path.join(path, name)
            if not recurse:
                break  # the first os.walk entry is the root directory itself
771
-
772
-
773
def main(args=None):
    """
    Command-line entry point.
    @param args. List of command-line arguments without the program name;
        sys.argv[1:] is used when it is empty or None.
    Prints the help text and exits with status 1 when there are no arguments,
    prints the version header and exits with status 0 for -v. Otherwise the
    requested files or stdin are identified and, unless -q is given, a
    summary is printed. A KeyboardInterrupt ends the run with status 1.
    """
    import codecs  # local import: only needed to unescape the printf templates

    # time.clock() was removed in Python 3.8; fall back to it only on
    # interpreters that do not have perf_counter.
    timer = getattr(time, 'perf_counter', None) or time.clock

    def unescape(text):
        # Interpret backslash escapes ("\n", "\t", ...) typed on the command line.
        try:
            return text.decode('string_escape')  # Python 2 byte strings
        except AttributeError:
            return codecs.decode(text, 'unicode_escape')  # Python 3 text

    if not args:
        args = sys.argv[1:]

    parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-v', default=False, action='store_true', help='show version information')
    parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
    parser.add_argument('-recurse', default=False, action='store_true', help='recurse into subdirectories')
    parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
    parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
    parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
    group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')

    parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
    parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
    parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
    parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
    parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
    parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['bufsize']) + ' bytes)')
    parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['container_bufsize']) + ' bytes)')
    parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
    parser.add_argument('-confdir', default=CONFIG_DIR, help='configuration directory to load_fido_xml, for example, the format specifications from.')

    # Test the effective argument list, not sys.argv, so that arguments
    # passed in by a caller are honoured.
    if not args:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(args)

    t0 = timer()

    versions = get_local_pronom_versions(args.confdir)

    defaults['xml_pronomSignature'] = versions.pronom_signature
    defaults['containersignature_file'] = versions.pronom_container_signature
    defaults['xml_fidoExtensionSignature'] = versions.fido_extension_signature
    defaults['format_files'] = [defaults['xml_pronomSignature']]

    if args.pronom_only:
        versionHeader = "FIDO v{0} ({1}, {2})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'])
    else:
        versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'], defaults['xml_fidoExtensionSignature'])
        defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])

    if args.v:
        sys.stdout.write(versionHeader)
        sys.exit(0)

    if args.matchprintf:
        args.matchprintf = unescape(args.matchprintf)
    if args.nomatchprintf:
        args.nomatchprintf = unescape(args.nomatchprintf)

    fido = Fido(
        quiet=args.q,
        bufsize=args.bufsize,
        container_bufsize=args.container_bufsize,
        printmatch=args.matchprintf,
        printnomatch=args.nomatchprintf,
        zip=args.zip,
        nocontainer=args.nocontainer,
        conf_dir=args.confdir)

    # TODO: Allow conf options to be dis-included
    if args.loadformats:
        for format_file in args.loadformats.split(','):
            fido.load_fido_xml(format_file)

    # TODO: remove from maps
    if args.useformats:
        args.useformats = args.useformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
    elif args.nouseformats:
        args.nouseformats = args.nouseformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]

    # Set up to use stdin, or open input files:
    input_file = None
    if args.input == '-':
        args.files = sys.stdin
    elif args.input:
        input_file = open(args.input, 'r')
        args.files = input_file

    # RUN
    try:
        if not args.q:
            sys.stderr.write(versionHeader)
            sys.stderr.flush()
        if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
            if fido.zip:
                # Nothing after this raise could ever run, so the former
                # sys.exit / identify_multi_object_stream calls are gone.
                raise RuntimeError("Multiple content read from stdin not yet supported.")
            else:
                fido.identify_stream(sys.stdin, args.filename)
        else:
            for path in list_files(args.files, args.recurse):
                fido.identify_file(path)
    except KeyboardInterrupt:
        msg = "FIDO: Interrupt while identifying file {0}"
        sys.stderr.write(msg.format(fido.current_file))
        sys.exit(1)
    finally:
        # Close the -input list file on every exit path.
        if input_file is not None:
            input_file.close()

    if not args.q:
        sys.stdout.flush()
        fido.print_summary(timer() - t0)
        sys.stderr.flush()
881
-
882
-
883
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()