libis-format 0.9.32 → 0.9.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/data/types.yml +30 -16
  3. data/lib/libis/format/config.rb +7 -18
  4. data/lib/libis/format/converter/image_converter.rb +6 -0
  5. data/lib/libis/format/droid.rb +82 -25
  6. data/lib/libis/format/extension_identification.rb +55 -0
  7. data/lib/libis/format/fido.rb +57 -72
  8. data/lib/libis/format/file_tool.rb +76 -0
  9. data/lib/libis/format/identification_tool.rb +174 -0
  10. data/lib/libis/format/identifier.rb +129 -117
  11. data/lib/libis/format/type_database.rb +36 -5
  12. data/lib/libis/format/version.rb +1 -1
  13. data/lib/libis/format.rb +3 -0
  14. data/libis-format.gemspec +2 -1
  15. data/spec/converter_spec.rb +6 -4
  16. data/spec/identifier_spec.rb +125 -34
  17. metadata +21 -126
  18. data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
  19. data/tools/droid/container-signature-20170330.xml +0 -3584
  20. data/tools/droid/droid-command-line-6.3.jar +0 -0
  21. data/tools/droid/droid.bat +0 -152
  22. data/tools/droid/droid.sh +0 -152
  23. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  24. data/tools/droid/lib/activation-1.1.jar +0 -0
  25. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  26. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  27. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  28. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  29. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  30. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  31. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  32. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  33. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  34. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  35. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  36. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  37. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  38. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  39. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  40. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  41. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  42. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  43. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  44. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  45. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  46. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  47. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  48. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  49. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  50. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  51. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  52. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  53. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  54. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  55. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  56. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  57. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  58. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  59. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  60. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  61. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  62. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  63. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  64. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  65. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  66. data/tools/droid/lib/droid-help-6.3.jar +0 -0
  67. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  68. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  69. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  70. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  71. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  72. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  73. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  74. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  75. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  76. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  77. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  78. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  79. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  80. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  81. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  82. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  83. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  84. data/tools/droid/lib/jta-1.1.jar +0 -0
  85. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  86. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  87. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  88. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  89. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  90. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  91. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  92. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  93. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  94. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  95. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  96. data/tools/droid/lib/poi-3.13.jar +0 -0
  97. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  98. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  99. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  100. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  101. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  102. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  103. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  104. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  105. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  106. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  107. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  108. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  109. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  110. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  111. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  112. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  113. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  114. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  115. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  116. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  117. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  118. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  119. data/tools/droid/lib/xz-1.0.jar +0 -0
  120. data/tools/fido/__init__.py +0 -50
  121. data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
  122. data/tools/fido/conf/container-signature-20170330.xml +0 -3584
  123. data/tools/fido/conf/dc.xsd +0 -119
  124. data/tools/fido/conf/dcmitype.xsd +0 -53
  125. data/tools/fido/conf/dcterms.xsd +0 -383
  126. data/tools/fido/conf/fido-formats.xsd +0 -173
  127. data/tools/fido/conf/format_extension_template.xml +0 -105
  128. data/tools/fido/conf/format_extensions.xml +0 -484
  129. data/tools/fido/conf/formats-v90.xml +0 -48877
  130. data/tools/fido/conf/pronom-xml-v90.zip +0 -0
  131. data/tools/fido/conf/versions.xml +0 -8
  132. data/tools/fido/fido.bat +0 -4
  133. data/tools/fido/fido.py +0 -884
  134. data/tools/fido/fido.sh +0 -5
  135. data/tools/fido/package.py +0 -96
  136. data/tools/fido/prepare.py +0 -645
  137. data/tools/fido/pronomutils.py +0 -200
  138. data/tools/fido/toxml.py +0 -60
  139. data/tools/fido/update_signatures.py +0 -183
data/tools/fido/fido.py DELETED
@@ -1,884 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- Format Identification for Digital Objects (FIDO).
6
-
7
- FIDO is a command-line tool to identify the file formats of digital objects.
8
- It is designed for simple integration into automated work-flows.
9
- """
10
-
11
- from __future__ import absolute_import
12
-
13
- from argparse import ArgumentParser, RawTextHelpFormatter
14
- from contextlib import closing
15
- import os
16
- import re
17
- import sys
18
- import tarfile
19
- import tempfile
20
- import time
21
- from xml.etree import cElementTree as ET
22
- from xml.etree import ElementTree as CET
23
- import zipfile
24
-
25
- from six.moves import range
26
-
27
- from . import __version__, CONFIG_DIR
28
- from .package import OlePackage, ZipPackage
29
- from .pronomutils import get_local_pronom_versions
30
-
31
-
32
# Module-wide fallback settings. Fido.__init__ reads an entry from here for
# every constructor argument that is left as None.
defaults = {
    'bufsize': 128 * 1024,  # (bytes)
    # Assigned to re._MAXCACHE in Fido.__init__.
    # NOTE(review): that is a count of cached patterns, not a byte size.
    'regexcachesize': 2084,
    # %-style templates filled from a dict whose keys are the literal strings
    # "info.<field>" (see Fido.print_matches).
    'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
    'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
    # File names are resolved relative to the configuration directory.
    # NOTE(review): presumably overridden at start-up with the current
    # signature version; confirm that 'formats-v88.xml' actually exists.
    'format_files': [
        'formats-v88.xml',
        'format_extensions.xml'
    ],
    'containersignature_file': 'container-signature-20170330.xml',
    'container_bufsize': 512 * 1024,  # (bytes)
    'description': """Format Identification for Digital Objects (fido).
FIDO is a command-line tool to identify the file formats of digital objects.
It is designed for simple integration into automated work-flows.""",
    'epilog': """
Open Planets Foundation (http://www.openplanetsfoundation.org)
See License.txt for license information.
Download from: https://github.com/openplanets/fido/releases
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
Author: Adam Farquhar (BL), 2010
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
FIDO uses the UK National Archives (TNA) PRONOM File Format
and Container descriptions.
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/""",
}
57
-
58
-
59
- class Fido:
60
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=CONFIG_DIR, format_files=None, containersignature_file=None):
    """
    Configure the identifier and load every signature file.

    Each argument left as None falls back to the matching entry in the
    module-level ``defaults`` dict.
    @param quiet suppresses the summary written by print_summary.
    @param bufsize number of bytes read from the start/end of each file.
    @param container_bufsize buffer size stored for container handling.
    @param printnomatch / printmatch %-style output templates.
    @param zip when true, identify_file recurses into zip/tar containers.
    @param nocontainer stored as-is on the instance.
    @param handle_matches callback invoked with each result; defaults to
           self.print_matches.
    @param conf_dir directory holding the signature XML files.
    @param format_files list of format XML file names inside conf_dir.
    @param containersignature_file container signature file name inside
           conf_dir.
    """
    def pick(value, key):
        # Explicit arguments win; None means "use the module default".
        return defaults[key] if value is None else value

    self.quiet = quiet
    self.bufsize = pick(bufsize, 'bufsize')
    self.container_bufsize = pick(container_bufsize, 'container_bufsize')
    self.printmatch = pick(printmatch, 'printmatch')
    self.printnomatch = pick(printnomatch, 'printnomatch')
    self.handle_matches = self.print_matches if handle_matches is None else handle_matches
    self.zip = zip
    self.nocontainer = nocontainer
    self.conf_dir = conf_dir
    self.format_files = pick(format_files, 'format_files')
    # Previously the argument was accepted but silently ignored in favour
    # of the default; honour it when the caller supplies one.
    self.containersignature_file = pick(containersignature_file, 'containersignature_file')
    self.formats = []
    self.puid_format_map = {}
    self.puid_has_priority_over_map = {}
    # load signatures
    conf_path = os.path.abspath(self.conf_dir)
    for xml_file in self.format_files:
        self.load_fido_xml(os.path.join(conf_path, xml_file))
    self.load_container_signature(os.path.join(conf_path, self.containersignature_file))
    self.current_file = ''
    self.current_filesize = 0
    self.current_format = None
    self.current_sig = None
    self.current_pat = None
    self.current_count = 0  # Count of calls to match_formats
    # Process-wide side effect: enlarges the regex cache of the re module.
    re._MAXCACHE = defaults['regexcachesize']
    self.externalsig = ET.XML('<signature><name>External</name></signature>')
88
-
89
# Characters that escape() copies into the pattern unchanged.
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
# Characters that _escape_char() prefixes with a backslash.
_special = '$()*+.?![]^\\{|}'  # Before: '$*+.?![]^\\{|}'
# Hex digits, indexed by nibble value in _escape_char().
_hex = '0123456789abcdef'
92
-
93
- def _escape_char(self, c):
94
- if c in '\n':
95
- return '\\n'
96
- elif c == '\r':
97
- return '\\r'
98
- elif c in self._special:
99
- return '\\' + c
100
- else:
101
- (high, low) = divmod(ord(c), 16)
102
- return '\\x' + self._hex[high] + self._hex[low]
103
-
104
def escape(self, string):
    """
    Return @param string with every character outside self._ordinary
    replaced by the escape produced by self._escape_char.
    """
    pieces = []
    for char in string:
        if char in self._ordinary:
            pieces.append(char)
        else:
            pieces.append(self._escape_char(char))
    return ''.join(pieces)
111
-
112
def convert_container_sequence(self, sig):
    """
    Translate a PRONOM container byte sequence @param sig into a bytes
    regular expression.

    The result is bytes because it is matched against raw file content.
    The scanner keeps four flags: inside a quoted literal, inside a
    [..] group, inside a quoted literal within a group, and "first hex
    digit of a byte already emitted".
    """
    pattern = b'(?s)'
    in_quote = False
    half_byte = False
    in_range = False
    range_quote = False
    for ch in sig:
        if not in_quote and not in_range:
            if ch == "'":
                in_quote = True
            elif ch == " ":
                pass
            elif ch == "[":
                pattern += b"("
                in_range = True
            elif not half_byte:
                # First hex digit of a byte: open a \x escape.
                pattern += b"\\x" + ch.lower().encode('utf8')
                half_byte = True
            else:
                # Second hex digit completes the escape.
                pattern += ch.lower().encode('utf8')
                half_byte = False
        elif in_quote:
            if ch == "'" and not in_range:
                in_quote = False
            else:
                pattern += self.escape(ch).encode('utf8')
        else:
            # Inside a [..] group.
            plain = ch != "-" and ch != "'"
            if ch == "]":
                pattern += b")"
                in_range = False
            elif plain and range_quote:
                pattern += self.escape(ch).encode('utf8')
            elif plain and ch != " " and ch != ":" and not range_quote and not half_byte:
                pattern += b"\\x" + ch.lower().encode('utf8')
                half_byte = True
            elif plain and ch != " " and not range_quote and half_byte:
                pattern += ch.lower().encode('utf8')
                half_byte = False
            elif ch == "-" or ch == " ":
                # Separators inside a group become alternation.
                pattern += b"|"
            elif ch == "'":
                range_quote = not range_quote

    return pattern
175
-
176
def load_container_signature(self, containersignature_file):
    """
    Parse the PRONOM container-signature file @param containersignature_file.

    As a side-effect, set self.sequenceSignature (signature id -> list of
    converted byte-sequence regexes) and self.puidMapping (signature id ->
    list of PUIDs).
    """
    tree = CET.parse(containersignature_file)
    subsequence_path = 'Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence'
    # Convert every sub-sequence of every container signature.
    self.sequenceSignature = {}
    for container_sig in tree.getroot().findall('ContainerSignatures/ContainerSignature'):
        converted = self.sequenceSignature[container_sig.get('Id')] = []
        for sub_sequence in container_sig.findall(subsequence_path):
            # The first child of a SubSequence carries the sequence text.
            converted.append(self.convert_container_sequence(sub_sequence[0].text))
    # Group the PUIDs by the container signature they map to.
    self.puidMapping = {}
    for mapping in tree.find('FileFormatMappings').findall('FileFormatMapping'):
        self.puidMapping.setdefault(mapping.get('signatureId'), []).append(mapping.get('Puid'))
200
-
201
def extract_signatures(self, doc, signature_type="ZIP"):
    """
    Given an XML container signature file, returns a dictionary of signatures.

    The format of the dictionary is:

    {
        path_to_file_inside_container: {puid: [signatures]}
    }

    where each signature is a dict with the keys "path", "id" and
    "signature" (the converted byte-sequence regex).

    @param doc is a parsed ElementTree of the container signature file.
    @param signature_type selects ContainerSignature elements by their
           ContainerType attribute.
    Container signatures without a BinarySignatures element are skipped.
    """
    root = doc.getroot()
    format_mappings = root.find("FileFormatMappings")
    sequence_path = "Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence/Sequence"

    signatures = {}
    for el in root.findall("ContainerSignatures/ContainerSignature[@ContainerType=\"{}\"]".format(signature_type)):
        if el.find("Files/File/BinarySignatures") is None:
            continue

        signature_id = el.attrib["Id"]
        puid = format_mappings.find('FileFormatMapping[@signatureId="{}"]'.format(signature_id)).attrib["Puid"]
        # Build the entry once; it was previously computed twice per
        # element, with the first result used only for its path.
        signature = {
            "path": el.findtext("Files/File/Path"),
            "id": signature_id,
            "signature": self.convert_container_sequence(el.findtext(sequence_path)),
        }
        signatures.setdefault(signature["path"], {}).setdefault(puid, []).append(signature)
    return signatures
239
-
240
def match_container(self, signature_type, klass, file, signature_file):
    """
    Detect formats inside the container @param file.

    @param klass is instantiated with the file and the signatures
           extracted from @param signature_file for @param signature_type;
           its detect_formats() yields the matching PUIDs.
    @return list of (format element, format name) tuples.
    """
    container_signatures = self.extract_signatures(signature_file, signature_type=signature_type)
    detected_puids = klass(file, container_signatures).detect_formats()
    matched = []
    for puid in detected_puids:
        format_element = self.puid_format_map[puid]
        matched.append((format_element, format_element.findtext("name")))
    return matched
248
-
249
def load_fido_xml(self, file):
    """
    Load the fido format information from @param file.
    As a side-effect, update self.formats, self.puid_format_map and
    self.puid_has_priority_over_map. A format whose PUID was loaded
    earlier replaces the older element in place.
    @return list of ElementTree.Element, one for each format.
    """
    tree = ET.parse(file)
    # TODO: Handle empty regexes properly; perhaps remove from the format list
    for element in tree.getroot().findall('./format'):
        puid = self.get_puid(element)
        # Handle over-writes in multiple file loads. Compare against None:
        # the truth value of an Element is deprecated and is false for an
        # element without children.
        existing = self.puid_format_map.get(puid)
        if existing is not None:
            # Already have one, so replace old with new!
            self.formats[self.formats.index(existing)] = element
        else:
            self.formats.append(element)
        self.puid_format_map[puid] = element
        # Build some structures to speed things up
        self.puid_has_priority_over_map[puid] = frozenset(
            inferior.text for inferior in element.findall('has_priority_over'))
    return self.formats
271
-
272
- # To delete a format: (1) remove from self.formats, (2) remove from puid_format_map, (3) remove from self.puid_has_priority_over_map
273
def get_signatures(self, format):
    """Return the list of <signature> children of @param format."""
    signature_elements = format.findall('signature')
    return signature_elements
275
-
276
def has_priority_over(self, format, possibly_inferior):
    """
    Return True when the PUID of @param possibly_inferior is listed among
    the PUIDs that @param format has priority over.
    """
    inferior_puid = self.get_puid(possibly_inferior)
    outranked = self.puid_has_priority_over_map[self.get_puid(format)]
    return inferior_puid in outranked
278
-
279
def get_puid(self, format):
    """Return the text of the <puid> child of @param format."""
    puid_element = format.find('puid')
    return puid_element.text
281
-
282
def get_patterns(self, signature):
    """Return the list of <pattern> children of @param signature."""
    pattern_elements = signature.findall('pattern')
    return pattern_elements
284
-
285
def get_pos(self, pat):
    """Return the text of the <position> child of @param pat."""
    position_element = pat.find('position')
    return position_element.text
287
-
288
def get_regex(self, pat):
    """
    Return the <regex> text of @param pat encoded as UTF-8 bytes; the
    pattern is matched against bytes read from a file.
    """
    regex_text = pat.find('regex').text
    return regex_text.encode('utf8')
291
-
292
def get_extension(self, format):
    """Return the text of the first <extension> child of @param format."""
    extension_element = format.find('extension')
    return extension_element.text
294
-
295
def print_matches(self, fullname, matches, delta_t, matchtype=''):
    """
    The default match handler. Writes one line to stdout per match, or a
    single no-match line when @param matches is empty.
    @param fullname is name of the file being matched
    @param matches is a list of (format, signature)
    @param delta_t is the time taken for the match, in seconds.
    @param matchtype is the type of match (signature, containersignature,
           extension, fail); the no-match line always reports "fail".
    """
    count = self.current_count
    group_size = len(matches)
    elapsed_ms = int(delta_t * 1000)
    filesize = self.current_filesize
    if group_size == 0:
        sys.stdout.write(self.printnomatch % {
            "info.time": elapsed_ms,
            "info.filesize": filesize,
            "info.filename": fullname,
            "info.count": count,
            "info.matchtype": "fail"
        })
        return

    def optional_text(element, tag):
        # Text of an optional child element, or None when it is absent.
        child = element.find(tag)
        return None if child is None else child.text

    for group_index, (format_element, sig_name) in enumerate(matches, 1):
        fields = {
            "info.time": elapsed_ms,
            "info.puid": self.get_puid(format_element),
            "info.formatname": format_element.find('name').text,
            "info.signaturename": sig_name,
            "info.filesize": filesize,
            "info.filename": fullname,
            "info.matchtype": matchtype,
            "info.group_size": group_size,
            "info.group_index": group_index,
            "info.count": count
        }
        fields["info.mimetype"] = optional_text(format_element, 'mime')
        fields["info.version"] = optional_text(format_element, 'version')
        fields["info.alias"] = optional_text(format_element, 'alias')
        fields["info.apple_uti"] = optional_text(format_element, 'apple_uid')
        sys.stdout.write(self.printmatch % fields)
352
-
353
def print_summary(self, secs):
    """
    Write a one-line summary (file count, elapsed time, files per second)
    to stderr unless self.quiet is set.
    @param secs is the total elapsed time in seconds.
    """
    count = self.current_count
    if self.quiet:
        return
    if secs != 0:
        rate = int(round(count / secs))
    else:
        # Avoid dividing by zero; report a fixed placeholder rate instead.
        rate = 9999
    sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
362
-
363
def identify_file(self, filename):
    """
    Identify the type of @param filename.
    Call self.handle_matches instead of returning a value.

    Container signatures are tried first for zip/ole files; otherwise the
    signature matches are reported, falling back to the file extension
    when nothing matched or the file is empty. An IOError while reading
    is reported on stderr and otherwise swallowed.
    """
    # time.clock was removed in Python 3.8; prefer perf_counter when present.
    clock = getattr(time, 'perf_counter', None) or time.clock
    self.current_file = filename
    self.matchtype = "signature"
    try:
        t0 = clock()
        # Only the buffers are needed, so release the handle immediately
        # instead of leaving it open for the rest of the call.
        with open(filename, 'rb') as f:
            size = os.stat(filename)[6]
            self.current_filesize = size
            if self.current_filesize == 0:
                sys.stderr.write("FIDO: Zero byte file (empty): Path is: " + filename + "\n")
            bofbuffer, eofbuffer, _ = self.get_buffers(f, size, seekable=True)
        matches = self.match_formats(bofbuffer, eofbuffer)
        container_type = self.container_type(matches)
        if container_type in ("zip", "ole"):
            container_file = ET.parse(os.path.join(os.path.abspath(self.conf_dir), self.containersignature_file))
            if container_type == "zip":
                container_matches = self.match_container("ZIP", ZipPackage, filename, container_file)
            else:
                container_matches = self.match_container("OLE2", OlePackage, filename, container_file)
            if len(container_matches) > 0:
                self.handle_matches(filename, container_matches, clock() - t0, "container")
                return
        # from here is also repeated in walk_zip
        # we should make this uniform in a next version!
        #
        # filesize is made conditional because files with 0 bytes
        # are falsely characterised being 'rtf' (due to wacky sig)
        # in these cases we try to match the extension instead
        if len(matches) > 0 and self.current_filesize > 0:
            self.handle_matches(filename, matches, clock() - t0, self.matchtype)
        elif len(matches) == 0 or self.current_filesize == 0:
            matches = self.match_extensions(filename)
            self.handle_matches(filename, matches, clock() - t0, "extension")
        # only recurse into certain containers, like ZIP or TAR
        container = self.container_type(matches)
        if self.zip and self.can_recurse_into_container(container):
            self.identify_contents(filename, type=container)
    except IOError:
        sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
408
-
409
def identify_contents(self, filename, fileobj=None, type=False):
    """
    Identify every item inside a container by delegating to walk_zip or
    walk_tar, which call self.handle_matches for each item.
    @param filename is the name of the container.
    @param fileobj could be a file, or a stream.
    @param type is 'zip' or 'tar'; a false value does nothing and any
           other value raises RuntimeError.
    """
    if not type:
        return
    if type == 'zip':
        self.walk_zip(filename, fileobj)
        return
    if type == 'tar':
        self.walk_tar(filename, fileobj)
        return
    # TODO: ouch!
    raise RuntimeError("Unknown container type: " + repr(type))
423
-
424
def identify_multi_object_stream(self, stream):
    """
    Does not work!
    Stream may contain one or more objects each with an HTTP style header
    that must include content-length. The headers consist of keyword:value
    pairs terminated by a newline. There must be a newline following the
    headers.

    For each object, self.handle_matches is called with the signature
    matches, or with the extension matches when no signature matched.
    Raises EnvironmentError when a header block ends without a
    content-length.
    NOTE(review): header lines are compared with the str '\\n', so this
    presumably only parses text-mode streams; confirm before relying on it.
    """
    # time.clock was removed in Python 3.8; prefer perf_counter when present.
    clock = getattr(time, 'perf_counter', None) or time.clock
    offset = 0
    while True:
        t0 = clock()
        content_length = -1
        for line in stream:
            offset += len(line)
            if line == '\n':
                # A blank line ends the header block.
                if content_length < 0:
                    raise EnvironmentError("No content-length provided.")
                break
            pair = line.lower().split(':', 2)
            if pair[0] == 'content-length':
                content_length = int(pair[1])
        if content_length == -1:
            # Stream exhausted without another header block.
            return
        # Consume exactly content-length bytes
        self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
        self.current_filesize = content_length
        bofbuffer, eofbuffer, _ = self.get_buffers(stream, content_length)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # MdR: this needs attention
        if len(matches) > 0:
            self.handle_matches(self.current_file, matches, clock() - t0, "signature")
        else:
            matches = self.match_extensions(self.current_file)
            self.handle_matches(self.current_file, matches, clock() - t0, "extension")
459
-
460
def identify_stream(self, stream, filename):
    """
    Identify the type of @param stream.
    Call self.handle_matches instead of returning a value.
    Does not close stream.
    @param filename is used for the extension fallback when the name
           behind STDIN cannot be resolved; may be None.
    """
    # time.clock was removed in Python 3.8; prefer perf_counter when present.
    clock = getattr(time, 'perf_counter', None) or time.clock
    t0 = clock()
    bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
    self.current_filesize = bytes_read
    self.current_file = 'STDIN'
    matches = self.match_formats(bofbuffer, eofbuffer)
    # MdR: this needs attention
    if len(matches) > 0:
        self.handle_matches(self.current_file, matches, clock() - t0, "signature")
        return
    # we can only determine the filename from the STDIN stream
    # on Linux, on Windows there is not a (simple) way to do that
    on_windows = os.name == "nt"
    if on_windows:
        if filename is not None:
            self.current_file = filename
    else:
        try:
            self.current_file = os.readlink("/proc/self/fd/0")
        except (OSError, AttributeError):
            # No resolvable link (or no readlink at all): fall back to the
            # caller-supplied name. Narrowed from a bare except so that
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            self.current_file = filename if filename is not None else 'STDIN'
    matches = self.match_extensions(self.current_file)
    # we have to reset self.current_file if not on Windows
    if not on_windows:
        self.current_file = 'STDIN'
    self.handle_matches(self.current_file, matches, clock() - t0, "extension")
493
-
494
def container_type(self, matches):
    """
    Look through @param matches, a list of (format, signature) pairs, for
    a container format that can be looked inside of.
    @return the text of the first <container> element found, 'ole' for
            a format whose PUID is fmt/111, or False when no match is a
            container.
    """
    for candidate, _signature in matches:
        container_element = candidate.find('container')
        if container_element is not None:
            return container_element.text

        # fmt/111 is OLE and carries no <container> element of its own.
        puid_element = candidate.find('puid')
        if puid_element is None:
            continue
        if puid_element.text == 'fmt/111':
            return 'ole'
    return False
511
-
512
def can_recurse_into_container(self, container_type):
    """
    Tell whether @param container_type names a container that can be
    extracted and whose members can be identified individually.

    Only 'zip' and 'tar' qualify; containers such as OLE are treated as
    compound objects rather than walked for their contents.
    """
    recursable = ('zip', 'tar')
    return container_type in recursable
523
-
524
def blocking_read(self, file, bytes_to_read):
    """
    Read up to @param bytes_to_read bytes from @param file, looping over
    short reads until the amount is reached or EOF is hit.
    @return the bytes read, which is shorter than requested only at EOF.
    """
    chunks = []
    remaining = bytes_to_read
    while remaining > 0:
        chunk = file.read(remaining)
        # An empty read means EOF. Test for emptiness rather than comparing
        # with '': b'' != '' on Python 3, which made the loop spin forever
        # on a stream shorter than the requested size.
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)
535
-
536
def get_buffers(self, stream, length=None, seekable=False):
    """
    Return buffers from the beginning and end of stream and the number of
    bytes read if there may be more bytes in the stream.

    If length is None, return the length as found.
    If seekable is False, the stream does not support a seek operation.

    @param stream is read through self.blocking_read (and stream.seek when
           seekable is true).
    @param length is the known size of the stream in bytes, or None.
    @return (bofbuffer, eofbuffer, n); each buffer holds at most
            self.bufsize bytes. n is the total number of bytes consumed
            when length is None, otherwise the size requested for the
            first buffer, min(length, self.bufsize).
    """
    bytes_to_read = self.bufsize if length is None else min(length, self.bufsize)
    bofbuffer = self.blocking_read(stream, bytes_to_read)
    bytes_read = len(bofbuffer)
    if length is None:
        # A stream with unknown length; have to keep two buffers around
        prevbuffer = bofbuffer
        while True:
            buffer = self.blocking_read(stream, self.bufsize)
            bytes_read += len(buffer)
            if len(buffer) == self.bufsize:
                prevbuffer = buffer
            else:
                # Short (or empty) read means EOF: top the final partial
                # block up with the tail of the previous full block.
                eofbuffer = prevbuffer if len(buffer) == 0 else prevbuffer[-(self.bufsize - len(buffer)):] + buffer
                break
        return bofbuffer, eofbuffer, bytes_read
    else:
        bytes_unread = length - len(bofbuffer)
        if bytes_unread == 0:
            # Everything fitted in the first buffer.
            eofbuffer = bofbuffer
        elif bytes_unread < self.bufsize:
            # The buffs overlap
            eofbuffer = bofbuffer[bytes_unread:] + self.blocking_read(stream, bytes_unread)
        elif bytes_unread == self.bufsize:
            eofbuffer = self.blocking_read(stream, self.bufsize)
        elif seekable:  # easy case when we can just seek!
            stream.seek(length - self.bufsize)
            eofbuffer = self.blocking_read(stream, self.bufsize)
        else:
            # We have more to read and know how much.
            # n*bufsize + r = length
            (n, r) = divmod(bytes_unread, self.bufsize)
            # skip n-1*bufsize bytes
            for unused_i in range(1, n):
                self.blocking_read(stream, self.bufsize)
            # skip r bytes
            self.blocking_read(stream, r)
            # and read the remaining bufsize bytes into the eofbuffer
            eofbuffer = self.blocking_read(stream, self.bufsize)
        # NOTE(review): this returns the requested size, not bytes_read;
        # the callers visible in this file discard the value.
        return bofbuffer, eofbuffer, bytes_to_read
583
-
584
def walk_zip(self, filename, fileobj=None):
    """
    Identify the type of each item in the zip
    @param fileobj. If fileobj is not provided, open.
    @param filename.
    Call self.handle_matches instead of returning a value.

    Zero-size entries are skipped. Entries that are themselves containers
    are copied to a temporary file and identified recursively. IOError and
    zipfile.BadZipfile are reported on stderr and otherwise swallowed.
    """
    # time.clock was removed in Python 3.8; prefer perf_counter when present.
    clock = getattr(time, 'perf_counter', None) or time.clock
    try:
        with zipfile.ZipFile((fileobj if fileobj else filename), 'r') as zipstream:
            for item in zipstream.infolist():
                if item.file_size == 0:
                    continue  # TODO: Find a better test for isdir
                t0 = clock()
                with zipstream.open(item) as f:
                    item_name = filename + '!' + item.filename
                    self.current_file = item_name
                    # Never zero here: empty entries were skipped above, so
                    # the former "zero byte file" warning was unreachable.
                    self.current_filesize = item.file_size
                    bofbuffer, eofbuffer, _ = self.get_buffers(f, item.file_size)
                matches = self.match_formats(bofbuffer, eofbuffer)
                if len(matches) > 0 and self.current_filesize > 0:
                    self.handle_matches(item_name, matches, clock() - t0, "signature")
                elif len(matches) == 0 or self.current_filesize == 0:
                    matches = self.match_extensions(item_name)
                    self.handle_matches(item_name, matches, clock() - t0, "extension")
                container = self.container_type(matches)
                if container:
                    # Close the spooled copy once the nested container has
                    # been walked; it was previously left open.
                    with tempfile.SpooledTemporaryFile(prefix='Fido') as target:
                        with zipstream.open(item) as source:
                            self.copy_stream(source, target)
                            self.identify_contents(item_name, target, container)
    except IOError:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
    except zipfile.BadZipfile:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
620
-
621
def walk_tar(self, filename, fileobj):
    """
    Identify the type of each regular file in a tar archive.

    @param filename: path of the archive; also used as the prefix of every
        reported item name (``<filename>!<member name>``).
    @param fileobj: open file object holding the archive, passed straight
        to tarfile.TarFile together with ``filename``.

    Results are reported through self.handle_matches instead of being
    returned. Members whose matches map to a container type are rewound and
    passed to self.identify_contents. A tarfile.TarError is reported on
    stderr.
    """
    # time.clock() no longer exists on Python 3.8+; use perf_counter there.
    timer = getattr(time, 'clock', None) or time.perf_counter
    try:
        with tarfile.TarFile(filename, fileobj=fileobj, mode='r') as tarstream:
            for item in tarstream.getmembers():
                if not item.isfile():
                    continue
                t0 = timer()
                with closing(tarstream.extractfile(item)) as f:
                    tar_item_name = filename + '!' + item.name
                    self.current_file = tar_item_name
                    self.current_filesize = item.size
                    bofbuffer, eofbuffer, _ = self.get_buffers(f, item.size)
                    matches = self.match_formats(bofbuffer, eofbuffer)
                    self.handle_matches(tar_item_name, matches, timer() - t0)
                    if self.container_type(matches):
                        # get_buffers consumed the member; restart it for
                        # the nested container scan.
                        f.seek(0)
                        self.identify_contents(tar_item_name, f, self.container_type(matches))
    except tarfile.TarError:
        sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
646
-
647
def as_good_as_any(self, f1, match_list):
    """
    Tell whether format f1 is outranked by nothing in match_list.

    @param f1: the proposed format.
    @param match_list: list of (format, signature name) tuples.
    @return False when the PUID of f1 appears in
        self.puid_has_priority_over_map for the PUID of some other format
        in match_list, True otherwise (including for an empty list).
    """
    if match_list == []:
        return True
    own_puid = self.get_puid(f1)
    outranked = any(
        own_puid in self.puid_has_priority_over_map[self.get_puid(other)]
        for other, _signature in match_list
        if not f1 == other  # an entry never outranks itself
    )
    return not outranked
660
-
661
def buffered_read(self, file_pos, overlap):
    """
    Read one chunk of self.current_file starting at file_pos.

    @param file_pos: byte offset at which the chunk starts.
    @param overlap: when true the chunk is extended by self.overlap_range
        bytes beyond self.container_bufsize.
    @return the bytes read; shorter than the chunk size when fewer bytes
        remain before self.current_filesize.
    """
    if not overlap:
        bufsize = self.container_bufsize
    else:
        bufsize = self.container_bufsize + self.overlap_range
    # Never read past the recorded file size (and never pass a negative
    # count to read(), which would mean "read everything").
    remaining = max(self.current_filesize - file_pos, 0)
    # A full chunk is the chunk size computed above; the previous code read
    # self.bufsize here, which is the size of a different buffer.
    file_read = min(bufsize, remaining)
    with open(self.current_file, 'rb') as file_handle:
        file_handle.seek(file_pos)
        return file_handle.read(file_read)
679
-
680
def match_formats(self, bofbuffer, eofbuffer):
    """
    Run every format's signature patterns against the two buffers.

    @param bofbuffer: data from the beginning of the file.
    @param eofbuffer: data from the end of the file.
    @return a list of (format, signature name) tuples from which entries
        that self.as_good_as_any rejects have been removed.

    A signature matches when none of its patterns fails: BOF patterns are
    anchored at the start of bofbuffer, EOF patterns are searched in
    eofbuffer, VAR and IFB patterns are searched in bofbuffer, and any
    other position is not checked. An exception raised while processing a
    format is written to stderr and that format is abandoned.
    """
    self.current_count += 1
    found = []
    for candidate in self.formats:
        try:
            self.current_format = candidate
            if not self.as_good_as_any(candidate, found):
                continue
            for signature in self.get_signatures(candidate):
                self.current_sig = signature
                for pattern in self.get_patterns(signature):
                    self.current_pat = pattern
                    position = self.get_pos(pattern)
                    regex = self.get_regex(pattern)
                    if position == 'BOF':
                        hit = re.match(regex, bofbuffer)
                    elif position == 'EOF':
                        hit = re.search(regex, eofbuffer)
                    elif position in ('VAR', 'IFB'):
                        hit = re.search(regex, bofbuffer)
                    else:
                        continue  # unknown position: nothing to verify
                    if not hit:
                        break
                else:
                    # no pattern failed, so the whole signature matched
                    found.append((candidate, signature.findtext("name")))
        except Exception as e:
            # TODO: MdR: needs some <3
            sys.stderr.write(str(e) + "\n")
            continue
    return [entry for entry in found if self.as_good_as_any(entry[0], found)]
731
-
732
def match_extensions(self, filename):
    """
    Match formats on the file name extension alone.

    @param filename: name whose extension (lower-cased, without the dot)
        is compared with each format's 'extension' elements.
    @return a list of (format, name of self.externalsig) tuples, one per
        format declaring that extension, from which entries that
        self.as_good_as_any rejects have been removed. Empty when the name
        has no extension.
    """
    suffix = os.path.splitext(filename)[1].lower().lstrip(".")
    if not suffix:
        return []
    candidates = [
        (element, self.externalsig.findtext("name"))
        for element in self.formats
        if any(suffix == declared.text for declared in element.findall('extension'))
    ]
    return [pair for pair in candidates if self.as_good_as_any(pair[0], candidates)]
747
-
748
def copy_stream(self, source, target):
    """
    Copy source to target in pieces of at most self.bufsize.

    @param source: object with a read(size) method.
    @param target: object with a write(data) method.
    Copying stops at the first empty read; nothing is returned.
    """
    chunk = source.read(self.bufsize)
    while len(chunk) != 0:
        target.write(chunk)
        chunk = source.read(self.bufsize)
754
-
755
-
756
def list_files(roots, recurse=False):
    """
    Yield the files named by roots, one path at a time.

    @param roots: iterable of path strings, for example a list or a file
        object yielding one path per line. A single trailing newline is
        removed from each entry; blank entries are ignored.
    @param recurse: when false only the files directly inside a directory
        root are yielded; when true its subdirectories are walked as well.
    """
    for root in roots:
        # endswith() is safe on an empty string, unlike indexing root[-1].
        if root.endswith('\n'):
            root = root[:-1]
        if not root:
            # normpath('') is '.', so a blank line would otherwise scan the
            # current working directory.
            continue
        root = os.path.normpath(root)
        if os.path.isfile(root):
            yield root
        else:
            for path, unused, files in os.walk(root):
                for f in files:
                    yield os.path.join(path, f)
                if not recurse:
                    # os.walk visits the root first; stop after it
                    break
771
-
772
-
773
def _unescape_format(text):
    """
    Expand backslash escape sequences in a format string given on the command line.
    """
    try:
        return text.decode('string_escape')
    except AttributeError:
        # Python 3 str has no decode(); unicode_escape is the closest codec.
        import codecs
        return codecs.decode(text, 'unicode_escape')


def main(args=None):
    """
    Command line entry point: parse the options, configure a Fido instance
    and identify the requested files or stdin.

    @param args: list of command line arguments; sys.argv[1:] is used when
        it is empty or not given.

    Exits with status 1 after printing the help text when there are no
    arguments, with status 0 after printing the version for -v, and with
    status 1 on a keyboard interrupt. Raises RuntimeError when stdin
    content is combined with -zip.
    """
    if not args:
        args = sys.argv[1:]

    parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-v', default=False, action='store_true', help='show version information')
    parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
    parser.add_argument('-recurse', default=False, action='store_true', help='recurse into subdirectories')
    parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
    parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
    parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
    group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')

    parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
    parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
    parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
    parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
    parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
    parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['bufsize']) + ' bytes)')
    parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default=' + str(defaults['container_bufsize']) + ' bytes)')
    parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
    parser.add_argument('-confdir', default=CONFIG_DIR, help='configuration directory to load_fido_xml, for example, the format specifications from.')

    # Decide on the effective argument list, not on sys.argv, so that an
    # explicit main([...]) call is honoured whatever the process arguments.
    if not args:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(args)

    # time.clock() no longer exists on Python 3.8+; use perf_counter there.
    timer = getattr(time, 'clock', None) or time.perf_counter
    t0 = timer()

    versions = get_local_pronom_versions(args.confdir)

    defaults['xml_pronomSignature'] = versions.pronom_signature
    defaults['containersignature_file'] = versions.pronom_container_signature
    defaults['xml_fidoExtensionSignature'] = versions.fido_extension_signature
    defaults['format_files'] = [defaults['xml_pronomSignature']]

    if args.pronom_only:
        versionHeader = "FIDO v{0} ({1}, {2})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'])
    else:
        versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(__version__, defaults['xml_pronomSignature'], defaults['containersignature_file'], defaults['xml_fidoExtensionSignature'])
        defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])

    if args.v:
        sys.stdout.write(versionHeader)
        sys.exit(0)

    if args.matchprintf:
        args.matchprintf = _unescape_format(args.matchprintf)
    if args.nomatchprintf:
        args.nomatchprintf = _unescape_format(args.nomatchprintf)

    fido = Fido(
        quiet=args.q,
        bufsize=args.bufsize,
        container_bufsize=args.container_bufsize,
        printmatch=args.matchprintf,
        printnomatch=args.nomatchprintf,
        zip=args.zip,
        nocontainer=args.nocontainer,
        conf_dir=args.confdir)

    # TODO: Allow conf options to be dis-included
    if args.loadformats:
        for format_file in args.loadformats.split(','):
            fido.load_fido_xml(format_file)

    # TODO: remove from maps
    if args.useformats:
        args.useformats = args.useformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
    elif args.nouseformats:
        args.nouseformats = args.nouseformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]

    # Set up to use stdin, or open input files:
    input_list = None
    if args.input == '-':
        args.files = sys.stdin
    elif args.input:
        input_list = open(args.input, 'r')
        args.files = input_list

    # RUN
    try:
        if not args.q:
            sys.stderr.write(versionHeader)
            sys.stderr.flush()
        if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
            if fido.zip:
                raise RuntimeError("Multiple content read from stdin not yet supported.")
            else:
                fido.identify_stream(sys.stdin, args.filename)
        else:
            for path in list_files(args.files, args.recurse):
                fido.identify_file(path)
    except KeyboardInterrupt:
        msg = "FIDO: Interrupt while identifying file {0}"
        sys.stderr.write(msg.format(fido.current_file))
        sys.exit(1)
    finally:
        # The -input list file is ours to close; stdin is left alone.
        if input_list is not None:
            input_list.close()

    if not args.q:
        sys.stdout.flush()
        fido.print_summary(timer() - t0)
        sys.stderr.flush()
882
-
883
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()