libis-format 0.9.5-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.coveralls.yml +2 -0
- data/.gitignore +18 -0
- data/.travis.yml +41 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +39 -0
- data/Rakefile +8 -0
- data/bin/droid +15 -0
- data/bin/fido +12 -0
- data/bin/pdf_copy +13 -0
- data/data/ISOcoated_v2_eci.icc +0 -0
- data/data/PDFA_def.ps +40 -0
- data/data/ead.xsd +2728 -0
- data/data/eciRGB_v2.icc +0 -0
- data/data/lias_formats.xml +106 -0
- data/data/types.yml +217 -0
- data/lib/libis/format/config.rb +35 -0
- data/lib/libis/format/converter/base.rb +101 -0
- data/lib/libis/format/converter/chain.rb +167 -0
- data/lib/libis/format/converter/image_converter.rb +214 -0
- data/lib/libis/format/converter/office_converter.rb +50 -0
- data/lib/libis/format/converter/pdf_converter.rb +139 -0
- data/lib/libis/format/converter/repository.rb +98 -0
- data/lib/libis/format/converter.rb +11 -0
- data/lib/libis/format/droid.rb +45 -0
- data/lib/libis/format/fido.rb +102 -0
- data/lib/libis/format/identifier.rb +189 -0
- data/lib/libis/format/office_to_pdf.rb +52 -0
- data/lib/libis/format/pdf_copy.rb +40 -0
- data/lib/libis/format/pdf_merge.rb +41 -0
- data/lib/libis/format/pdf_split.rb +39 -0
- data/lib/libis/format/pdf_to_pdfa.rb +76 -0
- data/lib/libis/format/pdfa_validator.rb +61 -0
- data/lib/libis/format/type_database.rb +170 -0
- data/lib/libis/format/version.rb +5 -0
- data/lib/libis/format.rb +23 -0
- data/lib/libis-format.rb +1 -0
- data/libis-format.gemspec +34 -0
- data/spec/converter_spec.rb +212 -0
- data/spec/data/Cevennes2.bmp +0 -0
- data/spec/data/Cevennes2.jp2 +0 -0
- data/spec/data/Cevennes2.ppm +22492 -0
- data/spec/data/test-ead.xml +392 -0
- data/spec/data/test-jpg.tif +0 -0
- data/spec/data/test-lzw.tif +0 -0
- data/spec/data/test-options.jpg +0 -0
- data/spec/data/test.bmp +0 -0
- data/spec/data/test.doc +0 -0
- data/spec/data/test.docx +0 -0
- data/spec/data/test.gif +0 -0
- data/spec/data/test.jpg +0 -0
- data/spec/data/test.ods +0 -0
- data/spec/data/test.odt +0 -0
- data/spec/data/test.pdf +0 -0
- data/spec/data/test.pdf.tif +0 -0
- data/spec/data/test.png +0 -0
- data/spec/data/test.ps +8631 -0
- data/spec/data/test.psd +0 -0
- data/spec/data/test.rtf +1455 -0
- data/spec/data/test.tif +0 -0
- data/spec/data/test.txt +12 -0
- data/spec/data/test.xcf +0 -0
- data/spec/data/test.xls +0 -0
- data/spec/data/test.xlsx +0 -0
- data/spec/data/test.xml +4 -0
- data/spec/data/test_pdfa.pdf +0 -0
- data/spec/identifier_spec.rb +60 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/test_types.yml +12 -0
- data/spec/type_database_spec.rb +140 -0
- data/tools/PdfTool.jar +0 -0
- data/tools/bcpkix-jdk15on-1.49.jar +0 -0
- data/tools/bcprov-jdk15on-1.49.jar +0 -0
- data/tools/droid/DROID_SignatureFile_V82.xml +32681 -0
- data/tools/droid/container-signature-20150307.xml +2235 -0
- data/tools/droid/droid-command-line-6.1.5.jar +0 -0
- data/tools/droid/droid.bat +154 -0
- data/tools/droid/droid.sh +138 -0
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/antlr-2.7.7.jar +0 -0
- data/tools/droid/lib/antlr-3.2.jar +0 -0
- data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.4.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
- data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-help-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
- data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
- data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
- data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.7.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -0
- data/tools/fido/argparselocal.py +2355 -0
- data/tools/fido/conf/DROID_SignatureFile-v81.xml +2 -0
- data/tools/fido/conf/container-signature-20150307.xml +2238 -0
- data/tools/fido/conf/dc.xsd +119 -0
- data/tools/fido/conf/dcmitype.xsd +53 -0
- data/tools/fido/conf/dcterms.xsd +383 -0
- data/tools/fido/conf/fido-formats.xsd +173 -0
- data/tools/fido/conf/format_extension_template.xml +105 -0
- data/tools/fido/conf/format_extensions.xml +498 -0
- data/tools/fido/conf/formats-v81.xml +38355 -0
- data/tools/fido/conf/pronom-xml-v81.zip +0 -0
- data/tools/fido/conf/versions.xml +8 -0
- data/tools/fido/fido.bat +4 -0
- data/tools/fido/fido.py +854 -0
- data/tools/fido/fido.sh +5 -0
- data/tools/fido/prepare.py +616 -0
- data/tools/fido/pronomutils.py +115 -0
- data/tools/fido/toxml.py +52 -0
- data/tools/fido/update_signatures.py +171 -0
- data/tools/pdfbox/pdfbox-app-1.8.10.jar +0 -0
- data/tools/pdfbox/preflight-app-1.8.10.jar +0 -0
- metadata +396 -0
data/tools/fido/fido.py
ADDED
@@ -0,0 +1,854 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
import sys, re, os, time, math
|
4
|
+
import hashlib, urllib, urlparse, csv, getopt
|
5
|
+
from xml.etree import cElementTree as ET
|
6
|
+
from xml.etree import ElementTree as CET
|
7
|
+
from xml.etree import ElementTree as VET # versions.xml
|
8
|
+
|
9
|
+
version = '1.3.1'

# Module-wide default settings.  Fido.__init__ falls back to these values for
# every constructor argument that is left as None.
defaults = {
    'bufsize': 128 * 1024,  # (bytes)
    # Assigned to re._MAXCACHE by Fido.__init__ -- presumably a number of
    # cached patterns rather than a byte count; TODO confirm.
    'regexcachesize': 2084,
    'conf_dir': os.path.join(os.path.dirname(__file__), 'conf'),
    'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
    'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
    'format_files': ['formats-v75.xml', 'format_extensions.xml'],
    'containersignature_file': 'container-signature-20150307.xml',
    # versions.xml is where fido.py reads version information
    # about which xml to load
    'versions_file': 'versions.xml',
    'container_bufsize': 512 * 1024,  # (bytes)
    # Fido.__init__ looks this key up when nocontainer is passed as None;
    # without it that lookup raises KeyError.
    'nocontainer': False,
    'description': """
Format Identification for Digital Objects (fido).
FIDO is a command-line tool to identify the file formats of digital objects.
It is designed for simple integration into automated work-flows.
""",
    'epilog': """
Open Planets Foundation (http://www.openplanetsfoundation.org)
See License.txt for license information.
Download from: https://github.com/openplanets/fido/releases
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
Author: Adam Farquhar (BL), 2010
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
FIDO uses the UK National Archives (TNA) PRONOM File Format
and Container descriptions.
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/"""
}
|
37
|
+
|
38
|
+
class Fido:
|
39
|
+
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=None, format_files=None, containersignature_file=None):
    """Create an identifier and load its signature files.

    Every argument left as None falls back to the module-level ``defaults``.
    @param quiet suppresses the summary written by print_summary.
    @param bufsize is the number of bytes read from each end of an object.
    @param container_bufsize is stored as self.container_bufsize.
    @param printnomatch / printmatch are %-style templates for print_matches.
    @param zip makes identify_file also look inside container files.
    @param nocontainer is stored as self.nocontainer.
    @param handle_matches is called with each result; defaults to print_matches.
    @param conf_dir is the directory holding the signature XML files.
    @param format_files lists the format XML files to load from conf_dir.
    @param containersignature_file is accepted but currently ignored: the
           value from ``defaults`` is always used.
    """
    self.quiet = quiet
    self.bufsize = defaults['bufsize'] if bufsize is None else bufsize
    self.container_bufsize = defaults['container_bufsize'] if container_bufsize is None else container_bufsize
    self.printmatch = defaults['printmatch'] if printmatch is None else printmatch
    self.printnomatch = defaults['printnomatch'] if printnomatch is None else printnomatch
    self.handle_matches = self.print_matches if handle_matches is None else handle_matches
    self.zip = zip
    # .get(): 'nocontainer' may be absent from defaults, and an explicit
    # nocontainer=None must not raise KeyError.
    self.nocontainer = defaults.get('nocontainer', False) if nocontainer is None else nocontainer
    self.conf_dir = defaults['conf_dir'] if conf_dir is None else conf_dir
    self.format_files = defaults['format_files'] if format_files is None else format_files
    # The containersignature_file argument is deliberately not honoured here.
    self.containersignature_file = defaults['containersignature_file']
    self.formats = []
    self.puid_format_map = {}
    self.puid_has_priority_over_map = {}
    # load signatures
    conf_path = os.path.abspath(self.conf_dir)
    for xml_file in self.format_files:
        self.load_fido_xml(os.path.join(conf_path, xml_file))
    self.load_container_signature(os.path.join(conf_path, self.containersignature_file))
    self.current_file = ''
    self.current_filesize = 0
    self.current_format = None
    self.current_sig = None
    self.current_pat = None
    self.current_count = 0  # Count of calls to match_formats
    re._MAXCACHE = defaults['regexcachesize']
    self.externalsig = ET.XML('<signature><name>External</name></signature>')
|
70
|
+
|
71
|
+
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
72
|
+
#_special = '$*+.?![]^\\{|}'
|
73
|
+
_special = '$()*+.?![]^\\{|}'
|
74
|
+
_hex = '0123456789abcdef'
|
75
|
+
def _escape_char(self,c):
|
76
|
+
if c in '\n':
|
77
|
+
return '\\n'
|
78
|
+
elif c == '\r':
|
79
|
+
return '\\r'
|
80
|
+
elif c in self._special:
|
81
|
+
return '\\' + c
|
82
|
+
else:
|
83
|
+
(high, low) = divmod(ord(c), 16)
|
84
|
+
return '\\x' + self._hex[high] + self._hex[low]
|
85
|
+
|
86
|
+
def escape(self,string):
|
87
|
+
"Escape characters in pattern that are non-printable, non-ascii, or special for regexes."
|
88
|
+
escaped = ''.join(c if c in self._ordinary else self._escape_char(c) for c in string)
|
89
|
+
return escaped
|
90
|
+
|
91
|
+
def convert_container_sequence(self,sig):
|
92
|
+
"""Parse the PRONOM container sequences
|
93
|
+
and convert to regular expressions
|
94
|
+
"""
|
95
|
+
seq = '(?s)'
|
96
|
+
inq = False
|
97
|
+
byt = False
|
98
|
+
rng = False
|
99
|
+
ror = False
|
100
|
+
for i in range(len(sig)):
|
101
|
+
if not inq and not rng:
|
102
|
+
if sig[i] == "'":
|
103
|
+
inq = True
|
104
|
+
continue
|
105
|
+
if sig[i] == " ":
|
106
|
+
continue
|
107
|
+
if sig[i] == "[":
|
108
|
+
seq += "("
|
109
|
+
rng = True
|
110
|
+
continue
|
111
|
+
if not byt:
|
112
|
+
seq += "\\x" + sig[i].lower()
|
113
|
+
byt = True
|
114
|
+
continue
|
115
|
+
if byt:
|
116
|
+
seq += sig[i].lower()
|
117
|
+
byt = False
|
118
|
+
continue
|
119
|
+
if inq:
|
120
|
+
if sig[i] == "'" and not rng:
|
121
|
+
inq = False
|
122
|
+
continue
|
123
|
+
seq += self.escape(sig[i])
|
124
|
+
continue
|
125
|
+
if rng:
|
126
|
+
if sig[i] == "]":
|
127
|
+
seq += ")"
|
128
|
+
rng = False
|
129
|
+
continue
|
130
|
+
if sig[i] != "-" and sig[i] != "'" and ror:
|
131
|
+
seq += self.escape(sig[i])
|
132
|
+
continue
|
133
|
+
if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and not byt:
|
134
|
+
seq += "\\x" + sig[i].lower()
|
135
|
+
byt = True
|
136
|
+
continue
|
137
|
+
if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and byt:
|
138
|
+
seq += sig[i].lower()
|
139
|
+
byt = False
|
140
|
+
continue
|
141
|
+
if sig[i] == "-" or sig[i] == " ":
|
142
|
+
seq += "|"
|
143
|
+
continue
|
144
|
+
if sig[i] == "'" and not ror:
|
145
|
+
ror = True
|
146
|
+
continue
|
147
|
+
if sig[i] == "'" and ror:
|
148
|
+
ror = False
|
149
|
+
continue
|
150
|
+
#print seq
|
151
|
+
return seq
|
152
|
+
|
153
|
+
def load_container_signature(self, containersignature_file):
|
154
|
+
"""Load the PRONOM container-signature file
|
155
|
+
and convert sequences to regular expressions
|
156
|
+
"""
|
157
|
+
tree = CET.parse(containersignature_file)
|
158
|
+
# load and have container signatures converted
|
159
|
+
self.sequenceSignature = {}
|
160
|
+
for signature in tree.getroot().findall('ContainerSignatures/ContainerSignature'):
|
161
|
+
signatureId = signature.get('Id')
|
162
|
+
signatureSequence = signature.findall('Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence')
|
163
|
+
self.sequenceSignature[signatureId] = []
|
164
|
+
for sequence in signatureSequence:
|
165
|
+
self.sequenceSignature[signatureId].append(self.convert_container_sequence(sequence[0].text))
|
166
|
+
# find PUIDs which trigger container matching
|
167
|
+
self.puidTriggers = {}
|
168
|
+
triggers = tree.find('TriggerPuids')
|
169
|
+
for puid in triggers.findall('TriggerPuid'):
|
170
|
+
self.puidTriggers[puid.get('Puid')] = True
|
171
|
+
# map PUID to container signatureId
|
172
|
+
self.puidMapping = {}
|
173
|
+
mappings = tree.find('FileFormatMappings')
|
174
|
+
for mapping in mappings.findall('FileFormatMapping'):
|
175
|
+
if mapping.get('signatureId') not in self.puidMapping:
|
176
|
+
self.puidMapping[mapping.get('signatureId')] = []
|
177
|
+
self.puidMapping[mapping.get('signatureId')].append(mapping.get('Puid'))
|
178
|
+
# print "sequences:\n",self.sequenceSignature
|
179
|
+
# print "trigger:\n",self.puidTriggers
|
180
|
+
# print "mapping:\n",self.puidMapping
|
181
|
+
# exit()
|
182
|
+
|
183
|
+
def load_fido_xml(self, file):
|
184
|
+
"""Load the fido format information from @param file.
|
185
|
+
As a side-effect, set self.formats
|
186
|
+
@return list of ElementTree.Element, one for each format.
|
187
|
+
"""
|
188
|
+
tree = ET.parse(file)
|
189
|
+
#print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
|
190
|
+
#TODO: Handle empty regexes properly; perhaps remove from the format list
|
191
|
+
for element in tree.getroot().findall('./format'):
|
192
|
+
puid = self.get_puid(element)
|
193
|
+
# Handle over-writes in multiple file loads
|
194
|
+
existing = self.puid_format_map.get(puid, False)
|
195
|
+
if existing:
|
196
|
+
# Already have one, so replace old with new!
|
197
|
+
self.formats[self.formats.index(existing)] = element
|
198
|
+
else:
|
199
|
+
self.formats.append(element)
|
200
|
+
self.puid_format_map[puid] = element
|
201
|
+
# Build some structures to speed things up
|
202
|
+
self.puid_has_priority_over_map[puid] = frozenset([puid_element.text for puid_element in element.findall('has_priority_over')])
|
203
|
+
return self.formats
|
204
|
+
|
205
|
+
# To delete a format: (1) remove from self.formats, (2) remove from self.puid_format_map, (3) remove from self.puid_has_priority_over_map
|
206
|
+
def get_signatures(self, format):
    """Return the list of <signature> child elements of @param format."""
    signatures = format.findall('signature')
    return signatures
|
208
|
+
|
209
|
+
def has_priority_over(self, format, possibly_inferior):
|
210
|
+
return self.get_puid(possibly_inferior)in self.puid_has_priority_over_map[self.get_puid(format)]
|
211
|
+
|
212
|
+
def get_puid(self, format):
    """Return the text of the <puid> child of @param format."""
    puid_element = format.find('puid')
    return puid_element.text
|
214
|
+
|
215
|
+
def get_patterns(self, signature):
    """Return the list of <pattern> child elements of @param signature."""
    patterns = signature.findall('pattern')
    return patterns
|
217
|
+
|
218
|
+
def get_pos(self, pat):
    """Return the text of the <position> child of @param pat."""
    position_element = pat.find('position')
    return position_element.text
|
220
|
+
|
221
|
+
def get_regex(self, pat):
    """Return the text of the <regex> child of @param pat."""
    regex_element = pat.find('regex')
    return regex_element.text
|
223
|
+
|
224
|
+
def get_extension(self, format):
    """Return the text of the first <extension> child of @param format."""
    extension_element = format.find('extension')
    return extension_element.text
|
226
|
+
|
227
|
+
def print_matches(self, fullname, matches, delta_t, matchtype=''):
|
228
|
+
"""The default match handler. Prints out information for each match in the list.
|
229
|
+
@param fullname is name of the file being matched
|
230
|
+
@param matches is a list of (format, signature)
|
231
|
+
@param delta_t is the time taken for the match.
|
232
|
+
@param matchtype is the type of match (signature, containersignature, extension, fail)
|
233
|
+
"""
|
234
|
+
class Info:
|
235
|
+
pass
|
236
|
+
obj = Info()
|
237
|
+
obj.count = self.current_count
|
238
|
+
obj.group_size = len(matches)
|
239
|
+
obj.filename = fullname
|
240
|
+
obj.time = int(delta_t * 1000)
|
241
|
+
obj.filesize = self.current_filesize
|
242
|
+
obj.matchtype = matchtype
|
243
|
+
if len(matches) == 0:
|
244
|
+
sys.stdout.write(self.printnomatch % { "info.time" : obj.time, "info.filesize" : obj.filesize, "info.filename" : obj.filename, "info.count"
|
245
|
+
: obj.count, "info.matchtype" : "fail" } )
|
246
|
+
else:
|
247
|
+
i = 0
|
248
|
+
for (f, s) in matches:
|
249
|
+
i += 1
|
250
|
+
obj.group_index = i
|
251
|
+
obj.puid = self.get_puid(f)
|
252
|
+
obj.formatname = f.find('name').text
|
253
|
+
obj.signaturename = s.find('name').text
|
254
|
+
mime = f.find('mime')
|
255
|
+
obj.mimetype = mime.text if mime != None else None
|
256
|
+
version = f.find('version')
|
257
|
+
obj.version = version.text if version != None else None
|
258
|
+
alias = f.find('alias')
|
259
|
+
obj.alias = alias.text if alias != None else None
|
260
|
+
apple_uti = f.find('apple_uid')
|
261
|
+
obj.apple_uti = apple_uti.text if apple_uti != None else None
|
262
|
+
sys.stdout.write(self.printmatch % { "info.time" : obj.time, "info.puid" : obj.puid, "info.formatname" : obj.formatname, "info.signaturename" : obj.signaturename, "info.filesize" : obj.filesize, "info.filename" : obj.filename, "info.mimetype" : obj.mimetype, "info.matchtype" : obj.matchtype, "info.version" : obj.version, "info.alias" : obj.alias, "info.apple_uti" : obj.apple_uti, "info.group_size" : obj.group_size, "info.group_index" : obj.group_index, "info.count" : obj.count })
|
263
|
+
|
264
|
+
def print_summary(self, secs):
|
265
|
+
"""Print summary information on the number of matches and time taken.
|
266
|
+
"""
|
267
|
+
count = self.current_count
|
268
|
+
if not self.quiet:
|
269
|
+
rate = (int(round(count / secs)) if secs != 0 else 9999)
|
270
|
+
#print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
|
271
|
+
sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
|
272
|
+
|
273
|
+
def identify_file(self, filename):
    """Identify the type of @param filename.

    Call self.handle_matches instead of returning a value.
    Signature matches are reported with self.matchtype; when nothing matches
    or the file is empty, extension matches are reported instead.  When
    self.zip is set the file is afterwards examined as a container.
    An IOError is reported on stderr rather than raised.
    """
    self.current_file = filename
    self.matchtype = "signature"
    try:
        t0 = time.clock()
        f = open(filename, 'rb')
        try:
            size = os.stat(filename)[6]
            self.current_filesize = size
            if self.current_filesize == 0:
                sys.stderr.write("FIDO: Zero byte file (empty): Path is: {0}\n".format(filename))
            bofbuffer, eofbuffer = self.get_buffers(f, size, seekable=True)
        finally:
            # Only the two buffers are needed from here on; release the
            # handle instead of leaking it.
            f.close()
        matches = self.match_formats(bofbuffer, eofbuffer)
        # from here is also repeated in walk_zip
        # we should make this uniform in a next version!
        #
        # filesize is made conditional because files with 0 bytes
        # are falsely characterised being 'rtf' (due to wacky sig)
        # in these cases we try to match the extension instead
        if len(matches) > 0 and self.current_filesize > 0:
            self.handle_matches(filename, matches, time.clock() - t0, self.matchtype)
        else:
            matches = self.match_extensions(filename)
            self.handle_matches(filename, matches, time.clock() - t0, "extension")
        # till here matey!
        if self.zip:
            self.identify_contents(filename, type=self.container_type(matches))
    except IOError:
        sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
|
305
|
+
|
306
|
+
def identify_contents(self, filename, fileobj=None, type=False):
|
307
|
+
"""Identify each item in a container (such as a zip or tar file). Call self.handle_matches on each item.
|
308
|
+
@param fileobj could be a file, or a stream.
|
309
|
+
"""
|
310
|
+
if type == False:
|
311
|
+
return
|
312
|
+
elif type == 'zip':
|
313
|
+
self.walk_zip(filename, fileobj)
|
314
|
+
elif type == 'tar':
|
315
|
+
self.walk_tar(filename, fileobj)
|
316
|
+
else: # TODO: ouch!
|
317
|
+
raise RuntimeError("Unknown container type: " + repr(type))
|
318
|
+
|
319
|
+
def identify_multi_object_stream(self, stream):
    """Does not work!

    Stream may contain one or more objects each with an HTTP style header that must include content-length.
    The headers consist of keyword:value pairs terminated by a newline. There must be a newline following the headers.
    Each object is reported through self.handle_matches; the method returns
    when no content-length header could be read.
    """
    offset = 0
    while True:
        t0 = time.clock()
        content_length = -1
        for line in stream:
            offset += len(line)
            if line == '\n':
                # A blank line ends the header block.
                if content_length < 0:
                    raise EnvironmentError("No content-length provided.")
                break
            fields = line.lower().split(':', 2)
            if fields[0] == 'content-length':
                content_length = int(fields[1])
        if content_length == -1:
            return
        # Consume exactly content-length bytes
        self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
        self.current_filesize = content_length
        bofbuffer, eofbuffer = self.get_buffers(stream, content_length)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # MdR: this needs attention
        if len(matches) > 0:
            match_kind = "signature"
        else:
            matches = self.match_extensions(self.current_file)
            match_kind = "extension"
        self.handle_matches(self.current_file, matches, time.clock() - t0, match_kind)
|
351
|
+
|
352
|
+
def identify_stream(self, stream, filename):
    """Identify the type of @param stream.

    Call self.handle_matches instead of returning a value.
    Does not close stream.
    @param filename is used for extension matching when the name behind
           STDIN cannot be determined; it may be None.
    """
    t0 = time.clock()
    bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
    self.current_filesize = bytes_read
    self.current_file = 'STDIN'
    matches = self.match_formats(bofbuffer, eofbuffer)
    # MdR: this needs attention
    if len(matches) > 0:
        self.handle_matches(self.current_file, matches, time.clock() - t0, "signature")
        return
    # we can only determine the filename from the STDIN stream
    # on Linux, on Windows there is not a (simple) way to do that
    if os.name != "nt":
        try:
            self.current_file = os.readlink("/proc/self/fd/0")
        except (OSError, AttributeError):
            # No /proc entry (OSError) or no os.readlink on this platform
            # (AttributeError).  Catching only these keeps KeyboardInterrupt
            # and SystemExit from being swallowed.
            if filename is not None:
                self.current_file = filename
            else:
                self.current_file = 'STDIN'
    elif filename is not None:
        self.current_file = filename
    matches = self.match_extensions(self.current_file)
    # we have to reset self.current_file if not on Windows
    if os.name != "nt":
        self.current_file = 'STDIN'
    self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
|
384
|
+
|
385
|
+
def container_type(self, matches):
    """Determine if one of the @param matches is the format of a container that we can look inside of (e.g., zip, tar).

    @param matches is a list of (format, signature) pairs.
    @return the text of the first <container> element found, else False.
    """
    for fmt, _unused in matches:
        node = fmt.find('container')
        if node is not None:
            return node.text
    return False
|
394
|
+
|
395
|
+
def blocking_read(self, file, bytes_to_read):
    """Read up to @param bytes_to_read from @param file, looping over short reads.

    @param file is any object with a read(size) method.
    @return the data read; shorter than requested only when EOF is reached.
    The result has the type returned by file.read (str or bytes); when
    nothing is requested at all, '' is returned.
    """
    chunks = []
    empty = ''
    remaining = bytes_to_read
    while remaining > 0:
        chunk = file.read(remaining)
        # Remember an empty value of the stream's own type so that joining
        # works for text and binary streams alike.
        empty = chunk[:0]
        # break out if EOF is reached.
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    # Join once instead of growing a string inside the loop.
    return empty.join(chunks)
|
406
|
+
|
407
|
+
def get_buffers(self, stream, length=None, seekable=False):
|
408
|
+
"""Return buffers from the beginning and end of stream and the number of bytes read
|
409
|
+
if there may be more bytes in the stream.
|
410
|
+
|
411
|
+
If length is None, return the length as found.
|
412
|
+
If seekable is False, the steam does not support a seek operation.
|
413
|
+
"""
|
414
|
+
bytes_to_read = self.bufsize if length == None else min(length, self.bufsize)
|
415
|
+
bofbuffer = self.blocking_read(stream, bytes_to_read)
|
416
|
+
bytes_read = len(bofbuffer)
|
417
|
+
if length == None:
|
418
|
+
# A stream with unknown length; have to keep two buffers around
|
419
|
+
prevbuffer = bofbuffer
|
420
|
+
while True:
|
421
|
+
buffer = self.blocking_read(stream, self.bufsize)
|
422
|
+
bytes_read += len(buffer)
|
423
|
+
if len(buffer) == self.bufsize:
|
424
|
+
prevbuffer = buffer
|
425
|
+
else:
|
426
|
+
eofbuffer = prevbuffer if len(buffer) == 0 else prevbuffer[-(self.bufsize - len(buffer)):] + buffer
|
427
|
+
break
|
428
|
+
return bofbuffer, eofbuffer, bytes_read
|
429
|
+
else:
|
430
|
+
bytes_unread = length - len(bofbuffer)
|
431
|
+
if bytes_unread == 0:
|
432
|
+
eofbuffer = bofbuffer
|
433
|
+
elif bytes_unread < self.bufsize:
|
434
|
+
# The buffs overlap
|
435
|
+
eofbuffer = bofbuffer[bytes_unread:] + self.blocking_read(stream, bytes_unread)
|
436
|
+
elif bytes_unread == self.bufsize:
|
437
|
+
eofbuffer = self.blocking_read(stream, self.bufsize)
|
438
|
+
elif seekable: # easy case when we can just seek!
|
439
|
+
stream.seek(length - self.bufsize)
|
440
|
+
eofbuffer = self.blocking_read(stream, self.bufsize)
|
441
|
+
else:
|
442
|
+
# We have more to read and know how much.
|
443
|
+
# n*bufsize + r = length
|
444
|
+
(n, r) = divmod(bytes_unread, self.bufsize)
|
445
|
+
# skip n-1*bufsize bytes
|
446
|
+
for unused_i in xrange(1, n):
|
447
|
+
self.blocking_read(stream, self.bufsize)
|
448
|
+
# skip r bytes
|
449
|
+
self.blocking_read(stream, r)
|
450
|
+
# and read the remaining bufsize bytes into the eofbuffer
|
451
|
+
eofbuffer = self.blocking_read(stream, self.bufsize)
|
452
|
+
return bofbuffer, eofbuffer
|
453
|
+
|
454
|
+
def walk_zip(self, filename, fileobj=None):
    """Identify the type of each item in the zip

    @param fileobj.  If fileobj is not provided, open
    @param filename.
    Call self.handle_matches instead of returning a value.
    Items whose format is itself a container are copied to a temporary file
    and examined through self.identify_contents.  IOError and BadZipfile
    are reported on stderr rather than raised.
    """
    # IN 2.7+: with zipfile.ZipFile((fileobj if fileobj != None else filename), 'r') as stream:
    import zipfile, tempfile
    zipstream = None
    try:
        zipstream = zipfile.ZipFile((fileobj if fileobj is not None else filename), 'r')
        for item in zipstream.infolist():
            if item.file_size == 0:
                continue  #TODO: Find a better test for isdir
            t0 = time.clock()
            f = zipstream.open(item)
            try:
                item_name = filename + '!' + item.filename
                self.current_file = item_name
                self.current_filesize = item.file_size
                bofbuffer, eofbuffer = self.get_buffers(f, item.file_size)
            finally:
                f.close()
            matches = self.match_formats(bofbuffer, eofbuffer)
            if len(matches) > 0 and self.current_filesize > 0:
                self.handle_matches(item_name, matches, time.clock() - t0, "signature")
            else:
                matches = self.match_extensions(item_name)
                self.handle_matches(item_name, matches, time.clock() - t0, "extension")
            nested_type = self.container_type(matches)
            if nested_type:
                target = tempfile.SpooledTemporaryFile(prefix='Fido')
                # Pre-bind so the finally clause never sees an unbound (or
                # stale) name when zipstream.open() itself fails.
                source = None
                try:
                    source = zipstream.open(item)
                    self.copy_stream(source, target)
                    #target.seek(0)
                    self.identify_contents(item_name, target, nested_type)
                finally:
                    if source is not None:
                        source.close()
                    # Release the temporary copy once the nested walk is done.
                    target.close()
    except (IOError, zipfile.BadZipfile):
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
    finally:
        if zipstream is not None:
            zipstream.close()
|
504
|
+
|
505
|
+
def walk_tar(self, filename, fileobj=None):
    """Identify the type of each regular file in a tar archive.

    @param filename  name of the archive; used to open it when fileobj is
                     not given and as the prefix of every reported item
                     name ("<filename>!<item>").
    @param fileobj   optional already-open file object holding the archive.

    Results are reported through self.handle_matches instead of being
    returned. Tar errors are reported on stderr and otherwise ignored.
    """
    import tarfile
    tarstream = None
    try:
        tarstream = tarfile.TarFile(filename, fileobj=fileobj, mode='r')
        for item in tarstream.getmembers():
            if not item.isfile():
                continue
            t0 = time.clock()
            f = tarstream.extractfile(item)
            try:
                tar_item_name = filename + '!' + item.name
                self.current_file = tar_item_name
                self.current_filesize = item.size
                bofbuffer, eofbuffer = self.get_buffers(f, item.size)
                matches = self.match_formats(bofbuffer, eofbuffer)
                self.handle_matches(tar_item_name, matches, time.clock() - t0)
                if self.container_type(matches):
                    # Nested archive: rewind the member and walk it too.
                    f.seek(0)
                    self.identify_contents(tar_item_name, f, self.container_type(matches))
            finally:
                # Release the member even when identification raises.
                f.close()
    except tarfile.TarError:
        sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
    finally:
        if tarstream is not None:
            tarstream.close()
|
533
|
+
|
534
|
+
def as_good_as_any(self, f1, match_list):
    """Tell whether the proposed format f1 survives against match_list.

    Returns False as soon as some other format in match_list lists f1's
    PUID among the PUIDs it has priority over; returns True otherwise
    (including for an empty match_list)."""
    if match_list == []:
        return True
    candidate_puid = self.get_puid(f1)
    outranked = any(
        candidate_puid in self.puid_has_priority_over_map[self.get_puid(other)]
        for (other, _sig) in match_list
        if not f1 == other  # a format never outranks itself
    )
    return not outranked
|
545
|
+
|
546
|
+
def buffered_read(self, file_pos, overlap):
    """Read one container-scan chunk of self.current_file.

    @param file_pos  byte offset at which the chunk starts.
    @param overlap   when true, the chunk is extended by
                     self.overlap_range bytes on top of
                     self.container_bufsize.
    @return the bytes read; shorter than the chunk size only when fewer
            bytes remain before self.current_filesize.
    """
    if overlap:
        bufsize = self.container_bufsize + self.overlap_range
    else:
        bufsize = self.container_bufsize
    # Read a full chunk, clipped to what is left of the file. The earlier
    # code fell back to self.bufsize here, which left most of every
    # container chunk unscanned.
    file_read = max(0, min(bufsize, self.current_filesize - file_pos))
    # The context manager guarantees the handle is closed on every path.
    with open(self.current_file, 'rb') as file_handle:
        file_handle.seek(file_pos)
        return file_handle.read(file_read)
|
563
|
+
|
564
|
+
def read_container(self,parent_buffer,parent_result):
    """Scan the current file with the PRONOM container sequences.

    The header of compound containers can be further away than the default
    128 KB buffer, especially with big files containing binary objects.
    This function reads containers in chunks of 512 KB
    (defaults['container_bufsize']) and inspects each chunk with the
    PRONOM container sequences. Each chunk smuggles in a piece of the
    previous chunk to avoid cutting a pattern in half at a chunk boundary.
    This is somewhat slower than reading the complete file at once, but
    prevents Fido from potentially crashing in the midst of scanning a
    very big file.

    @param parent_buffer  buffer scanned instead of the file when chunked
                          reading is disabled (see below).
    @param parent_result  match list passed to self.as_good_as_any to
                          filter container formats.
    @return list of (format, signature) tuples for container hits.

    NOTE (MdR): this piece of code is still a bit quirky as it does not
    yet take byte positions into account, which are available in the
    DROID container signature file.
    """
    container_result = []
    nobuffer = False
    overlap = False
    self.overlap_range = 512 # bytes
    container_hit = False  # never read again in this function
    passes = 1
    container_buffer = ""
    # TODO: find better way to handle zip contents
    # for now: ugly hack, but working
    # this slows down because the zip is re-opened on each item
    # if "!" is in filename, it is a zip item
    # if "!" in self.current_file:
    # import zipfile, tempfile
    # zip, item = self.current_file.split("!")
    # zipitem = tempfile.SpooledTemporaryFile(prefix='Fido')
    #with zipstream.open(item) as source:
    # try:
    # source = zipstream.open(item)
    # self.copy_stream(source, target)
    # target.seek(0)
    # self.identify_contents(item_name, target, self.container_type(matches))
    # finally:
    # source.close()
    #exit()
    # in case argument 'nocontainer' is set
    # read default bofbuffer
    # A single pass over parent_buffer is also used for files that fit in
    # self.bufsize and for content read from STDIN.
    if self.nocontainer or self.current_filesize <= self.bufsize or self.current_file == "STDIN":
        passes = 1
        nobuffer = True
    else:
        # NOTE(review): with Python 2 integer operands the division floors
        # before float() is applied - confirm the operand types.
        passes = int(float(self.current_filesize / self.container_bufsize) + 1)
    pos = 0
    for i in xrange(passes):
        if nobuffer is True:
            container_buffer = parent_buffer
        else:
            if i == 0:
                pos = 0
            else:
                # start overlap_range bytes before the chunk boundary
                pos = ((self.container_bufsize * i) - self.overlap_range)
                overlap = True
            container_buffer = self.buffered_read(pos, overlap)
        for (container_id,container_regexes) in self.sequenceSignature.iteritems():
            # set hitcounter in case a container entry
            # has more than one regex
            hitcounter = 0
            if len(container_regexes) > 0:
                for container_regex in container_regexes:
                    if re.search(container_regex, container_buffer):
                        hitcounter += 1
                        # if the hitcounter matches the number of regexes
                        # then it must be a positive hit, else continue
                        # to match the rest of the sequences
                        if hitcounter < len(container_regexes):
                            continue
                        self.matchtype = "container"
                        for container_puid in self.puidMapping[container_id]:
                            for container_format in self.formats:
                                if container_format.find('puid').text == container_puid:
                                    if self.as_good_as_any(container_format, parent_result):
                                        for container_sig in self.get_signatures(container_format):
                                            container_result.append((container_format, container_sig))
                                    # NOTE(review): indentation of this break was
                                    # reconstructed (the source lost its leading
                                    # whitespace); confirm which loop it exits.
                                    break
    return container_result
|
641
|
+
|
642
|
+
def match_formats(self, bofbuffer, eofbuffer):
    """Apply the patterns for formats to the supplied buffers.

    @param bofbuffer  buffer matched by BOF, VAR and IFB patterns.
    @param eofbuffer  buffer matched by EOF patterns.
    @return a match list of (format, signature) tuples.
    The list has inferior matches removed.

    Also updates self.current_count, self.current_format,
    self.current_sig and self.current_pat while scanning. Any exception
    raised while testing a format is written to stderr and that format is
    skipped.
    """
    self.current_count += 1
    #t0 = time.clock()
    result = []
    container_result = []
    for format in self.formats:
        try:
            self.current_format = format
            # only try formats not already outranked by an earlier match
            if self.as_good_as_any(format, result):
                for sig in self.get_signatures(format):
                    self.current_sig = sig
                    # a signature succeeds only if every pattern matches
                    success = True
                    for pat in self.get_patterns(sig):
                        self.current_pat = pat
                        pos = self.get_pos(pat)
                        regex = self.get_regex(pat)
                        #print 'trying ', regex
                        if pos == 'BOF':
                            # anchored at the start of the buffer (re.match)
                            if not re.match(regex, bofbuffer):
                                success = False
                                break
                        elif pos == 'EOF':
                            if not re.search(regex, eofbuffer):
                                success = False
                                break
                        elif pos == 'VAR':
                            # VAR and IFB are both searched in bofbuffer only
                            if not re.search(regex, bofbuffer):
                                success = False
                                break
                        elif pos == 'IFB':
                            if not re.search(regex, bofbuffer):
                                success = False
                                break
                    if success:
                        result.append((format, sig))
                        # check if file needs to be parsed with container signature
                        # we skip files with extension "zip" (x-fmt/263)
                        ext = os.path.splitext(self.current_file)[1].lower().lstrip(".")
                        if format.find('puid').text in self.puidTriggers and ext != "zip":
                            container_result = self.read_container(bofbuffer,result)
                            if len(container_result) != 0:
                                for (k,v) in container_result:
                                    result.append((k,v))
                        # NOTE(review): indentation of this break was
                        # reconstructed (the source lost its leading
                        # whitespace); here it stops trying further
                        # signatures of this format - confirm.
                        break
        except Exception as e:
            sys.stderr.write(str(e)+"\n")
            continue
            # TODO: MdR: needs some <3
            #print "Unexpected error:", sys.exc_info()[0], e
            #sys.stdout.write('***', self.get_puid(format), regex)

    # t1 = time.clock()
    # if t1 - t0 > 0.02:
    # print >> sys.stderr, "FIDO: Slow ID", self.current_file
    # drop every match that another match in the list has priority over
    result = [match for match in result if self.as_good_as_any(match[0], result)]
    result = list(set(result)) # remove duplicate results, this is due to ??? in self.read_container(), needs fix
    return result
|
703
|
+
|
704
|
+
def match_extensions(self, filename):
    "Return (format, self.externalsig) for each format declaring the filename's extension, inferior matches removed."
    suffix = os.path.splitext(filename)[1].lower().lstrip(".")
    candidates = []
    if suffix:
        for fmt in self.formats:
            declared = fmt.findall('extension')
            # one entry per format, even if several <extension> nodes match
            if declared is not None and any(suffix == ext.text for ext in declared):
                candidates.append((fmt, self.externalsig))
    return [pair for pair in candidates if self.as_good_as_any(pair[0], candidates)]
|
717
|
+
|
718
|
+
def copy_stream(self, source, target):
    """Copy source to target in reads of self.bufsize until source is exhausted."""
    chunk = source.read(self.bufsize)
    while len(chunk) != 0:
        target.write(chunk)
        chunk = source.read(self.bufsize)
|
724
|
+
|
725
|
+
def list_files(roots, recurse=False):
    """Return the files one at a time. Roots could be a fileobj or a list.

    Each root is a path, optionally ending in a line terminator (as when
    read from a list file). A root that is a file is yielded as is; any
    other root is walked as a directory, descending into subdirectories
    only when recurse is true. Blank entries are skipped.
    """
    for root in roots:
        # Drop the line terminator; also copes with CRLF list files.
        root = root.rstrip('\r\n')
        if not root:
            # A blank line used to raise IndexError on root[-1]; normpath('')
            # would otherwise turn it into the current directory.
            continue
        root = os.path.normpath(root)
        if os.path.isfile(root):
            yield root
        else:
            for path, unused, files in os.walk(root):
                for f in files:
                    yield os.path.join(path, f)
                if recurse == False:
                    # only the top-level directory when not recursing
                    break
|
738
|
+
|
739
|
+
def main(arglist=None):
    """Command-line entry point: parse arguments and identify the given files.

    @param arglist  list of command-line arguments; defaults to
                    sys.argv[1:]. An empty list shows the help text.

    Exits with status 1 when versions.xml cannot be loaded or when
    interrupted from the keyboard, and with status 0 after printing the
    version information (-v).
    """
    # The argparse package was introduced in 2.7
    t0 = time.clock()
    from argparselocal import ArgumentParser, RawTextHelpFormatter
    if arglist is None:
        arglist = sys.argv[1:]
    if not arglist:
        # Rebind rather than append, so a caller-supplied list is not mutated.
        arglist = ["-h"]
    parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-v', default=False, action='store_true', help='show version information')
    parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
    parser.add_argument('-recurse', default=False, action='store_true', help='recurse into subdirectories')
    parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
    parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
    parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
    group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')
    parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
    parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
    parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
    parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
    parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
    parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['bufsize'])+' bytes)')
    parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['container_bufsize'])+' bytes)')

    parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
    parser.add_argument('-confdir', default=None, help='configuration directory to load_fido_xml, for example, the format specifications from.')

    # what is this doing here only once?
    #mydir = os.path.abspath(os.path.dirname(__file__))

    # PROCESS ARGUMENTS
    args = parser.parse_args(arglist)
    # print args
    # sys.exit()
    # process confdir
    # load versions.xml
    # and stick it in defaults
    if args.confdir:
        versionsFile = os.path.join(os.path.abspath(args.confdir), defaults['versions_file'])
    else:
        versionsFile = os.path.join(os.path.abspath(defaults['conf_dir']), defaults['versions_file'])
    try:
        versions = VET.parse(versionsFile)
    except Exception as e:
        sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
        # A failed start-up must not look like success to the caller.
        sys.exit(1)
    defaults['xml_pronomSignature'] = versions.find("pronomSignature").text
    # defaults['xml_pronomContainerSignature'] = versions.find("pronomContainerSignature").text
    defaults['containersignature_file'] = versions.find("pronomContainerSignature").text
    defaults['xml_fidoExtensionSignature'] = versions.find("fidoExtensionSignature").text
    defaults['format_files'] = []
    defaults['format_files'].append(defaults['xml_pronomSignature'])
    if args.pronom_only:
        versionHeader = "FIDO v{0} ({1}, {2})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'])
    else:
        versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'],defaults['xml_fidoExtensionSignature'])
        defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])

    if args.v:
        sys.stdout.write(versionHeader)
        sys.exit(0)
    # Turn escapes such as "\n" typed on the command line into real characters.
    if args.matchprintf is not None:
        args.matchprintf = args.matchprintf.decode('string_escape')
    if args.nomatchprintf is not None:
        args.nomatchprintf = args.nomatchprintf.decode('string_escape')
    fido = Fido(quiet=args.q, bufsize=args.bufsize, container_bufsize=args.container_bufsize,
                printmatch=args.matchprintf, printnomatch=args.nomatchprintf, zip=args.zip, nocontainer=args.nocontainer, conf_dir=args.confdir)

    #TODO: Allow conf options to be dis-included
    if args.loadformats:
        for format_file in args.loadformats.split(','):
            fido.load_fido_xml(format_file)

    #TODO: remove from maps
    if args.useformats:
        args.useformats = args.useformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
    elif args.nouseformats:
        args.nouseformats = args.nouseformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]

    # Set up to use stdin, or open input files:
    input_handle = None  # list file opened here; closed once the run is over
    if args.input == '-':
        args.files = sys.stdin
    elif args.input:
        input_handle = open(args.input, 'r')
        args.files = input_handle

    # RUN
    try:
        if not args.q:
            sys.stderr.write(versionHeader)
            sys.stderr.flush()
        if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
            # A single "-" argument: the content to identify comes from stdin.
            if fido.zip == True:
                raise RuntimeError("Multiple content read from stdin not yet supported.")
            fido.identify_stream(sys.stdin, args.filename)
        else:
            for path in list_files(args.files, args.recurse):
                fido.identify_file(path)
    except KeyboardInterrupt:
        msg = "FIDO: Interrupt while identifying file {0}"
        sys.stderr.write(msg.format(fido.current_file))
        sys.exit(1)
    finally:
        if input_handle is not None:
            input_handle.close()

    if not args.q:
        sys.stdout.flush()
        fido.print_summary(time.clock() - t0)
        sys.stderr.flush()
|
852
|
+
|
853
|
+
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()
|