libis-format 0.9.5-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (207) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +18 -0
  4. data/.travis.yml +41 -0
  5. data/Gemfile +5 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +39 -0
  8. data/Rakefile +8 -0
  9. data/bin/droid +15 -0
  10. data/bin/fido +12 -0
  11. data/bin/pdf_copy +13 -0
  12. data/data/ISOcoated_v2_eci.icc +0 -0
  13. data/data/PDFA_def.ps +40 -0
  14. data/data/ead.xsd +2728 -0
  15. data/data/eciRGB_v2.icc +0 -0
  16. data/data/lias_formats.xml +106 -0
  17. data/data/types.yml +217 -0
  18. data/lib/libis/format/config.rb +35 -0
  19. data/lib/libis/format/converter/base.rb +101 -0
  20. data/lib/libis/format/converter/chain.rb +167 -0
  21. data/lib/libis/format/converter/image_converter.rb +214 -0
  22. data/lib/libis/format/converter/office_converter.rb +50 -0
  23. data/lib/libis/format/converter/pdf_converter.rb +139 -0
  24. data/lib/libis/format/converter/repository.rb +98 -0
  25. data/lib/libis/format/converter.rb +11 -0
  26. data/lib/libis/format/droid.rb +45 -0
  27. data/lib/libis/format/fido.rb +102 -0
  28. data/lib/libis/format/identifier.rb +189 -0
  29. data/lib/libis/format/office_to_pdf.rb +52 -0
  30. data/lib/libis/format/pdf_copy.rb +40 -0
  31. data/lib/libis/format/pdf_merge.rb +41 -0
  32. data/lib/libis/format/pdf_split.rb +39 -0
  33. data/lib/libis/format/pdf_to_pdfa.rb +76 -0
  34. data/lib/libis/format/pdfa_validator.rb +61 -0
  35. data/lib/libis/format/type_database.rb +170 -0
  36. data/lib/libis/format/version.rb +5 -0
  37. data/lib/libis/format.rb +23 -0
  38. data/lib/libis-format.rb +1 -0
  39. data/libis-format.gemspec +34 -0
  40. data/spec/converter_spec.rb +212 -0
  41. data/spec/data/Cevennes2.bmp +0 -0
  42. data/spec/data/Cevennes2.jp2 +0 -0
  43. data/spec/data/Cevennes2.ppm +22492 -0
  44. data/spec/data/test-ead.xml +392 -0
  45. data/spec/data/test-jpg.tif +0 -0
  46. data/spec/data/test-lzw.tif +0 -0
  47. data/spec/data/test-options.jpg +0 -0
  48. data/spec/data/test.bmp +0 -0
  49. data/spec/data/test.doc +0 -0
  50. data/spec/data/test.docx +0 -0
  51. data/spec/data/test.gif +0 -0
  52. data/spec/data/test.jpg +0 -0
  53. data/spec/data/test.ods +0 -0
  54. data/spec/data/test.odt +0 -0
  55. data/spec/data/test.pdf +0 -0
  56. data/spec/data/test.pdf.tif +0 -0
  57. data/spec/data/test.png +0 -0
  58. data/spec/data/test.ps +8631 -0
  59. data/spec/data/test.psd +0 -0
  60. data/spec/data/test.rtf +1455 -0
  61. data/spec/data/test.tif +0 -0
  62. data/spec/data/test.txt +12 -0
  63. data/spec/data/test.xcf +0 -0
  64. data/spec/data/test.xls +0 -0
  65. data/spec/data/test.xlsx +0 -0
  66. data/spec/data/test.xml +4 -0
  67. data/spec/data/test_pdfa.pdf +0 -0
  68. data/spec/identifier_spec.rb +60 -0
  69. data/spec/spec_helper.rb +9 -0
  70. data/spec/test_types.yml +12 -0
  71. data/spec/type_database_spec.rb +140 -0
  72. data/tools/PdfTool.jar +0 -0
  73. data/tools/bcpkix-jdk15on-1.49.jar +0 -0
  74. data/tools/bcprov-jdk15on-1.49.jar +0 -0
  75. data/tools/droid/DROID_SignatureFile_V82.xml +32681 -0
  76. data/tools/droid/container-signature-20150307.xml +2235 -0
  77. data/tools/droid/droid-command-line-6.1.5.jar +0 -0
  78. data/tools/droid/droid.bat +154 -0
  79. data/tools/droid/droid.sh +138 -0
  80. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  81. data/tools/droid/lib/activation-1.1.jar +0 -0
  82. data/tools/droid/lib/antlr-2.7.7.jar +0 -0
  83. data/tools/droid/lib/antlr-3.2.jar +0 -0
  84. data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
  85. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  86. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  87. data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
  88. data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
  89. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  90. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  91. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  92. data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
  93. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  94. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  95. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  96. data/tools/droid/lib/commons-codec-1.4.jar +0 -0
  97. data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
  98. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  99. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  100. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  101. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  102. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  103. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  104. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  105. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  106. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  107. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  108. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  109. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  110. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  111. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  112. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  113. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  114. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  115. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  116. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  117. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  118. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  119. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  120. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  121. data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
  122. data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
  123. data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
  124. data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
  125. data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
  126. data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
  127. data/tools/droid/lib/droid-help-6.1.5.jar +0 -0
  128. data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
  129. data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
  130. data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
  131. data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
  132. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  133. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  134. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  135. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  136. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  137. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  138. data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
  139. data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
  140. data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
  141. data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
  142. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  143. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  144. data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
  145. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  146. data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
  147. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  148. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  149. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  150. data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
  151. data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
  152. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  153. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  154. data/tools/droid/lib/jta-1.1.jar +0 -0
  155. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  156. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  157. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  158. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  159. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  160. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  161. data/tools/droid/lib/poi-3.7.jar +0 -0
  162. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  163. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  164. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  165. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  166. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  167. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  168. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  169. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  170. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  171. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  172. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  173. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  174. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  175. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  176. data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
  177. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  178. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  179. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  180. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  181. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  182. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  183. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  184. data/tools/droid/lib/xz-1.0.jar +0 -0
  185. data/tools/fido/__init__.py +0 -0
  186. data/tools/fido/argparselocal.py +2355 -0
  187. data/tools/fido/conf/DROID_SignatureFile-v81.xml +2 -0
  188. data/tools/fido/conf/container-signature-20150307.xml +2238 -0
  189. data/tools/fido/conf/dc.xsd +119 -0
  190. data/tools/fido/conf/dcmitype.xsd +53 -0
  191. data/tools/fido/conf/dcterms.xsd +383 -0
  192. data/tools/fido/conf/fido-formats.xsd +173 -0
  193. data/tools/fido/conf/format_extension_template.xml +105 -0
  194. data/tools/fido/conf/format_extensions.xml +498 -0
  195. data/tools/fido/conf/formats-v81.xml +38355 -0
  196. data/tools/fido/conf/pronom-xml-v81.zip +0 -0
  197. data/tools/fido/conf/versions.xml +8 -0
  198. data/tools/fido/fido.bat +4 -0
  199. data/tools/fido/fido.py +854 -0
  200. data/tools/fido/fido.sh +5 -0
  201. data/tools/fido/prepare.py +616 -0
  202. data/tools/fido/pronomutils.py +115 -0
  203. data/tools/fido/toxml.py +52 -0
  204. data/tools/fido/update_signatures.py +171 -0
  205. data/tools/pdfbox/pdfbox-app-1.8.10.jar +0 -0
  206. data/tools/pdfbox/preflight-app-1.8.10.jar +0 -0
  207. metadata +396 -0
@@ -0,0 +1,854 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import sys, re, os, time, math
4
+ import hashlib, urllib, urlparse, csv, getopt
5
+ from xml.etree import cElementTree as ET
6
+ from xml.etree import ElementTree as CET
7
+ from xml.etree import ElementTree as VET # versions.xml
8
+
9
version = '1.3.1'

# Module-wide fallback settings. Fido.__init__ uses the entry of the same
# name whenever the corresponding constructor argument is None.
defaults = {
    'bufsize': 128 * 1024,  # (bytes)
    # Assigned to re._MAXCACHE by Fido.__init__: this is a number of cached
    # compiled patterns, not a size in bytes.
    'regexcachesize': 2084,
    'conf_dir': os.path.join(os.path.dirname(__file__), 'conf'),
    'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
    'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
    # NOTE(review): presumably replaced at start-up with the file named in
    # versions.xml -- confirm that this stale default is never loaded as-is.
    'format_files': ['formats-v75.xml', 'format_extensions.xml'],
    'containersignature_file': 'container-signature-20150307.xml',
    # versions.xml is where fido.py reads version information
    # about which xml to load
    'versions_file': 'versions.xml',
    'container_bufsize': 512 * 1024,  # (bytes)
    # Fido.__init__ looks this key up when it is called with
    # nocontainer=None; without the entry that call raised KeyError.
    'nocontainer': False,
    'description': """
Format Identification for Digital Objects (fido).
FIDO is a command-line tool to identify the file formats of digital objects.
It is designed for simple integration into automated work-flows.
""",
    'epilog': """
Open Planets Foundation (http://www.openplanetsfoundation.org)
See License.txt for license information.
Download from: https://github.com/openplanets/fido/releases
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
Author: Adam Farquhar (BL), 2010
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
FIDO uses the UK National Archives (TNA) PRONOM File Format
and Container descriptions.
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/"""
}
37
+
38
+ class Fido:
39
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=None, format_files=None, containersignature_file=None):
    """Configure the identifier and load all signature files.

    Every argument left as None falls back to the module-level ``defaults``
    entry of the same name (``handle_matches`` falls back to
    ``self.print_matches``).

    @param quiet suppresses the summary written by print_summary
    @param bufsize number of bytes kept from the start and end of a file
    @param container_bufsize stored as self.container_bufsize
    @param printnomatch / printmatch %-style templates used by print_matches
    @param zip if true, identify_file also looks inside containers
    @param nocontainer stored as self.nocontainer
    @param handle_matches callback invoked with each identification result
    @param conf_dir directory holding the signature XML files
    @param format_files list of format XML file names inside conf_dir
    @param containersignature_file accepted for interface compatibility but
           ignored: the default container signature file is always loaded
    """
    self.quiet = quiet
    self.bufsize = defaults['bufsize'] if bufsize is None else bufsize
    self.container_bufsize = defaults['container_bufsize'] if container_bufsize is None else container_bufsize
    self.printmatch = defaults['printmatch'] if printmatch is None else printmatch
    self.printnomatch = defaults['printnomatch'] if printnomatch is None else printnomatch
    self.handle_matches = self.print_matches if handle_matches is None else handle_matches
    self.zip = zip
    # 'nocontainer' may be missing from defaults; indexing it directly raised
    # KeyError whenever a caller passed nocontainer=None.
    self.nocontainer = defaults.get('nocontainer', False) if nocontainer is None else nocontainer
    self.conf_dir = defaults['conf_dir'] if conf_dir is None else conf_dir
    self.format_files = defaults['format_files'] if format_files is None else format_files
    # The containersignature_file argument is deliberately not consulted.
    self.containersignature_file = defaults['containersignature_file']
    self.formats = []
    self.puid_format_map = {}
    self.puid_has_priority_over_map = {}
    # load signatures
    conf_base = os.path.abspath(self.conf_dir)
    for xml_file in self.format_files:
        self.load_fido_xml(os.path.join(conf_base, xml_file))
    self.load_container_signature(os.path.join(conf_base, self.containersignature_file))
    self.current_file = ''
    self.current_filesize = 0
    self.current_format = None
    self.current_sig = None
    self.current_pat = None
    self.current_count = 0  # Count of calls to match_formats
    # Enlarge the cache of compiled patterns kept by the re module.
    re._MAXCACHE = defaults['regexcachesize']
    self.externalsig = ET.XML('<signature><name>External</name></signature>')
70
+
71
# Characters that escape() copies into a regular expression unchanged.
_ordinary = frozenset(
    ' "#%&\',-/'
    '0123456789'
    ':;=@'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '_'
    'abcdefghijklmnopqrstuvwxyz'
    '~'
)
# Characters that _escape_char() prefixes with a backslash.
_special = '$()*+.?![]^\\{|}'
# Digits used by _escape_char() to spell a \xNN escape.
_hex = '0123456789abcdef'
75
def _escape_char(self, c):
    """Return the escaped spelling of the single character c.

    Newline and carriage return become ``\\n`` and ``\\r``, characters listed
    in self._special get a leading backslash, and everything else is written
    as a two-digit ``\\xNN`` hexadecimal escape.
    """
    if c in '\n':
        return '\\n'
    if c == '\r':
        return '\\r'
    if c in self._special:
        return '\\' + c
    code = ord(c)
    # High and low nibble of the character code, looked up in self._hex.
    return '\\x' + self._hex[code >> 4] + self._hex[code & 0x0F]
85
+
86
def escape(self, string):
    """Escape every character of string that is not in self._ordinary.

    Characters found in self._ordinary are kept as they are; all others are
    replaced by the result of self._escape_char.
    """
    pieces = []
    for ch in string:
        if ch in self._ordinary:
            pieces.append(ch)
        else:
            pieces.append(self._escape_char(ch))
    return ''.join(pieces)
90
+
91
def convert_container_sequence(self, sig):
    """Translate a container signature sequence into a regular expression.

    The sequence is scanned one character at a time:
      * outside quotes and brackets, characters are taken pairwise as hex
        digits and written as ``\\xNN``; spaces are skipped;
      * text between single quotes is passed through self.escape;
      * ``[``...``]`` becomes a group ``(``...``)`` in which ``-`` and
        unquoted spaces become ``|``.

    @return the regular expression, prefixed with ``(?s)``.
    """
    parts = ['(?s)']
    in_quote = False      # inside '...' outside a bracket
    half_byte = False     # first hex digit of a byte already written
    in_range = False      # inside [...]
    range_quote = False   # inside '...' within a bracket
    for ch in sig:
        if not in_quote and not in_range:
            if ch == "'":
                in_quote = True
            elif ch == " ":
                pass
            elif ch == "[":
                parts.append("(")
                in_range = True
            elif not half_byte:
                parts.append("\\x" + ch.lower())
                half_byte = True
            else:
                parts.append(ch.lower())
                half_byte = False
            continue
        if in_quote:
            if ch == "'" and not in_range:
                in_quote = False
            else:
                parts.append(self.escape(ch))
            continue
        if in_range:
            if ch == "]":
                parts.append(")")
                in_range = False
            elif ch == "'":
                range_quote = not range_quote
            elif ch == "-":
                parts.append("|")
            elif range_quote:
                parts.append(self.escape(ch))
            elif ch == " ":
                parts.append("|")
            elif half_byte:
                parts.append(ch.lower())
                half_byte = False
            else:
                parts.append("\\x" + ch.lower())
                half_byte = True
    return ''.join(parts)
152
+
153
def load_container_signature(self, containersignature_file):
    """Parse the container-signature file and fill three lookup tables.

    Sets:
      self.sequenceSignature  container signature Id -> list of regular
                              expressions built by convert_container_sequence
      self.puidTriggers       Puid -> True for every TriggerPuid element
      self.puidMapping        signatureId -> list of Puids taken from the
                              FileFormatMapping elements
    """
    doc = CET.parse(containersignature_file)
    subsequence_path = 'Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence'

    # Convert the first child of every SubSequence into a regular expression.
    self.sequenceSignature = {}
    for container_sig in doc.getroot().findall('ContainerSignatures/ContainerSignature'):
        self.sequenceSignature[container_sig.get('Id')] = [
            self.convert_container_sequence(sub[0].text)
            for sub in container_sig.findall(subsequence_path)
        ]

    # PUIDs which trigger container matching.
    self.puidTriggers = {}
    for trigger in doc.find('TriggerPuids').findall('TriggerPuid'):
        self.puidTriggers[trigger.get('Puid')] = True

    # Container signatureId -> PUIDs.
    self.puidMapping = {}
    for mapping in doc.find('FileFormatMappings').findall('FileFormatMapping'):
        self.puidMapping.setdefault(mapping.get('signatureId'), []).append(mapping.get('Puid'))
182
+
183
def load_fido_xml(self, file):
    """Load the fido format information from @param file.

    Each ``format`` element is added to self.formats. If a format with the
    same puid was loaded earlier, the new element takes over its position in
    the list. self.puid_format_map and self.puid_has_priority_over_map are
    updated for every element read.

    @return self.formats, the list of ElementTree elements, one per format.
    """
    tree = ET.parse(file)
    #TODO: Handle empty regexes properly; perhaps remove from the format list
    for element in tree.getroot().findall('./format'):
        puid = self.get_puid(element)
        # Handle over-writes in multiple file loads. Compare against None:
        # the truth value of an Element depends on its number of children
        # and testing it is deprecated.
        existing = self.puid_format_map.get(puid)
        if existing is not None:
            # Already have one, so replace old with new!
            self.formats[self.formats.index(existing)] = element
        else:
            self.formats.append(element)
        self.puid_format_map[puid] = element
        # Build some structures to speed things up
        self.puid_has_priority_over_map[puid] = frozenset(
            [inferior.text for inferior in element.findall('has_priority_over')])
    return self.formats
204
+
205
+ # To delete a format: (1) remove from self.formats, (2) remove from self.puid_format_map, (3) remove from self.puid_has_priority_over_map
206
def get_signatures(self, format):
    """Return the list of ``signature`` child elements of format."""
    signatures = format.findall('signature')
    return signatures
208
+
209
def has_priority_over(self, format, possibly_inferior):
    """Return True if format is recorded as having priority over possibly_inferior.

    The answer is looked up in self.puid_has_priority_over_map using the
    puids of both elements.
    """
    inferior_puid = self.get_puid(possibly_inferior)
    superior_puid = self.get_puid(format)
    return inferior_puid in self.puid_has_priority_over_map[superior_puid]
211
+
212
def get_puid(self, format):
    """Return the text of the ``puid`` child element of format."""
    puid_element = format.find('puid')
    return puid_element.text
214
+
215
def get_patterns(self, signature):
    """Return the list of ``pattern`` child elements of signature."""
    patterns = signature.findall('pattern')
    return patterns
217
+
218
def get_pos(self, pat):
    """Return the text of the ``position`` child element of pat."""
    position_element = pat.find('position')
    return position_element.text
220
+
221
def get_regex(self, pat):
    """Return the text of the ``regex`` child element of pat."""
    regex_element = pat.find('regex')
    return regex_element.text
223
+
224
def get_extension(self, format):
    """Return the text of the first ``extension`` child element of format."""
    extension_element = format.find('extension')
    return extension_element.text
226
+
227
def print_matches(self, fullname, matches, delta_t, matchtype=''):
    """The default match handler: write one line per match to stdout.

    With no matches a single line built from self.printnomatch is written
    and its match type is always "fail". Otherwise one line built from
    self.printmatch is written for every (format, signature) pair.

    @param fullname is name of the file being matched
    @param matches is a list of (format, signature)
    @param delta_t is the time taken for the match, in seconds; it is
           reported as whole milliseconds.
    @param matchtype is the type of match (signature, containersignature, extension, fail)
    """
    elapsed_ms = int(delta_t * 1000)
    group_size = len(matches)
    if group_size == 0:
        sys.stdout.write(self.printnomatch % {
            "info.time": elapsed_ms,
            "info.filesize": self.current_filesize,
            "info.filename": fullname,
            "info.count": self.current_count,
            "info.matchtype": "fail",
        })
        return

    def optional_text(parent, tag):
        # Text of an optional child element, or None when it is absent.
        node = parent.find(tag)
        return node.text if node is not None else None

    for group_index, (fmt, sig) in enumerate(matches, 1):
        puid = self.get_puid(fmt)
        format_name = fmt.find('name').text
        signature_name = sig.find('name').text
        mime_type = optional_text(fmt, 'mime')
        format_version = optional_text(fmt, 'version')
        alias = optional_text(fmt, 'alias')
        apple_uti = optional_text(fmt, 'apple_uid')
        sys.stdout.write(self.printmatch % {
            "info.time": elapsed_ms,
            "info.puid": puid,
            "info.formatname": format_name,
            "info.signaturename": signature_name,
            "info.filesize": self.current_filesize,
            "info.filename": fullname,
            "info.mimetype": mime_type,
            "info.matchtype": matchtype,
            "info.version": format_version,
            "info.alias": alias,
            "info.apple_uti": apple_uti,
            "info.group_size": group_size,
            "info.group_index": group_index,
            "info.count": self.current_count,
        })
263
+
264
def print_summary(self, secs):
    """Write the number of processed files and the elapsed time to stderr.

    Nothing is written when self.quiet is set. The files-per-second rate is
    reported as 9999 when secs is zero.
    """
    total = self.current_count
    if self.quiet:
        return
    if secs != 0:
        rate = int(round(total / secs))
    else:
        rate = 9999
    summary = 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (total, secs * 1000, rate)
    sys.stderr.write(summary)
272
+
273
def identify_file(self, filename):
    """Identify the type of @param filename.

    Call self.handle_matches instead of returning a value. When no signature
    matches, or the file is empty, the file extension is matched instead.
    If self.zip is set, the contents of a recognised container are
    identified as well. An IOError is reported on stderr and not re-raised.
    """
    self.current_file = filename
    self.matchtype = "signature"
    try:
        t0 = time.clock()
        # The context manager closes the handle again; previously it was
        # left open for every identified file.
        with open(filename, 'rb') as f:
            size = os.stat(filename)[6]
            self.current_filesize = size
            if self.current_filesize == 0:
                sys.stderr.write("FIDO: Zero byte file (empty): Path is: {0}\n".format(filename))
            bofbuffer, eofbuffer = self.get_buffers(f, size, seekable=True)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # from here is also repeated in walk_zip
        # we should make this uniform in a next version!
        #
        # filesize is made conditional because files with 0 bytes
        # are falsely characterised being 'rtf' (due to wacky sig)
        # in these cases we try to match the extension instead
        if len(matches) > 0 and self.current_filesize > 0:
            self.handle_matches(filename, matches, time.clock() - t0, self.matchtype)
        else:
            matches = self.match_extensions(filename)
            self.handle_matches(filename, matches, time.clock() - t0, "extension")
        if self.zip:
            self.identify_contents(filename, type=self.container_type(matches))
    except IOError:
        sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
305
+
306
def identify_contents(self, filename, fileobj=None, type=False):
    """Identify each item in a container (such as a zip or tar file).

    Dispatches to walk_zip or walk_tar according to @param type; nothing is
    done when type is False.
    @param fileobj could be a file, or a stream.
    @raise RuntimeError for any other container type.
    """
    if type == False:
        return
    if type == 'zip':
        self.walk_zip(filename, fileobj)
        return
    if type == 'tar':
        self.walk_tar(filename, fileobj)
        return
    # TODO: ouch!
    raise RuntimeError("Unknown container type: " + repr(type))
318
+
319
def identify_multi_object_stream(self, stream):
    """Does not work!
    Stream may contain one or more objects each with an HTTP style header that must include content-length.
    The headers consist of keyword:value pairs terminated by a newline. There must be a newline following the headers.

    For every object, self.handle_matches is called with the signature
    matches, or with the extension matches when no signature matched.
    @raise EnvironmentError if the headers end without a content-length.
    """
    position = 0
    while True:
        started = time.clock()
        content_length = -1
        # Read header lines up to the blank line that ends them.
        for header in stream:
            position += len(header)
            if header == '\n':
                if content_length < 0:
                    raise EnvironmentError("No content-length provided.")
                break
            fields = header.lower().split(':', 2)
            if fields[0] == 'content-length':
                content_length = int(fields[1])
        if content_length == -1:
            # Stream exhausted without another content-length header.
            return
        # Consume exactly content-length bytes
        self.current_file = 'STDIN!(at ' + str(position) + ' bytes)'
        self.current_filesize = content_length
        bofbuffer, eofbuffer = self.get_buffers(stream, content_length)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # MdR: this needs attention
        if len(matches) > 0:
            self.handle_matches(self.current_file, matches, time.clock() - started, "signature")
        else:
            matches = self.match_extensions(self.current_file)
            self.handle_matches(self.current_file, matches, time.clock() - started, "extension")
351
+
352
def identify_stream(self, stream, filename):
    """Identify the type of @param stream.

    Call self.handle_matches instead of returning a value.
    Does not close stream.

    When no signature matches, the extension of a file name is matched
    instead. On non-Windows systems that name is the target of
    /proc/self/fd/0 if it can be resolved, otherwise @param filename; on
    Windows it is @param filename. Results are always reported under the
    name 'STDIN' on non-Windows systems.
    """
    t0 = time.clock()
    bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
    self.current_filesize = bytes_read
    self.current_file = 'STDIN'
    matches = self.match_formats(bofbuffer, eofbuffer)
    # MdR: this needs attention
    if len(matches) > 0:
        self.handle_matches(self.current_file, matches, time.clock() - t0, "signature")
        return
    # we can only determine the filename from the STDIN stream
    # on Linux, on Windows there is not a (simple) way to do that
    on_windows = (os.name == "nt")
    if on_windows:
        if filename is not None:
            self.current_file = filename
    else:
        try:
            self.current_file = os.readlink("/proc/self/fd/0")
        except (OSError, AttributeError):
            # Only a failed or unavailable readlink is expected here; a bare
            # except would also have swallowed KeyboardInterrupt/SystemExit.
            self.current_file = filename if filename is not None else 'STDIN'
    matches = self.match_extensions(self.current_file)
    # we have to reset self.current_file if not on Windows
    if not on_windows:
        self.current_file = 'STDIN'
    self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
384
+
385
def container_type(self, matches):
    """Determine if one of the @param matches is the format of a container that we can look inside of (e.g., zip, tar).

    @return the text of the first ``container`` element found among the
    matched formats, or False when none of them has one.
    """
    for fmt, _signature in matches:
        node = fmt.find('container')
        if node is not None:
            return node.text
    return False
394
+
395
def blocking_read(self, file, bytes_to_read):
    """Read from file until bytes_to_read bytes are collected or EOF is hit.

    file.read is called repeatedly for the outstanding amount, so short
    reads are topped up. @return everything that was read.
    """
    data = ''
    while len(data) < bytes_to_read:
        chunk = file.read(bytes_to_read - len(data))
        data += chunk
        # An empty read means EOF was reached.
        if chunk == '':
            break
    return data
406
+
407
def get_buffers(self, stream, length=None, seekable=False):
    """Return the first and the last self.bufsize bytes of stream.

    If length is None the stream is read to its end and the result is
    (bofbuffer, eofbuffer, bytes_read). If length is given, the result is
    (bofbuffer, eofbuffer) and exactly length bytes are consumed, unless
    seekable is True, in which case the stream is positioned with seek().
    If seekable is False, the stream does not support a seek operation.
    """
    limit = self.bufsize

    if length is None:
        # A stream with unknown length; remember the last full block so the
        # tail can be completed from it once the final short block arrives.
        head = self.blocking_read(stream, limit)
        total = len(head)
        last_full = head
        while True:
            chunk = self.blocking_read(stream, limit)
            total += len(chunk)
            if len(chunk) != limit:
                break
            last_full = chunk
        if len(chunk) == 0:
            tail = last_full
        else:
            tail = last_full[-(limit - len(chunk)):] + chunk
        return head, tail, total

    head = self.blocking_read(stream, min(length, limit))
    remaining = length - len(head)
    if remaining == 0:
        tail = head
    elif remaining < limit:
        # The buffers overlap: reuse the end of the head buffer.
        tail = head[remaining:] + self.blocking_read(stream, remaining)
    elif remaining == limit:
        tail = self.blocking_read(stream, limit)
    elif seekable:
        # easy case when we can just seek!
        stream.seek(length - limit)
        tail = self.blocking_read(stream, limit)
    else:
        # We have more to read and know how much:
        # full_blocks * bufsize + remainder = remaining
        full_blocks, remainder = divmod(remaining, limit)
        # skip (full_blocks - 1) * bufsize bytes
        skipped = 1
        while skipped < full_blocks:
            self.blocking_read(stream, limit)
            skipped += 1
        # skip the remainder
        self.blocking_read(stream, remainder)
        # and read the final bufsize bytes into the tail buffer
        tail = self.blocking_read(stream, limit)
    return head, tail
453
+
454
def walk_zip(self, filename, fileobj=None):
    """Identify the type of each item in the zip
    @param fileobj. If fileobj is not provided, open
    @param filename.
    Call self.handle_matches instead of returning a value.

    Items with a recorded size of zero are skipped. An item that is itself
    a container is copied to a temporary file and walked recursively.
    IOError and zipfile.BadZipfile are reported on stderr and not re-raised.
    """
    import zipfile, tempfile
    zipstream = None
    try:
        zipstream = zipfile.ZipFile((fileobj if fileobj is not None else filename), 'r')
        for item in zipstream.infolist():
            if item.file_size == 0:
                continue  # TODO: Find a better test for isdir
            t0 = time.clock()
            f = zipstream.open(item)
            try:
                item_name = filename + '!' + item.filename
                self.current_file = item_name
                self.current_filesize = item.file_size
                if self.current_filesize == 0:
                    sys.stderr.write("FIDO: Zero byte file (empty): Path is: {0}\n".format(item_name))
                bofbuffer, eofbuffer = self.get_buffers(f, item.file_size)
            finally:
                f.close()
            matches = self.match_formats(bofbuffer, eofbuffer)
            if len(matches) > 0 and self.current_filesize > 0:
                self.handle_matches(item_name, matches, time.clock() - t0, "signature")
            else:
                matches = self.match_extensions(item_name)
                self.handle_matches(item_name, matches, time.clock() - t0, "extension")
            container = self.container_type(matches)
            if container:
                target = tempfile.SpooledTemporaryFile(prefix='Fido')
                try:
                    # Open the member before entering the inner try so that
                    # a failing open cannot reach a finally clause that
                    # refers to an unbound name.
                    source = zipstream.open(item)
                    try:
                        self.copy_stream(source, target)
                    finally:
                        source.close()
                    self.identify_contents(item_name, target, container)
                finally:
                    # The temporary copy was previously never closed.
                    target.close()
    except IOError:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
    except zipfile.BadZipfile:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
    finally:
        if zipstream is not None:
            zipstream.close()
504
+
505
def walk_tar(self, filename, fileobj):
    """Identify the type of each regular file in a tar archive.

    Each regular-file member is matched by signature and reported through
    self.handle_matches instead of being returned.  Members recognised as
    containers are rewound and passed on to self.identify_contents.

    @param filename: name of the archive; also the prefix of the
        'archive!member' names that are reported.
    @param fileobj: file object holding the archive.  Both values are handed
        to tarfile.TarFile, which opens filename when fileobj is None.
    """
    import tarfile
    # time.clock() no longer exists on Python 3.8+; use it when available so
    # timings are unchanged on the interpreters this module was written for.
    clock = getattr(time, 'clock', None) or time.perf_counter
    tarstream = None
    try:
        tarstream = tarfile.TarFile(filename, fileobj=fileobj, mode='r')
        for item in tarstream.getmembers():
            if not item.isfile():
                continue
            t0 = clock()
            f = tarstream.extractfile(item)
            try:
                tar_item_name = filename + '!' + item.name
                self.current_file = tar_item_name
                self.current_filesize = item.size
                bofbuffer, eofbuffer = self.get_buffers(f, item.size)
                matches = self.match_formats(bofbuffer, eofbuffer)
                self.handle_matches(tar_item_name, matches, clock() - t0)
                container = self.container_type(matches)
                if container:
                    f.seek(0)
                    self.identify_contents(tar_item_name, f, container)
            finally:
                # Release the member even when identification raises.
                f.close()
    except tarfile.TarError:
        sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
    finally:
        if tarstream is not None:
            tarstream.close()
533
+
534
def as_good_as_any(self, f1, match_list):
    """Tell whether the proposed format holds its own against match_list.

    @param f1: the proposed format.
    @param match_list: list of (format, signature) tuples already matched.
    @return False if the PUID of f1 appears in the
        puid_has_priority_over_map entry of any other format in match_list,
        i.e. that format has priority over f1; True otherwise, including
        when match_list is empty.
    """
    if match_list == []:
        return True
    candidate_puid = self.get_puid(f1)
    outranked = any(
        candidate_puid in self.puid_has_priority_over_map[self.get_puid(other)]
        for other, _sig in match_list
        if not f1 == other
    )
    return not outranked
545
+
546
def buffered_read(self, file_pos, overlap):
    """Read one chunk of self.current_file, starting at file_pos.

    @param file_pos: byte offset at which the chunk starts.
    @param overlap: when true the chunk is self.overlap_range bytes longer
        than self.container_bufsize, so it can include the tail of the
        previous chunk.
    @return the bytes read: at most one chunk, and never beyond
        self.current_filesize.
    """
    bufsize = self.container_bufsize
    if overlap:
        bufsize += self.overlap_range
    # Read the full chunk size computed above.  The previous code fell back
    # to self.bufsize here, which left part of every chunk unscanned whenever
    # self.bufsize is smaller than the container chunk.
    remaining = max(self.current_filesize - file_pos, 0)
    file_handle = open(self.current_file, 'rb')
    try:
        file_handle.seek(file_pos)
        return file_handle.read(min(bufsize, remaining))
    finally:
        file_handle.close()
563
+
564
def read_container(self, parent_buffer, parent_result):
    """Match the PRONOM container sequences against the current file.

    The header of compound containers can be further away than the default
    buffer, especially with big files containing binary objects.  Unless
    deep scanning is disabled, the file is therefore read in chunks of
    self.container_bufsize bytes and each chunk is inspected with the
    container sequences.  Every chunk after the first starts
    self.overlap_range bytes early, so that a pattern cut in two by a chunk
    boundary is still seen.  This is somewhat slower than reading the
    complete file at once, but avoids loading a very big file in one go.

    NOTE (MdR): this piece of code is still a bit quirky, as it does not yet
    take into account the byte positions that are available in the DROID
    container signature file.

    TODO: find a better way to handle zip contents.

    @param parent_buffer: buffer already read by the caller.  It is the only
        data inspected when self.nocontainer is set, when the file is not
        larger than self.bufsize, or when reading from STDIN.
    @param parent_result: matches found so far; a container format is only
        reported if self.as_good_as_any accepts it against this list.
    @return list of (format, signature) tuples, without duplicates.
    """
    container_result = []
    self.overlap_range = 512  # bytes; also read by self.buffered_read
    # Without deep scanning only the caller's buffer is inspected.
    nobuffer = bool(self.nocontainer
                    or self.current_filesize <= self.bufsize
                    or self.current_file == "STDIN")
    if nobuffer:
        passes = 1
    else:
        passes = self.current_filesize // self.container_bufsize + 1
    for i in range(passes):
        if nobuffer:
            container_buffer = parent_buffer
        elif i == 0:
            container_buffer = self.buffered_read(0, False)
        else:
            pos = (self.container_bufsize * i) - self.overlap_range
            container_buffer = self.buffered_read(pos, True)
        for (container_id, container_regexes) in self.sequenceSignature.items():
            if len(container_regexes) == 0:
                continue
            # A container entry can have more than one regex; it is only a
            # positive hit when every one of them is found in this chunk.
            if not all(re.search(container_regex, container_buffer)
                       for container_regex in container_regexes):
                continue
            self.matchtype = "container"
            for container_puid in self.puidMapping[container_id]:
                for container_format in self.formats:
                    if container_format.find('puid').text != container_puid:
                        continue
                    if self.as_good_as_any(container_format, parent_result):
                        for container_sig in self.get_signatures(container_format):
                            hit = (container_format, container_sig)
                            # The same sequence can match in several chunks;
                            # report each hit only once.
                            if hit not in container_result:
                                container_result.append(hit)
                        break
    return container_result
641
+
642
def match_formats(self, bofbuffer, eofbuffer):
    """Apply the patterns for formats to the supplied buffers.

    For each format that is still as good as any match found so far, the
    signatures are tried in order and the first one whose patterns all match
    is recorded.  'BOF' patterns must match at the start of bofbuffer, 'EOF'
    patterns are searched in eofbuffer, and 'VAR' and 'IFB' patterns are
    searched in bofbuffer; patterns with any other position are not tested.
    If the matched format's PUID is in self.puidTriggers and the current
    file does not have the extension "zip", the container signatures are
    applied too (self.read_container).

    An exception raised while testing a format is written to stderr and that
    format is skipped.

    @param bofbuffer: data from the beginning of the file.
    @param eofbuffer: data from the end of the file.
    @return a match list of (format, signature) tuples, in the order found.
        The list has inferior matches and duplicates removed.
    """
    self.current_count += 1
    result = []
    for fmt in self.formats:
        try:
            self.current_format = fmt
            if not self.as_good_as_any(fmt, result):
                continue
            for sig in self.get_signatures(fmt):
                self.current_sig = sig
                success = True
                for pat in self.get_patterns(sig):
                    self.current_pat = pat
                    pos = self.get_pos(pat)
                    regex = self.get_regex(pat)
                    if pos == 'BOF':
                        found = re.match(regex, bofbuffer)
                    elif pos == 'EOF':
                        found = re.search(regex, eofbuffer)
                    elif pos == 'VAR' or pos == 'IFB':
                        found = re.search(regex, bofbuffer)
                    else:
                        continue
                    if not found:
                        success = False
                        break
                if not success:
                    continue
                result.append((fmt, sig))
                # check if file needs to be parsed with container signature
                # we skip files with extension "zip" (x-fmt/263)
                ext = os.path.splitext(self.current_file)[1].lower().lstrip(".")
                if fmt.find('puid').text in self.puidTriggers and ext != "zip":
                    container_result = self.read_container(bofbuffer, result)
                    for (k, v) in container_result:
                        result.append((k, v))
                # The first signature that succeeds settles this format.
                break
        except Exception as e:
            # Deliberately best-effort: report the problem and carry on
            # with the next format.  TODO: MdR: needs some <3
            sys.stderr.write(str(e) + "\n")
            continue
    result = [match for match in result if self.as_good_as_any(match[0], result)]
    # Remove duplicate results while keeping the order in which they were
    # found; list(set(...)) would return them in arbitrary order.
    unique = []
    seen = set()
    for match in result:
        if match not in seen:
            seen.add(match)
            unique.append(match)
    return unique
703
+
704
def match_extensions(self, filename):
    """Match formats by the extension of filename.

    The extension is compared in lower case and without the leading dot.

    @param filename: name of the file; only its extension is used.
    @return the list of (format, self.externalsig) for every format that
        has an 'extension' element equal to the file's extension, with
        inferior matches removed.  Empty if the name has no extension.
    """
    myext = os.path.splitext(filename)[1].lower().lstrip(".")
    result = []
    if len(myext) > 0:
        for element in self.formats:
            # findall() always returns a list, so no None check is needed.
            if any(myext == extension.text for extension in element.findall('extension')):
                result.append((element, self.externalsig))
    return [match for match in result if self.as_good_as_any(match[0], result)]
717
+
718
def copy_stream(self, source, target):
    """Copy everything that can be read from source into target.

    Data is moved in pieces of at most self.bufsize; copying stops at the
    first empty read.  Neither stream is closed or rewound.
    """
    chunk = source.read(self.bufsize)
    while len(chunk) != 0:
        target.write(chunk)
        chunk = source.read(self.bufsize)
724
+
725
def list_files(roots, recurse=False):
    """Return the files one at a time.

    @param roots: a file object or a list; each entry is a path, optionally
        ending in a newline (as when reading a list of names from a file).
        Empty entries and blank lines are skipped.
    @param recurse: if true, files in all subdirectories of a directory
        root are returned too; otherwise only the files directly inside it.
    @return a generator of paths: the root itself if it is a file,
        otherwise the files found under it.
    """
    for root in roots:
        if root.endswith('\n'):
            root = root[:-1]
        if not root:
            # An empty name would be normalised to '.' and list the
            # current directory, which nobody asked for.
            continue
        root = os.path.normpath(root)
        if os.path.isfile(root):
            yield root
        else:
            for path, unused, files in os.walk(root):
                for f in files:
                    yield os.path.join(path, f)
                if not recurse:
                    # Only the top level of the directory is wanted.
                    break
738
+
739
def main(arglist=None):
    """Command-line entry point: parse the arguments and identify the files.

    Loads the signature file names from the versions file in the
    configuration directory, creates a Fido instance and runs it either on
    content read from stdin or on the files that were named (directly, or
    one per line through -input).  Unless -q is given, a version header is
    written to stderr first and a summary is printed at the end.

    @param arglist: list of command-line arguments; defaults to
        sys.argv[1:].  The help text is shown when it is empty.
    Exits with status 1 if the versions file cannot be loaded or the run is
    interrupted from the keyboard, and with status 0 after -v.
    """
    # The argparse package was introduced in 2.7
    t0 = time.clock()
    from argparselocal import ArgumentParser, RawTextHelpFormatter
    # Work on a copy so that the caller's list is never modified.
    arglist = list(sys.argv[1:] if arglist is None else arglist)
    if not arglist:
        arglist.append("-h")
    parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-v', default=False, action='store_true', help='show version information')
    parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
    parser.add_argument('-recurse', default=False, action='store_true', help='recurse into subdirectories')
    parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
    parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
    parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
    group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')
    parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
    parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
    parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
    parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
    parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
    parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['bufsize'])+' bytes)')
    parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['container_bufsize'])+' bytes)')

    parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
    parser.add_argument('-confdir', default=None, help='configuration directory to load_fido_xml, for example, the format specifications from.')

    # PROCESS ARGUMENTS
    args = parser.parse_args(arglist)

    # process confdir: load versions.xml and stick it in defaults
    if args.confdir:
        versionsFile = os.path.join(os.path.abspath(args.confdir), defaults['versions_file'])
    else:
        versionsFile = os.path.join(os.path.abspath(defaults['conf_dir']), defaults['versions_file'])
    try:
        versions = VET.parse(versionsFile)
    except Exception as e:
        sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
        # This is a failure, so do not report success to the caller.
        sys.exit(1)
    defaults['xml_pronomSignature'] = versions.find("pronomSignature").text
    defaults['containersignature_file'] = versions.find("pronomContainerSignature").text
    defaults['xml_fidoExtensionSignature'] = versions.find("fidoExtensionSignature").text
    defaults['format_files'] = []
    defaults['format_files'].append(defaults['xml_pronomSignature'])
    if args.pronom_only:
        versionHeader = "FIDO v{0} ({1}, {2})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'])
    else:
        versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'],defaults['xml_fidoExtensionSignature'])
        defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])

    if args.v:
        sys.stdout.write(versionHeader)
        sys.exit(0)
    # Let escapes such as \n typed on the command line become real characters.
    if args.matchprintf is not None:
        args.matchprintf = args.matchprintf.decode('string_escape')
    if args.nomatchprintf is not None:
        args.nomatchprintf = args.nomatchprintf.decode('string_escape')
    fido = Fido(quiet=args.q, bufsize=args.bufsize, container_bufsize=args.container_bufsize,
                printmatch=args.matchprintf, printnomatch=args.nomatchprintf, zip=args.zip, nocontainer = args.nocontainer, conf_dir=args.confdir)

    #TODO: Allow conf options to be dis-included
    if args.loadformats:
        for format_file in args.loadformats.split(','):
            fido.load_fido_xml(format_file)

    #TODO: remove from maps
    if args.useformats:
        args.useformats = args.useformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
    elif args.nouseformats:
        args.nouseformats = args.nouseformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]

    # Set up to use stdin, or open input files:
    input_file = None
    if args.input == '-':
        args.files = sys.stdin
    elif args.input:
        input_file = open(args.input, 'r')
        args.files = input_file

    # RUN
    try:
        if not args.q:
            sys.stderr.write(versionHeader)
            sys.stderr.flush()
        if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
            if fido.zip:
                # fido.identify_multi_object_stream(sys.stdin) is not wired
                # up yet; the statements that followed this raise could
                # never run and have been dropped.
                raise RuntimeError("Multiple content read from stdin not yet supported.")
            fido.identify_stream(sys.stdin, args.filename)
        else:
            for path in list_files(args.files, args.recurse):
                fido.identify_file(path)
    except KeyboardInterrupt:
        msg = "FIDO: Interrupt while identifying file {0}"
        sys.stderr.write(msg.format(fido.current_file))
        sys.exit(1)
    finally:
        # Close the -input list file; stdin is left alone.
        if input_file is not None:
            input_file.close()

    if not args.q:
        sys.stdout.flush()
        fido.print_summary(time.clock() - t0)
        sys.stderr.flush()
852
+
853
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()