libis-format 0.9.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +18 -0
  4. data/.travis.yml +41 -0
  5. data/Gemfile +5 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +39 -0
  8. data/Rakefile +8 -0
  9. data/bin/droid +15 -0
  10. data/bin/fido +12 -0
  11. data/bin/pdf_copy +13 -0
  12. data/data/ISOcoated_v2_eci.icc +0 -0
  13. data/data/PDFA_def.ps +40 -0
  14. data/data/ead.xsd +2728 -0
  15. data/data/eciRGB_v2.icc +0 -0
  16. data/data/lias_formats.xml +106 -0
  17. data/data/types.yml +217 -0
  18. data/lib/libis/format/config.rb +35 -0
  19. data/lib/libis/format/converter/base.rb +101 -0
  20. data/lib/libis/format/converter/chain.rb +167 -0
  21. data/lib/libis/format/converter/image_converter.rb +214 -0
  22. data/lib/libis/format/converter/office_converter.rb +50 -0
  23. data/lib/libis/format/converter/pdf_converter.rb +139 -0
  24. data/lib/libis/format/converter/repository.rb +98 -0
  25. data/lib/libis/format/converter.rb +11 -0
  26. data/lib/libis/format/droid.rb +45 -0
  27. data/lib/libis/format/fido.rb +102 -0
  28. data/lib/libis/format/identifier.rb +189 -0
  29. data/lib/libis/format/office_to_pdf.rb +52 -0
  30. data/lib/libis/format/pdf_copy.rb +40 -0
  31. data/lib/libis/format/pdf_merge.rb +41 -0
  32. data/lib/libis/format/pdf_split.rb +39 -0
  33. data/lib/libis/format/pdf_to_pdfa.rb +76 -0
  34. data/lib/libis/format/pdfa_validator.rb +61 -0
  35. data/lib/libis/format/type_database.rb +170 -0
  36. data/lib/libis/format/version.rb +5 -0
  37. data/lib/libis/format.rb +23 -0
  38. data/lib/libis-format.rb +1 -0
  39. data/libis-format.gemspec +34 -0
  40. data/spec/converter_spec.rb +212 -0
  41. data/spec/data/Cevennes2.bmp +0 -0
  42. data/spec/data/Cevennes2.jp2 +0 -0
  43. data/spec/data/Cevennes2.ppm +22492 -0
  44. data/spec/data/test-ead.xml +392 -0
  45. data/spec/data/test-jpg.tif +0 -0
  46. data/spec/data/test-lzw.tif +0 -0
  47. data/spec/data/test-options.jpg +0 -0
  48. data/spec/data/test.bmp +0 -0
  49. data/spec/data/test.doc +0 -0
  50. data/spec/data/test.docx +0 -0
  51. data/spec/data/test.gif +0 -0
  52. data/spec/data/test.jpg +0 -0
  53. data/spec/data/test.ods +0 -0
  54. data/spec/data/test.odt +0 -0
  55. data/spec/data/test.pdf +0 -0
  56. data/spec/data/test.pdf.tif +0 -0
  57. data/spec/data/test.png +0 -0
  58. data/spec/data/test.ps +8631 -0
  59. data/spec/data/test.psd +0 -0
  60. data/spec/data/test.rtf +1455 -0
  61. data/spec/data/test.tif +0 -0
  62. data/spec/data/test.txt +12 -0
  63. data/spec/data/test.xcf +0 -0
  64. data/spec/data/test.xls +0 -0
  65. data/spec/data/test.xlsx +0 -0
  66. data/spec/data/test.xml +4 -0
  67. data/spec/data/test_pdfa.pdf +0 -0
  68. data/spec/identifier_spec.rb +60 -0
  69. data/spec/spec_helper.rb +9 -0
  70. data/spec/test_types.yml +12 -0
  71. data/spec/type_database_spec.rb +140 -0
  72. data/tools/PdfTool.jar +0 -0
  73. data/tools/bcpkix-jdk15on-1.49.jar +0 -0
  74. data/tools/bcprov-jdk15on-1.49.jar +0 -0
  75. data/tools/droid/DROID_SignatureFile_V82.xml +32681 -0
  76. data/tools/droid/container-signature-20150307.xml +2235 -0
  77. data/tools/droid/droid-command-line-6.1.5.jar +0 -0
  78. data/tools/droid/droid.bat +154 -0
  79. data/tools/droid/droid.sh +138 -0
  80. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  81. data/tools/droid/lib/activation-1.1.jar +0 -0
  82. data/tools/droid/lib/antlr-2.7.7.jar +0 -0
  83. data/tools/droid/lib/antlr-3.2.jar +0 -0
  84. data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
  85. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  86. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  87. data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
  88. data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
  89. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  90. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  91. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  92. data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
  93. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  94. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  95. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  96. data/tools/droid/lib/commons-codec-1.4.jar +0 -0
  97. data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
  98. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  99. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  100. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  101. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  102. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  103. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  104. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  105. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  106. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  107. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  108. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  109. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  110. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  111. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  112. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  113. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  114. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  115. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  116. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  117. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  118. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  119. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  120. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  121. data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
  122. data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
  123. data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
  124. data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
  125. data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
  126. data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
  127. data/tools/droid/lib/droid-help-6.1.5.jar +0 -0
  128. data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
  129. data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
  130. data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
  131. data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
  132. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  133. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  134. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  135. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  136. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  137. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  138. data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
  139. data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
  140. data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
  141. data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
  142. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  143. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  144. data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
  145. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  146. data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
  147. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  148. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  149. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  150. data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
  151. data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
  152. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  153. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  154. data/tools/droid/lib/jta-1.1.jar +0 -0
  155. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  156. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  157. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  158. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  159. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  160. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  161. data/tools/droid/lib/poi-3.7.jar +0 -0
  162. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  163. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  164. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  165. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  166. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  167. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  168. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  169. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  170. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  171. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  172. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  173. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  174. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  175. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  176. data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
  177. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  178. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  179. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  180. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  181. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  182. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  183. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  184. data/tools/droid/lib/xz-1.0.jar +0 -0
  185. data/tools/fido/__init__.py +0 -0
  186. data/tools/fido/argparselocal.py +2355 -0
  187. data/tools/fido/conf/DROID_SignatureFile-v81.xml +2 -0
  188. data/tools/fido/conf/container-signature-20150307.xml +2238 -0
  189. data/tools/fido/conf/dc.xsd +119 -0
  190. data/tools/fido/conf/dcmitype.xsd +53 -0
  191. data/tools/fido/conf/dcterms.xsd +383 -0
  192. data/tools/fido/conf/fido-formats.xsd +173 -0
  193. data/tools/fido/conf/format_extension_template.xml +105 -0
  194. data/tools/fido/conf/format_extensions.xml +498 -0
  195. data/tools/fido/conf/formats-v81.xml +38355 -0
  196. data/tools/fido/conf/pronom-xml-v81.zip +0 -0
  197. data/tools/fido/conf/versions.xml +8 -0
  198. data/tools/fido/fido.bat +4 -0
  199. data/tools/fido/fido.py +854 -0
  200. data/tools/fido/fido.sh +5 -0
  201. data/tools/fido/prepare.py +616 -0
  202. data/tools/fido/pronomutils.py +115 -0
  203. data/tools/fido/toxml.py +52 -0
  204. data/tools/fido/update_signatures.py +171 -0
  205. data/tools/pdfbox/pdfbox-app-1.8.10.jar +0 -0
  206. data/tools/pdfbox/preflight-app-1.8.10.jar +0 -0
  207. metadata +396 -0
@@ -0,0 +1,854 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import sys, re, os, time, math
4
+ import hashlib, urllib, urlparse, csv, getopt
5
+ from xml.etree import cElementTree as ET
6
+ from xml.etree import ElementTree as CET
7
+ from xml.etree import ElementTree as VET # versions.xml
8
+
9
# Version string of this fido release.
version = '1.3.1'

# Module-wide default settings; Fido.__init__ falls back to these values
# for every constructor argument that is left as None.
defaults = {
    'bufsize': 128 * 1024,  # (bytes)
    # Assigned to re._MAXCACHE by Fido.__init__; this is a number of cached
    # compiled patterns, not a size in bytes.
    'regexcachesize': 2084,
    'conf_dir': os.path.join(os.path.dirname(__file__), 'conf'),
    'printmatch': "OK,%(info.time)s,%(info.puid)s,\"%(info.formatname)s\",\"%(info.signaturename)s\",%(info.filesize)s,\"%(info.filename)s\",\"%(info.mimetype)s\",\"%(info.matchtype)s\"\n",
    'printnomatch': "KO,%(info.time)s,,,,%(info.filesize)s,\"%(info.filename)s\",,\"%(info.matchtype)s\"\n",
    'format_files': ['formats-v75.xml', 'format_extensions.xml'],
    'containersignature_file': 'container-signature-20150307.xml',
    # versions.xml is where fido.py reads version information
    # about which xml to load
    'versions_file': 'versions.xml',
    'container_bufsize': 512 * 1024,  # (bytes)
    # Fido.__init__ looks this key up when it is called with
    # nocontainer=None; without it that call raised KeyError.
    'nocontainer': False,
    'description': """
Format Identification for Digital Objects (fido).
FIDO is a command-line tool to identify the file formats of digital objects.
It is designed for simple integration into automated work-flows.
""",
    'epilog': """
Open Planets Foundation (http://www.openplanetsfoundation.org)
See License.txt for license information.
Download from: https://github.com/openplanets/fido/releases
Usage guide: http://wiki.opf-labs.org/display/KB/FIDO+usage+guide
Author: Adam Farquhar (BL), 2010
Maintainer: Maurice de Rooij (OPF/NANETH), 2011, 2012, 2013
FIDO uses the UK National Archives (TNA) PRONOM File Format
and Container descriptions.
PRONOM is available from http://www.nationalarchives.gov.uk/pronom/"""
}
37
+
38
+ class Fido:
39
def __init__(self, quiet=False, bufsize=None, container_bufsize=None, printnomatch=None, printmatch=None, zip=False, nocontainer=False, handle_matches=None, conf_dir=None, format_files=None, containersignature_file=None):
    """Configure the identifier and load all signature files.

    Every argument left as None falls back to the matching entry in the
    module-level ``defaults`` dictionary.

    @param quiet suppresses the summary written by print_summary.
    @param bufsize is the size of the begin/end-of-file buffers.
    @param container_bufsize is stored on the instance as given/defaulted.
    @param printnomatch, printmatch are the output templates used by
           print_matches.
    @param zip enables looking inside containers in identify_file.
    @param nocontainer is stored on the instance as given/defaulted.
    @param handle_matches is the callback receiving match results;
           defaults to self.print_matches.
    @param conf_dir is the directory holding the signature files.
    @param format_files is the list of format XML files to load, in order.
    @param containersignature_file is currently IGNORED: the value from
           ``defaults`` is always used (the override was disabled in the
           original code and is kept disabled here).
    """
    def or_default(value, key):
        # None means "not supplied"; any other value (including False/0) wins.
        return defaults[key] if value is None else value

    self.quiet = quiet
    self.bufsize = or_default(bufsize, 'bufsize')
    self.container_bufsize = or_default(container_bufsize, 'container_bufsize')
    self.printmatch = or_default(printmatch, 'printmatch')
    self.printnomatch = or_default(printnomatch, 'printnomatch')
    self.handle_matches = self.print_matches if handle_matches is None else handle_matches
    self.zip = zip
    # 'nocontainer' may be absent from defaults; do not raise KeyError
    # when a caller explicitly passes nocontainer=None.
    self.nocontainer = defaults.get('nocontainer', False) if nocontainer is None else nocontainer
    self.conf_dir = or_default(conf_dir, 'conf_dir')
    self.format_files = or_default(format_files, 'format_files')
    # NOTE: the containersignature_file argument is deliberately not honoured.
    self.containersignature_file = defaults['containersignature_file']
    self.formats = []
    self.puid_format_map = {}
    self.puid_has_priority_over_map = {}
    # load signatures
    conf_path = os.path.abspath(self.conf_dir)
    for xml_file in self.format_files:
        self.load_fido_xml(os.path.join(conf_path, xml_file))
    self.load_container_signature(os.path.join(conf_path, self.containersignature_file))
    self.current_file = ''
    self.current_filesize = 0
    self.current_format = None
    self.current_sig = None
    self.current_pat = None
    self.current_count = 0  # Count of calls to match_formats
    # Process-wide side effect: enlarges the regex cache of the re module.
    re._MAXCACHE = defaults['regexcachesize']
    self.externalsig = ET.XML('<signature><name>External</name></signature>')
70
+
71
# Characters that escape() copies into the regular expression unchanged.
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
#_special = '$*+.?![]^\\{|}'
# Characters that _escape_char() emits with a leading backslash.
_special = '$()*+.?![]^\\{|}'
# Digit table indexed by _escape_char() to build \xNN escapes.
_hex = '0123456789abcdef'
75
+ def _escape_char(self,c):
76
+ if c in '\n':
77
+ return '\\n'
78
+ elif c == '\r':
79
+ return '\\r'
80
+ elif c in self._special:
81
+ return '\\' + c
82
+ else:
83
+ (high, low) = divmod(ord(c), 16)
84
+ return '\\x' + self._hex[high] + self._hex[low]
85
+
86
+ def escape(self,string):
87
+ "Escape characters in pattern that are non-printable, non-ascii, or special for regexes."
88
+ escaped = ''.join(c if c in self._ordinary else self._escape_char(c) for c in string)
89
+ return escaped
90
+
91
+ def convert_container_sequence(self,sig):
92
+ """Parse the PRONOM container sequences
93
+ and convert to regular expressions
94
+ """
95
+ seq = '(?s)'
96
+ inq = False
97
+ byt = False
98
+ rng = False
99
+ ror = False
100
+ for i in range(len(sig)):
101
+ if not inq and not rng:
102
+ if sig[i] == "'":
103
+ inq = True
104
+ continue
105
+ if sig[i] == " ":
106
+ continue
107
+ if sig[i] == "[":
108
+ seq += "("
109
+ rng = True
110
+ continue
111
+ if not byt:
112
+ seq += "\\x" + sig[i].lower()
113
+ byt = True
114
+ continue
115
+ if byt:
116
+ seq += sig[i].lower()
117
+ byt = False
118
+ continue
119
+ if inq:
120
+ if sig[i] == "'" and not rng:
121
+ inq = False
122
+ continue
123
+ seq += self.escape(sig[i])
124
+ continue
125
+ if rng:
126
+ if sig[i] == "]":
127
+ seq += ")"
128
+ rng = False
129
+ continue
130
+ if sig[i] != "-" and sig[i] != "'" and ror:
131
+ seq += self.escape(sig[i])
132
+ continue
133
+ if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and not byt:
134
+ seq += "\\x" + sig[i].lower()
135
+ byt = True
136
+ continue
137
+ if sig[i] != "-" and sig[i] != "'" and sig[i] != " " and not ror and byt:
138
+ seq += sig[i].lower()
139
+ byt = False
140
+ continue
141
+ if sig[i] == "-" or sig[i] == " ":
142
+ seq += "|"
143
+ continue
144
+ if sig[i] == "'" and not ror:
145
+ ror = True
146
+ continue
147
+ if sig[i] == "'" and ror:
148
+ ror = False
149
+ continue
150
+ #print seq
151
+ return seq
152
+
153
+ def load_container_signature(self, containersignature_file):
154
+ """Load the PRONOM container-signature file
155
+ and convert sequences to regular expressions
156
+ """
157
+ tree = CET.parse(containersignature_file)
158
+ # load and have container signatures converted
159
+ self.sequenceSignature = {}
160
+ for signature in tree.getroot().findall('ContainerSignatures/ContainerSignature'):
161
+ signatureId = signature.get('Id')
162
+ signatureSequence = signature.findall('Files/File/BinarySignatures/InternalSignatureCollection/InternalSignature/ByteSequence/SubSequence')
163
+ self.sequenceSignature[signatureId] = []
164
+ for sequence in signatureSequence:
165
+ self.sequenceSignature[signatureId].append(self.convert_container_sequence(sequence[0].text))
166
+ # find PUIDs which trigger container matching
167
+ self.puidTriggers = {}
168
+ triggers = tree.find('TriggerPuids')
169
+ for puid in triggers.findall('TriggerPuid'):
170
+ self.puidTriggers[puid.get('Puid')] = True
171
+ # map PUID to container signatureId
172
+ self.puidMapping = {}
173
+ mappings = tree.find('FileFormatMappings')
174
+ for mapping in mappings.findall('FileFormatMapping'):
175
+ if mapping.get('signatureId') not in self.puidMapping:
176
+ self.puidMapping[mapping.get('signatureId')] = []
177
+ self.puidMapping[mapping.get('signatureId')].append(mapping.get('Puid'))
178
+ # print "sequences:\n",self.sequenceSignature
179
+ # print "trigger:\n",self.puidTriggers
180
+ # print "mapping:\n",self.puidMapping
181
+ # exit()
182
+
183
+ def load_fido_xml(self, file):
184
+ """Load the fido format information from @param file.
185
+ As a side-effect, set self.formats
186
+ @return list of ElementTree.Element, one for each format.
187
+ """
188
+ tree = ET.parse(file)
189
+ #print "Loaded format specs in {0:>6.2f}ms".format((t1 - t0) * 1000)
190
+ #TODO: Handle empty regexes properly; perhaps remove from the format list
191
+ for element in tree.getroot().findall('./format'):
192
+ puid = self.get_puid(element)
193
+ # Handle over-writes in multiple file loads
194
+ existing = self.puid_format_map.get(puid, False)
195
+ if existing:
196
+ # Already have one, so replace old with new!
197
+ self.formats[self.formats.index(existing)] = element
198
+ else:
199
+ self.formats.append(element)
200
+ self.puid_format_map[puid] = element
201
+ # Build some structures to speed things up
202
+ self.puid_has_priority_over_map[puid] = frozenset([puid_element.text for puid_element in element.findall('has_priority_over')])
203
+ return self.formats
204
+
205
+ # To delete a format: (1) remove from self.formats, (2) remove from puid_format_map, (3) remove from self.puid_has_priority_over_map
206
+ def get_signatures(self, format):
207
+ return format.findall('signature')
208
+
209
+ def has_priority_over(self, format, possibly_inferior):
210
+ return self.get_puid(possibly_inferior)in self.puid_has_priority_over_map[self.get_puid(format)]
211
+
212
+ def get_puid(self, format):
213
+ return format.find('puid').text
214
+
215
+ def get_patterns(self, signature):
216
+ return signature.findall('pattern')
217
+
218
+ def get_pos(self, pat):
219
+ return pat.find('position').text
220
+
221
+ def get_regex(self, pat):
222
+ return pat.find('regex').text
223
+
224
+ def get_extension(self, format):
225
+ return format.find('extension').text
226
+
227
+ def print_matches(self, fullname, matches, delta_t, matchtype=''):
228
+ """The default match handler. Prints out information for each match in the list.
229
+ @param fullname is name of the file being matched
230
+ @param matches is a list of (format, signature)
231
+ @param delta_t is the time taken for the match.
232
+ @param matchtype is the type of match (signature, containersignature, extension, fail)
233
+ """
234
+ class Info:
235
+ pass
236
+ obj = Info()
237
+ obj.count = self.current_count
238
+ obj.group_size = len(matches)
239
+ obj.filename = fullname
240
+ obj.time = int(delta_t * 1000)
241
+ obj.filesize = self.current_filesize
242
+ obj.matchtype = matchtype
243
+ if len(matches) == 0:
244
+ sys.stdout.write(self.printnomatch % { "info.time" : obj.time, "info.filesize" : obj.filesize, "info.filename" : obj.filename, "info.count"
245
+ : obj.count, "info.matchtype" : "fail" } )
246
+ else:
247
+ i = 0
248
+ for (f, s) in matches:
249
+ i += 1
250
+ obj.group_index = i
251
+ obj.puid = self.get_puid(f)
252
+ obj.formatname = f.find('name').text
253
+ obj.signaturename = s.find('name').text
254
+ mime = f.find('mime')
255
+ obj.mimetype = mime.text if mime != None else None
256
+ version = f.find('version')
257
+ obj.version = version.text if version != None else None
258
+ alias = f.find('alias')
259
+ obj.alias = alias.text if alias != None else None
260
+ apple_uti = f.find('apple_uid')
261
+ obj.apple_uti = apple_uti.text if apple_uti != None else None
262
+ sys.stdout.write(self.printmatch % { "info.time" : obj.time, "info.puid" : obj.puid, "info.formatname" : obj.formatname, "info.signaturename" : obj.signaturename, "info.filesize" : obj.filesize, "info.filename" : obj.filename, "info.mimetype" : obj.mimetype, "info.matchtype" : obj.matchtype, "info.version" : obj.version, "info.alias" : obj.alias, "info.apple_uti" : obj.apple_uti, "info.group_size" : obj.group_size, "info.group_index" : obj.group_index, "info.count" : obj.count })
263
+
264
+ def print_summary(self, secs):
265
+ """Print summary information on the number of matches and time taken.
266
+ """
267
+ count = self.current_count
268
+ if not self.quiet:
269
+ rate = (int(round(count / secs)) if secs != 0 else 9999)
270
+ #print >> sys.stderr, 'FIDO: Processed %6d files in %6.2f msec, %2d files/sec' % (count, secs * 1000, rate)
271
+ sys.stderr.write('FIDO: Processed %6d files in %6.2f msec, %2d files/sec\n' % (count, secs * 1000, rate))
272
+
273
def identify_file(self, filename):
    """Identify the type of @param filename.
    Call self.handle_matches instead of returning a value.

    Signature matching is tried first; when it yields nothing, or the
    file is empty, the file extension is matched instead. When self.zip
    is set, the contents of a recognised container are identified too.
    An IOError is reported on stderr rather than raised.
    """
    self.current_file = filename
    self.matchtype = "signature"
    try:
        t0 = time.clock()
        # The handle is only needed to fill the two buffers; close it
        # as soon as they are read so it cannot leak.
        with open(filename, 'rb') as f:
            size = os.stat(filename)[6]
            self.current_filesize = size
            if self.current_filesize == 0:
                sys.stderr.write("FIDO: Zero byte file (empty): Path is: {0}\n".format(filename))
            bofbuffer, eofbuffer = self.get_buffers(f, size, seekable=True)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # from here is also repeated in walk_zip
        # we should make this uniform in a next version!
        #
        # filesize is made conditional because files with 0 bytes
        # are falsely characterised being 'rtf' (due to wacky sig)
        # in these cases we try to match the extension instead
        if len(matches) > 0 and self.current_filesize > 0:
            self.handle_matches(filename, matches, time.clock() - t0, self.matchtype)
        elif len(matches) == 0 or self.current_filesize == 0:
            matches = self.match_extensions(filename)
            self.handle_matches(filename, matches, time.clock() - t0, "extension")
        # till here matey!
        if self.zip:
            self.identify_contents(filename, type=self.container_type(matches))
    except IOError:
        sys.stderr.write("FIDO: Error in identify_file: Path is {0}\n".format(filename))
305
+
306
+ def identify_contents(self, filename, fileobj=None, type=False):
307
+ """Identify each item in a container (such as a zip or tar file). Call self.handle_matches on each item.
308
+ @param fileobj could be a file, or a stream.
309
+ """
310
+ if type == False:
311
+ return
312
+ elif type == 'zip':
313
+ self.walk_zip(filename, fileobj)
314
+ elif type == 'tar':
315
+ self.walk_tar(filename, fileobj)
316
+ else: # TODO: ouch!
317
+ raise RuntimeError("Unknown container type: " + repr(type))
318
+
319
def identify_multi_object_stream(self, stream):
    """Does not work!
    Stream may contain one or more objects each with an HTTP style header that must include content-length.
    The headers consist of keyword:value pairs terminated by a newline. There must be a newline following the headers.

    @param stream is iterated line by line for the headers and then
           handed to self.get_buffers for the object body.
    Raises EnvironmentError when the blank line ending a header block is
    reached before any content-length header was seen.
    """
    # Running total of header bytes consumed; only used to label the object.
    # NOTE(review): body bytes are never added to offset - confirm intent.
    offset = 0
    while True:
        t0 = time.clock()
        content_length = -1
        # Read header lines up to the blank line that terminates them.
        for line in stream:
            offset += len(line)
            if line == '\n':
                if content_length < 0:
                    raise EnvironmentError("No content-length provided.")
                else:
                    break
            # Header names are compared case-insensitively.
            pair = line.lower().split(':', 2)
            if pair[0] == 'content-length':
                content_length = int(pair[1])
        # The stream ran out without yielding a content-length: finished.
        if content_length == -1:
            return
        # Consume exactly content-length bytes
        self.current_file = 'STDIN!(at ' + str(offset) + ' bytes)'
        self.current_filesize = content_length
        bofbuffer, eofbuffer = self.get_buffers(stream, content_length)
        matches = self.match_formats(bofbuffer, eofbuffer)
        # MdR: this needs attention
        if len(matches) > 0:
            self.handle_matches(self.current_file, matches, time.clock() - t0, "signature")
        elif len(matches) == 0 or self.current_filesize == 0:
            # No signature matched: fall back to matching on the name.
            matches = self.match_extensions(self.current_file)
            self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
351
+
352
def identify_stream(self, stream, filename):
    """Identify the type of @param stream.
    Call self.handle_matches instead of returning a value.
    Does not close stream.

    @param stream is read to its end through self.get_buffers.
    @param filename is an optional name (may be None) used for the
           extension fallback when the name cannot be obtained from
           /proc/self/fd/0.
    Results are always reported under the name 'STDIN', except on
    Windows where a supplied filename is reported for extension matches.
    """
    t0 = time.clock()
    bofbuffer, eofbuffer, bytes_read = self.get_buffers(stream, length=None)
    self.current_filesize = bytes_read
    self.current_file = 'STDIN'
    matches = self.match_formats(bofbuffer, eofbuffer)
    # MdR: this needs attention
    if len(matches) > 0:
        self.handle_matches(self.current_file, matches, time.clock() - t0, "signature")
    elif len(matches) == 0 or self.current_filesize == 0:
        # we can only determine the filename from the STDIN stream
        # on Linux, on Windows there is not a (simple) way to do that
        if os.name != "nt":
            try:
                self.current_file = os.readlink("/proc/self/fd/0")
            except (OSError, AttributeError):
                # No /proc entry (or no readlink on this platform): fall
                # back to the supplied name. Only these errors are
                # swallowed so that e.g. KeyboardInterrupt still propagates.
                if filename is not None:
                    self.current_file = filename
                else:
                    self.current_file = 'STDIN'
        else:
            if filename is not None:
                self.current_file = filename
        matches = self.match_extensions(self.current_file)
        # we have to reset self.current_file if not on Windows
        if os.name != "nt":
            self.current_file = 'STDIN'
        self.handle_matches(self.current_file, matches, time.clock() - t0, "extension")
384
+
385
+ def container_type(self, matches):
386
+ """Determine if one of the @param matches is the format of a container that we can look inside of (e.g., zip, tar).
387
+ @return False, zip, or tar.
388
+ """
389
+ for (format, unused) in matches:
390
+ container = format.find('container')
391
+ if container != None:
392
+ return container.text
393
+ return False
394
+
395
+ def blocking_read(self, file, bytes_to_read):
396
+ bytes_read = 0
397
+ buffer = ''
398
+ while bytes_read < bytes_to_read:
399
+ readbuffer = file.read(bytes_to_read - bytes_read)
400
+ buffer += readbuffer
401
+ bytes_read = len(buffer)
402
+ # break out if EOF is reached.
403
+ if readbuffer == '':
404
+ break
405
+ return buffer
406
+
407
+ def get_buffers(self, stream, length=None, seekable=False):
408
+ """Return buffers from the beginning and end of stream and the number of bytes read
409
+ if there may be more bytes in the stream.
410
+
411
+ If length is None, return the length as found.
412
+ If seekable is False, the steam does not support a seek operation.
413
+ """
414
+ bytes_to_read = self.bufsize if length == None else min(length, self.bufsize)
415
+ bofbuffer = self.blocking_read(stream, bytes_to_read)
416
+ bytes_read = len(bofbuffer)
417
+ if length == None:
418
+ # A stream with unknown length; have to keep two buffers around
419
+ prevbuffer = bofbuffer
420
+ while True:
421
+ buffer = self.blocking_read(stream, self.bufsize)
422
+ bytes_read += len(buffer)
423
+ if len(buffer) == self.bufsize:
424
+ prevbuffer = buffer
425
+ else:
426
+ eofbuffer = prevbuffer if len(buffer) == 0 else prevbuffer[-(self.bufsize - len(buffer)):] + buffer
427
+ break
428
+ return bofbuffer, eofbuffer, bytes_read
429
+ else:
430
+ bytes_unread = length - len(bofbuffer)
431
+ if bytes_unread == 0:
432
+ eofbuffer = bofbuffer
433
+ elif bytes_unread < self.bufsize:
434
+ # The buffs overlap
435
+ eofbuffer = bofbuffer[bytes_unread:] + self.blocking_read(stream, bytes_unread)
436
+ elif bytes_unread == self.bufsize:
437
+ eofbuffer = self.blocking_read(stream, self.bufsize)
438
+ elif seekable: # easy case when we can just seek!
439
+ stream.seek(length - self.bufsize)
440
+ eofbuffer = self.blocking_read(stream, self.bufsize)
441
+ else:
442
+ # We have more to read and know how much.
443
+ # n*bufsize + r = length
444
+ (n, r) = divmod(bytes_unread, self.bufsize)
445
+ # skip n-1*bufsize bytes
446
+ for unused_i in xrange(1, n):
447
+ self.blocking_read(stream, self.bufsize)
448
+ # skip r bytes
449
+ self.blocking_read(stream, r)
450
+ # and read the remaining bufsize bytes into the eofbuffer
451
+ eofbuffer = self.blocking_read(stream, self.bufsize)
452
+ return bofbuffer, eofbuffer
453
+
454
def walk_zip(self, filename, fileobj=None):
    """Identify the type of each item in the zip
    @param fileobj. If fileobj is not provided, open
    @param filename.
    Call self.handle_matches instead of returning a value.

    Items with a recorded size of 0 are skipped. Items that are
    themselves containers are copied to a spooled temporary file and
    identified recursively. IOError and zipfile.BadZipfile are reported
    on stderr rather than raised.
    """
    import zipfile
    import tempfile
    zipstream = None
    try:
        zipstream = zipfile.ZipFile((fileobj if fileobj is not None else filename), 'r')
        for item in zipstream.infolist():
            if item.file_size == 0:
                continue  # TODO: Find a better test for isdir
            t0 = time.clock()
            f = None
            try:
                f = zipstream.open(item)
                item_name = filename + '!' + item.filename
                self.current_file = item_name
                self.current_filesize = item.file_size
                # (Zero-size items never get here, so no empty-file
                # warning is needed at this point.)
                bofbuffer, eofbuffer = self.get_buffers(f, item.file_size)
            finally:
                if f is not None:
                    f.close()
            matches = self.match_formats(bofbuffer, eofbuffer)
            if len(matches) > 0 and self.current_filesize > 0:
                self.handle_matches(item_name, matches, time.clock() - t0, "signature")
            elif len(matches) == 0 or self.current_filesize == 0:
                matches = self.match_extensions(item_name)
                self.handle_matches(item_name, matches, time.clock() - t0, "extension")
            container = self.container_type(matches)
            if container:
                target = tempfile.SpooledTemporaryFile(prefix='Fido')
                # Pre-bind source so the finally clause cannot raise
                # NameError (masking the real error) if open() fails.
                source = None
                try:
                    source = zipstream.open(item)
                    self.copy_stream(source, target)
                    self.identify_contents(item_name, target, container)
                finally:
                    if source is not None:
                        source.close()
                    # Release the temporary copy once the nested
                    # container has been walked.
                    target.close()
    except IOError:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
    except zipfile.BadZipfile:
        sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
    finally:
        if zipstream is not None:
            zipstream.close()
504
+
505
def walk_tar(self, filename, fileobj=None):
    """Identify the type of each regular file in a tar archive.

    Results are reported through self.handle_matches instead of being
    returned.  A member that is itself a container is rewound and handed
    to self.identify_contents.

    @param filename  path of the archive; also used as the prefix of the
                     reported item names ("<filename>!<member>").
    @param fileobj   optional file object holding the archive.  If
                     fileobj is not provided, filename is opened.
    tarfile.TarError is reported on stderr, not raised.
    """
    import tarfile
    tarstream = None
    try:
        tarstream = tarfile.TarFile(filename, fileobj=fileobj, mode='r')
        for item in tarstream.getmembers():
            if not item.isfile():
                continue
            t0 = time.clock()
            f = tarstream.extractfile(item)
            try:
                tar_item_name = filename + '!' + item.name
                self.current_file = tar_item_name
                self.current_filesize = item.size
                bofbuffer, eofbuffer = self.get_buffers(f, item.size)
                matches = self.match_formats(bofbuffer, eofbuffer)
                self.handle_matches(tar_item_name, matches, time.clock() - t0)
                if self.container_type(matches):
                    f.seek(0)
                    self.identify_contents(tar_item_name, f, self.container_type(matches))
            finally:
                # Close the member even when identification fails.
                f.close()
    except tarfile.TarError:
        sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))
    finally:
        if tarstream is not None:
            tarstream.close()
533
+
534
def as_good_as_any(self, f1, match_list):
    """Return True if the proposed format f1 is as good as any in match_list.

    match_list holds (format, signature) pairs.  The answer is False as
    soon as the PUID of f1 is listed in self.puid_has_priority_over_map
    under the PUID of some other format of the list, i.e. that format
    has priority over f1.  An empty match_list always gives True."""
    if match_list == []:
        return True
    proposed_puid = self.get_puid(f1)
    for rival, _signature in match_list:
        if not (f1 == rival):
            outranked_by_rival = self.puid_has_priority_over_map[self.get_puid(rival)]
            if proposed_puid in outranked_by_rival:
                return False
    return True
545
+
546
def buffered_read(self, file_pos, overlap):
    """Read one chunk of self.current_file starting at file_pos.

    @param file_pos  byte offset at which the chunk starts.
    @param overlap   when true, the chunk is enlarged by
                     self.overlap_range bytes on top of
                     self.container_bufsize.
    @return the bytes read; fewer than a full chunk near the end of the
            file (self.current_filesize).
    """
    if not overlap:
        bufsize = self.container_bufsize
    else:
        bufsize = self.container_bufsize + self.overlap_range
    remaining = self.current_filesize - file_pos
    # Read the chunk size computed above.  The previous code fell back to
    # self.bufsize here, which left part of every chunk unread.
    file_read = remaining if remaining < bufsize else bufsize
    # The context manager closes the handle; it used to be leaked.
    with open(self.current_file, 'rb') as file_handle:
        file_handle.seek(file_pos)
        return file_handle.read(file_read)
563
+
564
def read_container(self, parent_buffer, parent_result):
    """Match the PRONOM container sequences against the current file.

    Header of compound containers can be further away than the default
    buffer, especially with big files containing binary objects, so the
    file is read in chunks of self.container_bufsize bytes.  Each chunk
    after the first starts self.overlap_range bytes early, to prevent
    cutting off a pattern in the middle.  This is somewhat slower than
    reading the complete file at once, but it avoids loading a very big
    file into memory.

    When self.nocontainer is set, when the file is not larger than
    self.bufsize, or when the input is "STDIN", only parent_buffer is
    inspected.

    NOTE (MdR): this piece of code is still a bit quirky as it does not
    yet take byte positions into account which are available in the
    DROID container signature file.
    TODO: find better way to handle zip contents.

    @param parent_buffer  buffer already read by the caller.
    @param parent_result  matches found so far, used for priority checks.
    @return a list of (format, signature) tuples.  The same tuple can
            appear more than once when several chunks match.
    """
    container_result = []
    self.overlap_range = 512  # bytes; also read by self.buffered_read
    nobuffer = bool(self.nocontainer
                    or self.current_filesize <= self.bufsize
                    or self.current_file == "STDIN")
    if nobuffer:
        passes = 1
    else:
        passes = self.current_filesize // self.container_bufsize + 1
    for i in range(passes):
        if nobuffer:
            container_buffer = parent_buffer
        elif i == 0:
            container_buffer = self.buffered_read(0, False)
        else:
            pos = (self.container_bufsize * i) - self.overlap_range
            container_buffer = self.buffered_read(pos, True)
        for (container_id, container_regexes) in self.sequenceSignature.items():
            if len(container_regexes) == 0:
                continue
            # A container entry can have more than one regex; it is a
            # positive hit only if every one of them is found.
            if not all(re.search(container_regex, container_buffer)
                       for container_regex in container_regexes):
                continue
            self.matchtype = "container"
            for container_puid in self.puidMapping[container_id]:
                for container_format in self.formats:
                    if container_format.find('puid').text != container_puid:
                        continue
                    if not self.as_good_as_any(container_format, parent_result):
                        continue
                    for container_sig in self.get_signatures(container_format):
                        container_result.append((container_format, container_sig))
            # The first matching container entry ends the scan of this chunk.
            break
    return container_result
641
+
642
def match_formats(self, bofbuffer, eofbuffer):
    """Apply the patterns for formats to the supplied buffers.

    For every format that is still as good as any match found so far,
    the first signature whose patterns all succeed is recorded.  'BOF'
    patterns must match at the start of bofbuffer, 'EOF' patterns are
    searched in eofbuffer, 'VAR' and 'IFB' patterns are searched in
    bofbuffer.  A pattern with any other position is not checked.  When
    the matched format is listed in self.puidTriggers and the current
    file does not have the extension "zip" (x-fmt/263), the container
    signatures are applied as well.

    An exception raised while handling one format is written to stderr
    and that format is skipped.

    @param bofbuffer  buffer holding the beginning of the file.
    @param eofbuffer  buffer holding the end of the file.
    @return a match list of (format, signature) tuples, in the order in
            which they were found.  Inferior matches and duplicates are
            removed.
    """
    self.current_count += 1
    result = []
    for fmt in self.formats:
        try:
            self.current_format = fmt
            if not self.as_good_as_any(fmt, result):
                continue
            for sig in self.get_signatures(fmt):
                self.current_sig = sig
                success = True
                for pat in self.get_patterns(sig):
                    self.current_pat = pat
                    pos = self.get_pos(pat)
                    regex = self.get_regex(pat)
                    if pos == 'BOF':
                        found = re.match(regex, bofbuffer)
                    elif pos == 'EOF':
                        found = re.search(regex, eofbuffer)
                    elif pos in ('VAR', 'IFB'):
                        found = re.search(regex, bofbuffer)
                    else:
                        continue
                    if not found:
                        success = False
                        break
                if not success:
                    continue
                result.append((fmt, sig))
                # check if file needs to be parsed with container signature
                # we skip files with extension "zip" (x-fmt/263)
                ext = os.path.splitext(self.current_file)[1].lower().lstrip(".")
                if fmt.find('puid').text in self.puidTriggers and ext != "zip":
                    result.extend(self.read_container(bofbuffer, result))
                # One matching signature per format is enough.
                break
        except Exception as e:
            # TODO: MdR: needs some <3
            sys.stderr.write(str(e)+"\n")
            continue
    result = [match for match in result if self.as_good_as_any(match[0], result)]
    # Remove duplicate results (self.read_container() can report the same
    # match more than once) while keeping the order of discovery; a plain
    # set() would return the matches in arbitrary order.
    seen = set()
    unique = []
    for match in result:
        if match not in seen:
            seen.add(match)
            unique.append(match)
    return unique
703
+
704
def match_extensions(self, filename):
    """Return the list of (format, self.externalsig) for every format
    whose extension matches the filename.

    The comparison uses the lower-cased extension of filename without
    its leading dot; a filename without extension matches nothing.
    Inferior matches are removed from the list."""
    wanted = os.path.splitext(filename)[1].lower().lstrip(".")
    candidates = []
    if wanted:
        for element in self.formats:
            declared = element.findall('extension')
            if declared is not None and any(wanted == node.text for node in declared):
                candidates.append((element, self.externalsig))
    return [hit for hit in candidates if self.as_good_as_any(hit[0], candidates)]
717
+
718
def copy_stream(self, source, target):
    """Copy source to target, reading self.bufsize bytes at a time
    until a read returns nothing."""
    chunk = source.read(self.bufsize)
    while len(chunk) != 0:
        target.write(chunk)
        chunk = source.read(self.bufsize)
724
+
725
def list_files(roots, recurse=False):
    """Return the files one at a time.  Roots could be a fileobj or a list.

    Each root is stripped of its trailing line terminator and normalised.
    A root that is a file is yielded as is; any other root is walked as a
    directory, yielding its files and, when recurse is true, the files of
    all its subdirectories.  Empty roots (for example blank lines of an
    input list) are skipped: they used to raise IndexError or, once
    normalised to ".", to scan the current directory.
    """
    for root in roots:
        root = root.rstrip('\r\n')
        if not root:
            continue
        root = os.path.normpath(root)
        if os.path.isfile(root):
            yield root
            continue
        for path, unused, files in os.walk(root):
            for f in files:
                yield os.path.join(path, f)
            if not recurse:
                # Only the top level directory was requested.
                break
738
+
739
def main(arglist=None):
    """Command line entry point of FIDO.

    Parses the arguments, loads versions.xml from the configuration
    directory, creates the Fido instance and identifies either the
    content of stdin or the requested files.  Unless -q is given, the
    version header and a summary are printed.

    @param arglist  list of command line arguments; sys.argv[1:] is used
                    when it is None.  An empty list shows the help.
    The function calls sys.exit() for -v, when versions.xml cannot be
    loaded and on KeyboardInterrupt.
    """
    # The argparse package was introduced in 2.7
    t0 = time.clock()
    from argparselocal import ArgumentParser, RawTextHelpFormatter
    # Work on a copy so that the caller's list is never modified.
    arglist = sys.argv[1:] if arglist is None else list(arglist)
    if not arglist:
        arglist.append("-h")
    parser = ArgumentParser(description=defaults['description'], epilog=defaults['epilog'], fromfile_prefix_chars='@', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-v', default=False, action='store_true', help='show version information')
    parser.add_argument('-q', default=False, action='store_true', help='run (more) quietly')
    parser.add_argument('-recurse', default=False, action='store_true', help='recurse into subdirectories')
    parser.add_argument('-zip', default=False, action='store_true', help='recurse into zip and tar files')
    parser.add_argument('-nocontainer', default=False, action='store_true', help='disable deep scan of container documents, increases speed but may reduce accuracy with big files')
    parser.add_argument('-pronom_only', default=False, action='store_true', help='disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-input', default=False, help='file containing a list of files to check, one per line. - means stdin')
    group.add_argument('files', nargs='*', default=[], metavar='FILE', help='files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.')
    parser.add_argument('-filename', default=None, help='filename if file contents passed through STDIN')
    parser.add_argument('-useformats', metavar='INCLUDEPUIDS', default=None, help='comma separated string of formats to use in identification')
    parser.add_argument('-nouseformats', metavar='EXCLUDEPUIDS', default=None, help='comma separated string of formats not to use in identification')
    parser.add_argument('-matchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use on match. See nomatchprintf, README.txt.')
    parser.add_argument('-nomatchprintf', metavar='FORMATSTRING', default=None, help='format string (Python style) to use if no match. See README.txt')
    parser.add_argument('-bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['bufsize'])+' bytes)')
    parser.add_argument('-container_bufsize', type=int, default=None, help='size (in bytes) of the buffer to match against (default='+str(defaults['container_bufsize'])+' bytes)')

    parser.add_argument('-loadformats', default=None, metavar='XML1,...,XMLn', help='comma separated string of XML format files to add.')
    parser.add_argument('-confdir', default=None, help='configuration directory to load_fido_xml, for example, the format specifications from.')

    # PROCESS ARGUMENTS
    args = parser.parse_args(arglist)
    # process confdir: load versions.xml and stick it in defaults
    if args.confdir:
        versionsFile = os.path.join(os.path.abspath(args.confdir), defaults['versions_file'])
    else:
        versionsFile = os.path.join(os.path.abspath(defaults['conf_dir']), defaults['versions_file'])
    try:
        versions = VET.parse(versionsFile)
    except Exception as e:
        sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
        # Signal the failure to the calling process.
        sys.exit(1)
    defaults['xml_pronomSignature'] = versions.find("pronomSignature").text
    defaults['containersignature_file'] = versions.find("pronomContainerSignature").text
    defaults['xml_fidoExtensionSignature'] = versions.find("fidoExtensionSignature").text
    defaults['format_files'] = []
    defaults['format_files'].append(defaults['xml_pronomSignature'])
    if args.pronom_only:
        versionHeader = "FIDO v{0} ({1}, {2})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'])
    else:
        versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(version,defaults['xml_pronomSignature'],defaults['containersignature_file'],defaults['xml_fidoExtensionSignature'])
        defaults['format_files'].append(defaults['xml_fidoExtensionSignature'])

    if args.v:
        sys.stdout.write(versionHeader)
        sys.exit(0)
    if args.matchprintf is not None:
        args.matchprintf = args.matchprintf.decode('string_escape')
    if args.nomatchprintf is not None:
        args.nomatchprintf = args.nomatchprintf.decode('string_escape')
    fido = Fido(quiet=args.q, bufsize=args.bufsize, container_bufsize=args.container_bufsize,
                printmatch=args.matchprintf, printnomatch=args.nomatchprintf, zip=args.zip, nocontainer=args.nocontainer, conf_dir=args.confdir)

    #TODO: Allow conf options to be dis-included
    if args.loadformats:
        for format_file in args.loadformats.split(','):
            fido.load_fido_xml(format_file)

    #TODO: remove from maps
    if args.useformats:
        args.useformats = args.useformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text in args.useformats]
    elif args.nouseformats:
        args.nouseformats = args.nouseformats.split(',')
        fido.formats = [f for f in fido.formats if f.find('puid').text not in args.nouseformats]

    # Set up to use stdin, or open input files:
    input_list = None
    if args.input == '-':
        args.files = sys.stdin
    elif args.input:
        input_list = open(args.input, 'r')
        args.files = input_list

    # RUN
    try:
        if not args.q:
            sys.stderr.write(versionHeader)
            sys.stderr.flush()
        if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
            if fido.zip == True:
                # Once supported this is expected to call
                # fido.identify_multi_object_stream(sys.stdin).
                raise RuntimeError("Multiple content read from stdin not yet supported.")
            else:
                fido.identify_stream(sys.stdin, args.filename)
        else:
            for path in list_files(args.files, args.recurse):
                fido.identify_file(path)
    except KeyboardInterrupt:
        msg = "FIDO: Interrupt while identifying file {0}"
        sys.stderr.write(msg.format(fido.current_file))
        sys.exit(1)
    finally:
        # Close the list of files opened for -input, if any.
        if input_list is not None:
            input_list.close()

    if not args.q:
        sys.stdout.flush()
        fido.print_summary(time.clock() - t0)
        sys.stderr.flush()
852
+
853
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()