libis-format 0.9.5-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (207) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +18 -0
  4. data/.travis.yml +41 -0
  5. data/Gemfile +5 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +39 -0
  8. data/Rakefile +8 -0
  9. data/bin/droid +15 -0
  10. data/bin/fido +12 -0
  11. data/bin/pdf_copy +13 -0
  12. data/data/ISOcoated_v2_eci.icc +0 -0
  13. data/data/PDFA_def.ps +40 -0
  14. data/data/ead.xsd +2728 -0
  15. data/data/eciRGB_v2.icc +0 -0
  16. data/data/lias_formats.xml +106 -0
  17. data/data/types.yml +217 -0
  18. data/lib/libis/format/config.rb +35 -0
  19. data/lib/libis/format/converter/base.rb +101 -0
  20. data/lib/libis/format/converter/chain.rb +167 -0
  21. data/lib/libis/format/converter/image_converter.rb +214 -0
  22. data/lib/libis/format/converter/office_converter.rb +50 -0
  23. data/lib/libis/format/converter/pdf_converter.rb +139 -0
  24. data/lib/libis/format/converter/repository.rb +98 -0
  25. data/lib/libis/format/converter.rb +11 -0
  26. data/lib/libis/format/droid.rb +45 -0
  27. data/lib/libis/format/fido.rb +102 -0
  28. data/lib/libis/format/identifier.rb +189 -0
  29. data/lib/libis/format/office_to_pdf.rb +52 -0
  30. data/lib/libis/format/pdf_copy.rb +40 -0
  31. data/lib/libis/format/pdf_merge.rb +41 -0
  32. data/lib/libis/format/pdf_split.rb +39 -0
  33. data/lib/libis/format/pdf_to_pdfa.rb +76 -0
  34. data/lib/libis/format/pdfa_validator.rb +61 -0
  35. data/lib/libis/format/type_database.rb +170 -0
  36. data/lib/libis/format/version.rb +5 -0
  37. data/lib/libis/format.rb +23 -0
  38. data/lib/libis-format.rb +1 -0
  39. data/libis-format.gemspec +34 -0
  40. data/spec/converter_spec.rb +212 -0
  41. data/spec/data/Cevennes2.bmp +0 -0
  42. data/spec/data/Cevennes2.jp2 +0 -0
  43. data/spec/data/Cevennes2.ppm +22492 -0
  44. data/spec/data/test-ead.xml +392 -0
  45. data/spec/data/test-jpg.tif +0 -0
  46. data/spec/data/test-lzw.tif +0 -0
  47. data/spec/data/test-options.jpg +0 -0
  48. data/spec/data/test.bmp +0 -0
  49. data/spec/data/test.doc +0 -0
  50. data/spec/data/test.docx +0 -0
  51. data/spec/data/test.gif +0 -0
  52. data/spec/data/test.jpg +0 -0
  53. data/spec/data/test.ods +0 -0
  54. data/spec/data/test.odt +0 -0
  55. data/spec/data/test.pdf +0 -0
  56. data/spec/data/test.pdf.tif +0 -0
  57. data/spec/data/test.png +0 -0
  58. data/spec/data/test.ps +8631 -0
  59. data/spec/data/test.psd +0 -0
  60. data/spec/data/test.rtf +1455 -0
  61. data/spec/data/test.tif +0 -0
  62. data/spec/data/test.txt +12 -0
  63. data/spec/data/test.xcf +0 -0
  64. data/spec/data/test.xls +0 -0
  65. data/spec/data/test.xlsx +0 -0
  66. data/spec/data/test.xml +4 -0
  67. data/spec/data/test_pdfa.pdf +0 -0
  68. data/spec/identifier_spec.rb +60 -0
  69. data/spec/spec_helper.rb +9 -0
  70. data/spec/test_types.yml +12 -0
  71. data/spec/type_database_spec.rb +140 -0
  72. data/tools/PdfTool.jar +0 -0
  73. data/tools/bcpkix-jdk15on-1.49.jar +0 -0
  74. data/tools/bcprov-jdk15on-1.49.jar +0 -0
  75. data/tools/droid/DROID_SignatureFile_V82.xml +32681 -0
  76. data/tools/droid/container-signature-20150307.xml +2235 -0
  77. data/tools/droid/droid-command-line-6.1.5.jar +0 -0
  78. data/tools/droid/droid.bat +154 -0
  79. data/tools/droid/droid.sh +138 -0
  80. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  81. data/tools/droid/lib/activation-1.1.jar +0 -0
  82. data/tools/droid/lib/antlr-2.7.7.jar +0 -0
  83. data/tools/droid/lib/antlr-3.2.jar +0 -0
  84. data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
  85. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  86. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  87. data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
  88. data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
  89. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  90. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  91. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  92. data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
  93. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  94. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  95. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  96. data/tools/droid/lib/commons-codec-1.4.jar +0 -0
  97. data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
  98. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  99. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  100. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  101. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  102. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  103. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  104. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  105. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  106. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  107. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  108. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  109. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  110. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  111. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  112. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  113. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  114. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  115. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  116. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  117. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  118. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  119. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  120. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  121. data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
  122. data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
  123. data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
  124. data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
  125. data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
  126. data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
  127. data/tools/droid/lib/droid-help-6.1.5.jar +0 -0
  128. data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
  129. data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
  130. data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
  131. data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
  132. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  133. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  134. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  135. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  136. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  137. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  138. data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
  139. data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
  140. data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
  141. data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
  142. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  143. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  144. data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
  145. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  146. data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
  147. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  148. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  149. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  150. data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
  151. data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
  152. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  153. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  154. data/tools/droid/lib/jta-1.1.jar +0 -0
  155. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  156. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  157. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  158. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  159. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  160. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  161. data/tools/droid/lib/poi-3.7.jar +0 -0
  162. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  163. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  164. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  165. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  166. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  167. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  168. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  169. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  170. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  171. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  172. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  173. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  174. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  175. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  176. data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
  177. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  178. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  179. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  180. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  181. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  182. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  183. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  184. data/tools/droid/lib/xz-1.0.jar +0 -0
  185. data/tools/fido/__init__.py +0 -0
  186. data/tools/fido/argparselocal.py +2355 -0
  187. data/tools/fido/conf/DROID_SignatureFile-v81.xml +2 -0
  188. data/tools/fido/conf/container-signature-20150307.xml +2238 -0
  189. data/tools/fido/conf/dc.xsd +119 -0
  190. data/tools/fido/conf/dcmitype.xsd +53 -0
  191. data/tools/fido/conf/dcterms.xsd +383 -0
  192. data/tools/fido/conf/fido-formats.xsd +173 -0
  193. data/tools/fido/conf/format_extension_template.xml +105 -0
  194. data/tools/fido/conf/format_extensions.xml +498 -0
  195. data/tools/fido/conf/formats-v81.xml +38355 -0
  196. data/tools/fido/conf/pronom-xml-v81.zip +0 -0
  197. data/tools/fido/conf/versions.xml +8 -0
  198. data/tools/fido/fido.bat +4 -0
  199. data/tools/fido/fido.py +854 -0
  200. data/tools/fido/fido.sh +5 -0
  201. data/tools/fido/prepare.py +616 -0
  202. data/tools/fido/pronomutils.py +115 -0
  203. data/tools/fido/toxml.py +52 -0
  204. data/tools/fido/update_signatures.py +171 -0
  205. data/tools/pdfbox/pdfbox-app-1.8.10.jar +0 -0
  206. data/tools/pdfbox/preflight-app-1.8.10.jar +0 -0
  207. metadata +396 -0
@@ -0,0 +1,5 @@
1
#!/usr/bin/env bash
# Run the fido.py that lives next to this script, forwarding all arguments.
# "$0" is quoted so that an install path containing spaces is not word-split.
BASEDIR=$(dirname "$0")
FIDO_PROG="$BASEDIR/fido.py"

python "$FIDO_PROG" -bufsize 1000000 -container_bufsize 1000000 -q "$@"
@@ -0,0 +1,616 @@
1
+ #!python
2
+ # -*- coding: utf-8 -*-
3
+ # Format Identification for Digital Objects
4
+
5
+ # MdR: 'reload(sys)' and 'setdefaultencoding("utf-8")' needed to fix utf-8 encoding errors
6
+ # when converting from PRONOM to FIDO format
7
+ import sys
8
+ reload(sys)
9
+ sys.setdefaultencoding("utf-8")
10
+ import cStringIO, zipfile, os
11
+ import hashlib
12
+ import urllib
13
+ from xml.etree import ElementTree as ET
14
+ from xml.etree import ElementTree as VET # versions.xml
15
+ # needed for debug
16
+ # print_r: https://github.com/marcbelmont/python-print_r
17
+ # from print_r import print_r
18
+
19
class NS:
    """Qualify XML tag names with a namespace URI for ElementTree lookups.

    Create an instance with the URI in braces, e.g.
    MYNS = NS("{http://some/uri}").  MYNS.tag then gives the qualified name
    of one tag, and MYNS("tag1/tag2") qualifies every step of a
    slash-separated path.
    """

    def __init__(self, uri):
        self.uri = uri

    def __getattr__(self, tag):
        # Only consulted for names that are not real attributes, so
        # self.uri itself is still returned unchanged.
        return self.uri + tag

    def __call__(self, path):
        qualified = []
        for step in path.split("/"):
            qualified.append(getattr(self, step))
        return "/".join(qualified)


# XHTML namespace
XHTML = NS("{http://www.w3.org/1999/xhtml}")
# TNA namespace
TNA = NS("{http://pronom.nationalarchives.gov.uk}")
35
+
36
def get_text_tna(element, tag, default=''):
    """Helper function to return the text for a tag or path using the TNA namespace.

    @param element  ElementTree element to search below
    @param tag  tag name or slash-separated path, given without namespace
    @param default  value returned when the tag is missing or has no text
    @return the text of the first match with surrounding whitespace stripped,
            or @param default
    """
    part = element.find(TNA(tag))
    # Identity tests: None is a singleton and Element objects should not be
    # compared to it with != .
    if part is None or part.text is None:
        return default
    return part.text.strip()
41
+
42
def prettify(elem):
    """Serialise @param elem and return it re-rendered by minidom as
    pretty-printed XML text.
    """
    from xml.dom import minidom
    document = minidom.parseString(ET.tostring(elem, 'UTF-8'))
    return document.toprettyxml(indent=" ")
49
+
50
class FormatInfo:
    """Convert the PRONOM format reports stored in a zip archive into the
    fido format XML and write that XML to a file.

    self.formats holds the converted 'format' ElementTree elements.
    """

    def __init__(self, pronom_files, format_list=None):
        """
        @param pronom_files  path of the zip archive with the PRONOM xml files
        @param format_list  optional iterable of 'format' elements to start with
        """
        self.info = {}
        self.formats = []
        self.pronom_files = pronom_files
        # None rather than [] as default: a list default is shared by all calls.
        for f in format_list or ():
            self.add_format(f)

    def add_format(self, fido_format):
        """Append one 'format' element to self.formats."""
        self.formats.append(fido_format)

    def save(self, dst):
        """Write the fido XML format definitions to @param dst
        """
        tree = ET.ElementTree(ET.Element('formats', {
            'version': '0.3',
            'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
            'xmlns:dc': "http://purl.org/dc/elements/1.1/",
            'xmlns:dcterms': "http://purl.org/dc/terms/"}))
        root = tree.getroot()
        for f in self.formats:
            # MdR: this skipped puids without sig, but we want them ALL
            # because puid might be matched on extension
            root.append(f)
        self.indent(root)
        with open(dst, 'wb') as out:
            # Same bytes as the former "print >>out, ..." (text plus a
            # newline), written in a form that is valid on Python 2 and 3.
            out.write(ET.tostring(root))
            out.write(b'\n')

    def indent(self, elem, level=0):
        """Set whitespace-only text/tail values in place, recursively, so that
        the serialised tree has one element per line.

        @param elem  element to indent
        @param level  nesting depth of @param elem
        """
        i = "\n" + level*" "
        if len(elem):
            if not elem.text or not elem.text.strip():
                elem.text = i + " "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            for child in elem:
                self.indent(child, level+1)
            # 'child' is now the last child: its tail precedes the closing tag
            # of elem, so it goes back to elem's own indentation.
            if not child.tail or not child.tail.strip():
                child.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i

    def load_pronom_xml(self, puid_filter=None):
        """Load the pronom XML from self.pronom_files and convert it to fido XML.
        As a side-effect, set self.formats to a list of ElementTree.Element
        If a @param puid_filter is specified, only that one will be loaded.

        Writes a message to stderr and exits with status 1 when the archive
        cannot be opened or closed.
        """
        formats = []
        try:
            archive = zipfile.ZipFile(self.pronom_files, 'r')
        except Exception as e:
            # Report the real reason; previously a NameError raised while
            # cleaning up was reported instead.
            sys.stderr.write("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e))
            sys.exit(1)
        try:
            for item in archive.infolist():
                stream = archive.open(item)
                try:
                    # Work is done here!
                    fido_format = self.parse_pronom_xml(stream, puid_filter)
                finally:
                    stream.close()
                if fido_format is not None:
                    formats.append(fido_format)
        finally:
            try:
                archive.close()
            except Exception as e:
                sys.stderr.write("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e))
                sys.exit(1)
        # Replace the formatID with puids in has_priority_over
        id_map = {}
        for element in formats:
            puid = element.find('puid').text
            pronom_id = element.find('pronom_id').text
            id_map[pronom_id] = puid
        for element in formats:
            for rel in element.findall('has_priority_over'):
                rel.text = id_map[rel.text]

        # NOTE(review): _sort_formats returns a new list and that result is
        # not used, so self.formats keeps the archive order. Confirm whether
        # sorting is wanted before changing the order of the generated file.
        self._sort_formats(formats)
        self.formats = formats

    def parse_pronom_xml(self, source, puid_filter=None):
        """Read a pronom XML from @param source, convert it to fido XML and
        @return ET.ElementTree Element representing it.
        If a @param puid_filter is specified, only that one will be loaded;
        None is returned for every other puid.

        For each reference file identified by a URL, the file is downloaded
        to compute its md5 checksum.
        """
        pronom_xml = ET.parse(source)
        pronom_root = pronom_xml.getroot()
        pronom_format = pronom_root.find(TNA('report_format_detail/FileFormat'))
        fido_format = ET.Element('format')
        # Defined up front so the error message below cannot hit an unbound
        # name when the report has no PUID identifier.
        puid = None
        # Get the base Format information
        for ident in pronom_format.findall(TNA('FileFormatIdentifier')):
            id_type = get_text_tna(ident, 'IdentifierType')
            if id_type == 'PUID':
                puid = get_text_tna(ident, 'Identifier')
                ET.SubElement(fido_format, 'puid').text = puid
                if puid_filter is not None and puid != puid_filter:
                    return None
        # A bit clumsy. I want to have puid first, then mime, then container.
        for ident in pronom_format.findall(TNA('FileFormatIdentifier')):
            id_type = get_text_tna(ident, 'IdentifierType')
            if id_type == 'MIME':
                ET.SubElement(fido_format, 'mime').text = get_text_tna(ident, 'Identifier')
            elif id_type == 'PUID':
                puid = get_text_tna(ident, 'Identifier')
                if puid == 'x-fmt/263':
                    ET.SubElement(fido_format, 'container').text = 'zip'
                elif puid == 'x-fmt/265':
                    ET.SubElement(fido_format, 'container').text = 'tar'
        ET.SubElement(fido_format, 'name').text = get_text_tna(pronom_format, 'FormatName')
        ET.SubElement(fido_format, 'version').text = get_text_tna(pronom_format, 'FormatVersion')
        ET.SubElement(fido_format, 'alias').text = get_text_tna(pronom_format, 'FormatAliases')
        ET.SubElement(fido_format, 'pronom_id').text = get_text_tna(pronom_format, 'FormatID')
        # Get the extensions from the ExternalSignature
        for x in pronom_format.findall(TNA('ExternalSignature')):
            ET.SubElement(fido_format, 'extension').text = get_text_tna(x, 'Signature')
        for ident in pronom_format.findall(TNA('FileFormatIdentifier')):
            id_type = get_text_tna(ident, 'IdentifierType')
            if id_type == 'Apple Uniform Type Identifier':
                ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(ident, 'Identifier')
        # Handle the relationships
        for x in pronom_format.findall(TNA('RelatedFormat')):
            rel = get_text_tna(x, 'RelationshipType')
            if rel == 'Has priority over':
                ET.SubElement(fido_format, 'has_priority_over').text = get_text_tna(x, 'RelatedFormatID')
        # Get the InternalSignature information
        for pronom_sig in pronom_format.findall(TNA('InternalSignature')):
            fido_sig = ET.SubElement(fido_format, 'signature')
            ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
            # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
            ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote').encode('UTF-8')
            for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
                fido_pat = ET.SubElement(fido_sig, 'pattern')
                pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
                byte_seq = get_text_tna(pronom_pat, 'ByteSequenceValue')
                offset = get_text_tna(pronom_pat, 'Offset')
                max_offset = get_text_tna(pronom_pat, 'MaxOffset')
                regex = convert_to_regex(byte_seq, 'Little', pos, offset, max_offset)
                if regex == "__INCOMPATIBLE_SIG__":
                    sys.stderr.write("Error: incompatible PRONOM signature found for puid {0} , skipping...\n".format(puid))
                    # remove the empty 'signature' nodes
                    # now that the signature is not compatible and thus "regex" is empty
                    # NOTE(review): this removes every signature added to this
                    # format so far, not only the current one - confirm intent.
                    for r in fido_format.findall('signature'):
                        fido_format.remove(r)
                    continue
                ET.SubElement(fido_pat, 'position').text = pos
                ET.SubElement(fido_pat, 'pronom_pattern').text = byte_seq
                ET.SubElement(fido_pat, 'regex').text = regex
        # Get the format details
        fido_details = ET.SubElement(fido_format, 'details')
        ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription').encode('utf8')
        ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
        ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
        ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
        for x in pronom_format.findall(TNA('RelatedFormat')):
            rel = get_text_tna(x, 'RelationshipType')
            if rel == 'Is supertype of':
                ET.SubElement(fido_details, 'is_supertype_of').text = get_text_tna(x, 'RelatedFormatID')
        for x in pronom_format.findall(TNA('RelatedFormat')):
            rel = get_text_tna(x, 'RelationshipType')
            if rel == 'Is subtype of':
                ET.SubElement(fido_details, 'is_subtype_of').text = get_text_tna(x, 'RelatedFormatID')
        ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
        # References
        for x in pronom_format.findall(TNA("Document")):
            r = ET.SubElement(fido_details, 'reference')
            ET.SubElement(r, 'dc:title').text = get_text_tna(x, 'TitleText')
            ET.SubElement(r, 'dc:creator').text = get_text_tna(x, 'Author/AuthorCompoundName')
            ET.SubElement(r, 'dc:publisher').text = get_text_tna(x, 'Publisher/PublisherCompoundName')
            ET.SubElement(r, 'dcterms:available').text = get_text_tna(x, 'PublicationDate')
            for ident in x.findall(TNA('DocumentIdentifier')):
                id_type = get_text_tna(ident, 'IdentifierType')
                if id_type == 'URL':
                    ET.SubElement(r, 'dc:identifier').text = "http://"+get_text_tna(ident, 'Identifier')
                else:
                    ET.SubElement(r, 'dc:identifier').text = id_type+":"+get_text_tna(ident, 'Identifier')
            ET.SubElement(r, 'dc:description').text = get_text_tna(x, 'DocumentNote')
            ET.SubElement(r, 'dc:type').text = get_text_tna(x, 'DocumentType')
            ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription')+" "+get_text_tna(x, 'AvailabilityNote')
            ET.SubElement(r, 'dc:rights').text = get_text_tna(x, 'DocumentIPR')
        # Examples
        for x in pronom_format.findall(TNA("ReferenceFile")):
            rf = ET.SubElement(fido_details, 'example_file')
            ET.SubElement(rf, 'dc:title').text = get_text_tna(x, 'ReferenceFileName')
            ET.SubElement(rf, 'dc:description').text = get_text_tna(x, 'ReferenceFileDescription')
            checksum = ""
            for ident in x.findall(TNA('ReferenceFileIdentifier')):
                id_type = get_text_tna(ident, 'IdentifierType')
                if id_type == 'URL':
                    url = "http://"+get_text_tna(ident, 'Identifier')
                    ET.SubElement(rf, 'dc:identifier').text = url
                    # And calculate the checksum of this resource:
                    m = hashlib.md5()
                    sock = urllib.urlopen(url)
                    try:
                        m.update(sock.read())
                    finally:
                        # close the connection even when reading fails
                        sock.close()
                    checksum = m.hexdigest()
                else:
                    ET.SubElement(rf, 'dc:identifier').text = id_type+":"+get_text_tna(ident, 'Identifier')
            ET.SubElement(rf, 'dcterms:license').text = ""
            ET.SubElement(rf, 'dc:rights').text = get_text_tna(x, 'ReferenceFileIPR')
            checksumElement = ET.SubElement(rf, 'checksum')
            checksumElement.text = checksum
            checksumElement.attrib['type'] = "md5"
        # Record Metadata
        md = ET.SubElement(fido_details, 'record_metadata')
        ET.SubElement(md, 'status').text = 'unknown'
        ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
        ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
        ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
        ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription').encode('utf8')
        return fido_format

    #FIXME: I don't think that this quite works yet!
    def _sort_formats(self, formatlist):
        """Sort the format list based on their priority relationships so higher priority
        formats appear earlier in the list.

        @return a new sorted list; @param formatlist itself is not modified
        """
        def compare_formats(f1, f2):
            f1ID = f1.find('puid').text
            f2ID = f2.find('puid').text
            for worse in f1.findall('has_priority_over'):
                if worse.text == f2ID:
                    return -1
            for worse in f2.findall('has_priority_over'):
                if worse.text == f1ID:
                    return 1
            if f1ID < f2ID:
                return -1
            elif f1ID == f2ID:
                return 0
            else:
                return 1
        try:
            from functools import cmp_to_key
        except ImportError:
            # interpreters without cmp_to_key still accept sorted(cmp=...)
            return sorted(formatlist, cmp=compare_formats)
        return sorted(formatlist, key=cmp_to_key(compare_formats))
293
+
294
def fido_position(pronom_position):
    """@return the short fido code (BOF/EOF/VAR/IFB) for a verbose pronom
    position name. An unknown name is reported on stderr and mapped to VAR.
    """
    known_positions = (
        ('Absolute from BOF', 'BOF'),
        ('Absolute from EOF', 'EOF'),
        ('Variable', 'VAR'),
        ('Indirect From BOF', 'IFB'),
    )
    for pronom_name, fido_name in known_positions:
        if pronom_position == pronom_name:
            return fido_name
    # to make sure FIDO does not crash (IFB aftermath)
    sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
    return 'VAR'
308
+
309
def _convert_err_msg(msg, c, i, chars):
    """Format a conversion error for character(s) @param c at index @param i
    of the pronom sequence @param chars, with a caret placed under that index.
    The text written so far is read from the module-level 'buf' that
    convert_to_regex sets.
    """
    caret_pad = i * ' '
    template = "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}"
    return template.format(msg, c, i, chars, caret_pad, buf.getvalue())
311
+
312
def doByte(chars, i, littleendian):
    """Convert the two hex digits chars[i] and chars[i+1] into a byte.
    When @param littleendian is true chars[i] is the high digit, otherwise
    chars[i+1] is.
    @return a tuple (regex-escaped byte, 2)
    Raises Exception when either character is not a hex digit.
    """
    hex_digits = '0123456789ABCDEF'
    first = hex_digits.find(chars[i].upper())
    second = hex_digits.find(chars[i + 1].upper())
    if first < 0 or second < 0:
        raise Exception(_convert_err_msg('bad byte sequence', chars[i:i + 2], i, chars))
    if littleendian:
        high, low = first, second
    else:
        high, low = second, first
    return (escape(chr(16 * high + low)), 2)
325
+
326
+ # \a\b\n\r\t\v
327
+ # MdR: took '<' and '>' out of _ordinary because they were converted to entities &lt;&gt;
328
+ # MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
329
# characters that are copied into a regex unchanged
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
# characters that get a backslash in front of them
_special = '$()*+.?![]^\\{|}'
_hex = '0123456789abcdef'


def _escape_char(c):
    """Return the regex escape for one character that is not in _ordinary:
    \\n and \\r for newline and carriage return, a backslashed character for
    those in _special, and a two-digit \\xhh code for everything else.
    """
    if c in '\n':
        return '\\n'
    if c == '\r':
        return '\\r'
    if c in _special:
        return '\\' + c
    code = ord(c)
    return '\\x' + _hex[code >> 4] + _hex[code & 0x0f]


def escape(string):
    "Escape characters in pattern that are non-printable, non-ascii, or special for regexes."
    pieces = []
    for c in string:
        if c in _ordinary:
            pieces.append(c)
        else:
            pieces.append(_escape_char(c))
    return ''.join(pieces)
346
+
347
def calculate_repetition(char, pos, offset, maxoffset):
    """
    Build the regex repetition of @param char for an offset/maxoffset pair.
    Recursively splits the repetition into several counts when one or both
    offsets is greater than 65535 bytes (64KB)
    see: bugs.python.org/issue13169
    Otherwise it returns the {offset,maxoffset}

    @param char  regex atom to repeat
    @param pos  fido position; only BOF, EOF and IFB produce output
    @param offset  minimum count as a string of digits ('0' for none)
    @param maxoffset  maximum count as a string of digits, or None
    @return the repetition text, '' when nothing has to be skipped
    """
    parts = []

    calcremain = False
    # Remainders are passed to the recursive call, so they use the same
    # conventions as the arguments: '0' for "no offset" and None for "no
    # maximum". (Integer 0 defaults used to reach ',' + maxoffset below and
    # raise TypeError when only the offset exceeded the limit.)
    offsetremain = '0'
    maxoffsetremain = None

    if offset is not None and offset != '':
        if int(offset) > 65535:
            offsetremain = str(int(offset) - 65535)
            offset = '65535'
            calcremain = True
    if maxoffset is not None and maxoffset != '':
        if int(maxoffset) > 65535:
            maxoffsetremain = str(int(maxoffset) - 65535)
            maxoffset = '65535'
            calcremain = True

    if pos == "BOF" or pos == "EOF":
        if offset != '0':
            # {offset} or {offset,maxoffset}
            text = char + '{' + str(offset)
            if maxoffset is not None:
                text += ',' + maxoffset
            parts.append(text + '}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if pos == "IFB":
        if offset != '0':
            # {offset,maxoffset}, or the open-ended {offset,} without a maximum
            text = char + '{' + str(offset) + ','
            if maxoffset is not None:
                text += maxoffset
            parts.append(text + '}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if calcremain:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offsetremain, maxoffsetremain))

    return ''.join(parts)
397
+
398
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
    """Convert
    @param chars, a pronom bytesequence, into a
    @return regular expression, or the string "__INCOMPATIBLE_SIG__" when a
    bracketed range is not of the form [xx:yy].

    @param endianness  when it contains 'Big', the two hex digits of every
                       byte are read in reverse order (see doByte)
    @param pos  fido position code; BOF and IFB anchor at the start with \\A,
                EOF anchors at the end with \\Z
    @param offset  string of digits, '' is treated as '0'
    @param maxoffset  string of digits, '' is treated as "no maximum"

    Raises Exception for characters that are not allowed in the current state.
    """
    littleendian = 'Big' not in endianness
    if not offset:
        offset = '0'
    if not maxoffset:
        maxoffset = None
    # make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
    global buf
    buf = cStringIO.StringIO()
    buf.write("(?s)")  # If a regex starts with (?s), it is equivalent to DOTALL.
    i = 0
    state = 'start'
    if 'BOF' in pos:
        buf.write('\\A')  # start of regex
        buf.write(calculate_repetition('.', pos, offset, maxoffset))

    if 'IFB' in pos:
        buf.write('\\A')
        buf.write(calculate_repetition('.', pos, offset, maxoffset))

    while i != len(chars):
        if state == 'start':
            # 'start' consumes nothing: it only selects the state that
            # handles the character at position i.
            if chars[i].isalnum():
                state = 'bytes'
            elif chars[i] == '[' and chars[i + 1] == '!':
                state = 'non-match'
            elif chars[i] == '[':
                state = 'bracket'
            elif chars[i] == '{':
                state = 'curly'
            elif chars[i] == '(':
                state = 'paren'
            elif chars[i] in '*+?':
                state = 'specials'
            else:
                raise Exception(_convert_err_msg('Illegal character in start', chars[i], i, chars))
        elif state == 'bytes':
            (byt, inc) = doByte(chars, i, littleendian)
            buf.write(byt)
            i += inc
            state = 'start'
        elif state == 'non-match':
            # NOTE(review): '(!' opens an ordinary group that matches a
            # literal '!'; it is not a regex negation. Confirm the intended
            # handling of pronom [!..] before relying on it.
            buf.write('(!')
            i += 2
            while True:
                if chars[i].isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                elif chars[i] == ']':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in non-match', chars[i], i, chars))
            buf.write(')')
            i += 1
            state = 'start'

        elif state == 'bracket':
            try:
                buf.write('[')
                i += 1
                (byt, inc) = doByte(chars, i, littleendian)
                buf.write(byt)
                i += inc
                if chars[i] != ':':
                    return "__INCOMPATIBLE_SIG__"
                buf.write('-')
                i += 1
                (byt, inc) = doByte(chars, i, littleendian)
                buf.write(byt)
                i += inc
                if chars[i] != ']':
                    return "__INCOMPATIBLE_SIG__"
                buf.write(']')
                i += 1
            except Exception:
                # Same output as the former print statement, in a form that
                # is valid on Python 2 and 3.
                sys.stdout.write(_convert_err_msg('Illegal character in bracket', chars[i], i, chars) + '\n')
                raise
            if i < len(chars) and chars[i] == '{':
                state = 'curly-after-bracket'
            else:
                state = 'start'
        elif state == 'paren':
            buf.write('(?:')
            i += 1
            while True:
                if chars[i].isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                elif chars[i] == '|':
                    buf.write('|')
                    i += 1
                elif chars[i] == ')':
                    break
                # START fix FIDO-20
                elif chars[i] == '[':
                    buf.write('[')
                    i += 1
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                    if chars[i] != ':':
                        return "__INCOMPATIBLE_SIG__"
                    buf.write('-')
                    i += 1
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc

                    if chars[i] != ']':
                        return "__INCOMPATIBLE_SIG__"
                    buf.write(']')
                    i += 1
                else:
                    raise Exception(_convert_err_msg(('Current state = \'{0}\' : Illegal character in paren').format(state), chars[i], i, chars))
            buf.write(')')
            i += 1
            state = 'start'
            # END fix FIDO-20
        elif state in ['curly', 'curly-after-bracket']:
            # {nnnn} or {nnn-nnn} or {nnn-*}
            # {nnn} or {nnn,nnn} or {nnn,}
            # when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
            # The above, while sensible, appears to be incorrect. A '.' is always needed.
            # for droid equiv behavior
            buf.write('.')
            buf.write('{')
            i += 1  # skip the {
            while True:
                if chars[i].isalnum():
                    buf.write(chars[i])
                    i += 1
                elif chars[i] == '-':
                    buf.write(',')
                    i += 1
                elif chars[i] == '*':  # skip the *
                    i += 1
                elif chars[i] == '}':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in curly', chars[i], i, chars))
            buf.write('}')
            i += 1  # skip the }
            state = 'start'
        elif state == 'specials':
            if chars[i] == '*':
                buf.write('.*')
                i += 1
            elif chars[i] == '+':
                buf.write('.+')
                i += 1
            elif chars[i] == '?':
                if chars[i + 1] != '?':
                    raise Exception(_convert_err_msg('Illegal character after ?', chars[i + 1], i + 1, chars))
                buf.write('.?')
                i += 2
            state = 'start'
        else:
            raise Exception('Illegal state {0}'.format(state))

    if 'EOF' in pos:
        buf.write(calculate_repetition('.', pos, offset, maxoffset))
        buf.write('\\Z')

    val = buf.getvalue()
    buf.close()
    return val
583
+
584
def main(arg=None):
    """Convert the PRONOM zip archive into the fido format xml file.

    @param arg  list of command line arguments; sys.argv[1:] when None

    The default input and output paths are built from conf/versions.xml next
    to this file. Exits with status 1 when versions.xml cannot be parsed.
    """
    import sys
    from argparselocal import ArgumentParser
    if arg is not None:
        arglist = arg
    else:
        arglist = sys.argv[1:]
    mydir = os.path.abspath(os.path.dirname(__file__))
    # parse version file to fetch versions
    versionsFile = os.path.join(mydir, 'conf', 'versions.xml')
    try:
        versions = VET.parse(versionsFile)
    except Exception as e:
        sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
        # non-zero status so that callers can detect the failure
        sys.exit(1)
    xml_pronomSignature = os.path.join(mydir, 'conf', versions.find('pronomSignature').text)
    xml_pronomZipFile = os.path.join(mydir, 'conf', "pronom-xml-v{0}.zip".format(versions.find('pronomVersion').text))
    parser = ArgumentParser(description='Produce the fido format xml that is loaded at run-time')
    parser.add_argument('-input', default=xml_pronomZipFile, help='input file, a zip containing Pronom xml files')
    parser.add_argument('-output', default=xml_pronomSignature, help='output file')
    parser.add_argument('-puid', default=None, help='a particular PUID record to extract')
    # PROCESS ARGUMENTS
    args = parser.parse_args(arglist)
    info = FormatInfo(args.input)
    info.load_pronom_xml(args.puid)
    info.save(args.output)
    # Same text as the former "print >> sys.stderr, ..." including the newline.
    sys.stderr.write('Converted {0} PRONOM formats to FIDO signatures\n'.format(len(info.formats)))


if __name__ == '__main__':
    main()