libis-format 0.9.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +18 -0
  4. data/.travis.yml +41 -0
  5. data/Gemfile +5 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +39 -0
  8. data/Rakefile +8 -0
  9. data/bin/droid +15 -0
  10. data/bin/fido +12 -0
  11. data/bin/pdf_copy +13 -0
  12. data/data/ISOcoated_v2_eci.icc +0 -0
  13. data/data/PDFA_def.ps +40 -0
  14. data/data/ead.xsd +2728 -0
  15. data/data/eciRGB_v2.icc +0 -0
  16. data/data/lias_formats.xml +106 -0
  17. data/data/types.yml +217 -0
  18. data/lib/libis/format/config.rb +35 -0
  19. data/lib/libis/format/converter/base.rb +101 -0
  20. data/lib/libis/format/converter/chain.rb +167 -0
  21. data/lib/libis/format/converter/image_converter.rb +214 -0
  22. data/lib/libis/format/converter/office_converter.rb +50 -0
  23. data/lib/libis/format/converter/pdf_converter.rb +139 -0
  24. data/lib/libis/format/converter/repository.rb +98 -0
  25. data/lib/libis/format/converter.rb +11 -0
  26. data/lib/libis/format/droid.rb +45 -0
  27. data/lib/libis/format/fido.rb +102 -0
  28. data/lib/libis/format/identifier.rb +189 -0
  29. data/lib/libis/format/office_to_pdf.rb +52 -0
  30. data/lib/libis/format/pdf_copy.rb +40 -0
  31. data/lib/libis/format/pdf_merge.rb +41 -0
  32. data/lib/libis/format/pdf_split.rb +39 -0
  33. data/lib/libis/format/pdf_to_pdfa.rb +76 -0
  34. data/lib/libis/format/pdfa_validator.rb +61 -0
  35. data/lib/libis/format/type_database.rb +170 -0
  36. data/lib/libis/format/version.rb +5 -0
  37. data/lib/libis/format.rb +23 -0
  38. data/lib/libis-format.rb +1 -0
  39. data/libis-format.gemspec +34 -0
  40. data/spec/converter_spec.rb +212 -0
  41. data/spec/data/Cevennes2.bmp +0 -0
  42. data/spec/data/Cevennes2.jp2 +0 -0
  43. data/spec/data/Cevennes2.ppm +22492 -0
  44. data/spec/data/test-ead.xml +392 -0
  45. data/spec/data/test-jpg.tif +0 -0
  46. data/spec/data/test-lzw.tif +0 -0
  47. data/spec/data/test-options.jpg +0 -0
  48. data/spec/data/test.bmp +0 -0
  49. data/spec/data/test.doc +0 -0
  50. data/spec/data/test.docx +0 -0
  51. data/spec/data/test.gif +0 -0
  52. data/spec/data/test.jpg +0 -0
  53. data/spec/data/test.ods +0 -0
  54. data/spec/data/test.odt +0 -0
  55. data/spec/data/test.pdf +0 -0
  56. data/spec/data/test.pdf.tif +0 -0
  57. data/spec/data/test.png +0 -0
  58. data/spec/data/test.ps +8631 -0
  59. data/spec/data/test.psd +0 -0
  60. data/spec/data/test.rtf +1455 -0
  61. data/spec/data/test.tif +0 -0
  62. data/spec/data/test.txt +12 -0
  63. data/spec/data/test.xcf +0 -0
  64. data/spec/data/test.xls +0 -0
  65. data/spec/data/test.xlsx +0 -0
  66. data/spec/data/test.xml +4 -0
  67. data/spec/data/test_pdfa.pdf +0 -0
  68. data/spec/identifier_spec.rb +60 -0
  69. data/spec/spec_helper.rb +9 -0
  70. data/spec/test_types.yml +12 -0
  71. data/spec/type_database_spec.rb +140 -0
  72. data/tools/PdfTool.jar +0 -0
  73. data/tools/bcpkix-jdk15on-1.49.jar +0 -0
  74. data/tools/bcprov-jdk15on-1.49.jar +0 -0
  75. data/tools/droid/DROID_SignatureFile_V82.xml +32681 -0
  76. data/tools/droid/container-signature-20150307.xml +2235 -0
  77. data/tools/droid/droid-command-line-6.1.5.jar +0 -0
  78. data/tools/droid/droid.bat +154 -0
  79. data/tools/droid/droid.sh +138 -0
  80. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  81. data/tools/droid/lib/activation-1.1.jar +0 -0
  82. data/tools/droid/lib/antlr-2.7.7.jar +0 -0
  83. data/tools/droid/lib/antlr-3.2.jar +0 -0
  84. data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
  85. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  86. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  87. data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
  88. data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
  89. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  90. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  91. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  92. data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
  93. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  94. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  95. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  96. data/tools/droid/lib/commons-codec-1.4.jar +0 -0
  97. data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
  98. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  99. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  100. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  101. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  102. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  103. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  104. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  105. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  106. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  107. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  108. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  109. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  110. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  111. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  112. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  113. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  114. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  115. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  116. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  117. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  118. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  119. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  120. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  121. data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
  122. data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
  123. data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
  124. data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
  125. data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
  126. data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
  127. data/tools/droid/lib/droid-help-6.1.5.jar +0 -0
  128. data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
  129. data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
  130. data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
  131. data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
  132. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  133. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  134. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  135. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  136. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  137. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  138. data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
  139. data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
  140. data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
  141. data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
  142. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  143. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  144. data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
  145. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  146. data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
  147. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  148. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  149. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  150. data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
  151. data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
  152. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  153. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  154. data/tools/droid/lib/jta-1.1.jar +0 -0
  155. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  156. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  157. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  158. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  159. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  160. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  161. data/tools/droid/lib/poi-3.7.jar +0 -0
  162. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  163. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  164. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  165. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  166. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  167. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  168. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  169. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  170. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  171. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  172. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  173. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  174. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  175. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  176. data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
  177. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  178. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  179. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  180. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  181. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  182. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  183. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  184. data/tools/droid/lib/xz-1.0.jar +0 -0
  185. data/tools/fido/__init__.py +0 -0
  186. data/tools/fido/argparselocal.py +2355 -0
  187. data/tools/fido/conf/DROID_SignatureFile-v81.xml +2 -0
  188. data/tools/fido/conf/container-signature-20150307.xml +2238 -0
  189. data/tools/fido/conf/dc.xsd +119 -0
  190. data/tools/fido/conf/dcmitype.xsd +53 -0
  191. data/tools/fido/conf/dcterms.xsd +383 -0
  192. data/tools/fido/conf/fido-formats.xsd +173 -0
  193. data/tools/fido/conf/format_extension_template.xml +105 -0
  194. data/tools/fido/conf/format_extensions.xml +498 -0
  195. data/tools/fido/conf/formats-v81.xml +38355 -0
  196. data/tools/fido/conf/pronom-xml-v81.zip +0 -0
  197. data/tools/fido/conf/versions.xml +8 -0
  198. data/tools/fido/fido.bat +4 -0
  199. data/tools/fido/fido.py +854 -0
  200. data/tools/fido/fido.sh +5 -0
  201. data/tools/fido/prepare.py +616 -0
  202. data/tools/fido/pronomutils.py +115 -0
  203. data/tools/fido/toxml.py +52 -0
  204. data/tools/fido/update_signatures.py +171 -0
  205. data/tools/pdfbox/pdfbox-app-1.8.10.jar +0 -0
  206. data/tools/pdfbox/preflight-app-1.8.10.jar +0 -0
  207. metadata +396 -0
@@ -0,0 +1,5 @@
1
#!/usr/bin/env bash
# Launcher for the bundled FIDO identifier: runs fido.py from the directory
# this script lives in, with fixed buffer sizes and quiet mode, forwarding
# all command-line arguments unchanged.

# Quote "$0" so an install path containing spaces is not word-split.
BASEDIR=$(dirname "$0")
FIDO_PROG="$BASEDIR/fido.py"

python "$FIDO_PROG" -bufsize 1000000 -container_bufsize 1000000 -q "$@"
@@ -0,0 +1,616 @@
1
+ #!python
2
+ # -*- coding: utf-8 -*-
3
+ # Format Identification for Digital Objects
4
+
5
+ # MdR: 'reload(sys)' and 'setdefaultencoding("utf-8")' needed to fix utf-8 encoding errors
6
+ # when converting from PRONOM to FIDO format
7
+ import sys
8
+ reload(sys)
9
+ sys.setdefaultencoding("utf-8")
10
+ import cStringIO, zipfile, os
11
+ import hashlib
12
+ import urllib
13
+ from xml.etree import ElementTree as ET
14
+ from xml.etree import ElementTree as VET # versions.xml
15
+ # needed for debug
16
+ # print_r: https://github.com/marcbelmont/python-print_r
17
+ # from print_r import print_r
18
+
19
class NS(object):
    """Helper class for XML name spaces in ElementTree.

    Create it with the namespace prefix, e.g. ``MYNS = NS("{http://some/uri}")``.
    Then ``MYNS.tag`` yields ``"{http://some/uri}tag"`` and
    ``MYNS("tag1/tag2")`` prefixes every '/'-separated segment of the path.
    """

    def __init__(self, uri):
        self.uri = uri

    def __getattr__(self, tag):
        """Return ``tag`` prefixed with the namespace.

        Python only calls this for attributes that normal lookup did not find.
        Raises AttributeError for special-method names and for ``uri`` itself.
        """
        # Special-method probes (copy, pickle, hasattr checks) must see a
        # missing attribute, not a string; and a missing 'uri' must not be
        # answered via self.uri, which would recurse without end.
        if tag == 'uri' or (tag.startswith('__') and tag.endswith('__')):
            raise AttributeError(tag)
        return self.uri + tag

    def __call__(self, path):
        """Return ``path`` with each '/'-separated segment namespace-prefixed."""
        # Concatenate directly instead of using getattr(): a segment that
        # names a real attribute (e.g. 'uri') must still be treated as a tag.
        return "/".join(self.uri + tag for tag in path.split("/"))


# XHTML namespace
XHTML = NS("{http://www.w3.org/1999/xhtml}")
# TNA namespace
TNA = NS("{http://pronom.nationalarchives.gov.uk}")
35
+
36
def get_text_tna(element, tag, default=''):
    """Return the stripped text of a tag or path below ``element``.

    ``tag`` is resolved in the TNA namespace. ``default`` is returned when
    the element is not found or carries no text.
    """
    node = element.find(TNA(tag))
    if node is None or node.text is None:
        return default
    return node.text.strip()
41
+
42
def prettify(elem):
    """Serialize ``elem`` and return it as a pretty-printed XML string.

    The element is written out as UTF-8, re-parsed with minidom and
    re-emitted with one space of indentation per nesting level.
    """
    from xml.dom import minidom
    return minidom.parseString(ET.tostring(elem, 'UTF-8')).toprettyxml(indent=" ")
49
+
50
class FormatInfo:
    """Convert a zip of PRONOM format reports into FIDO format definitions.

    Attributes set by the constructor:
      info         -- an empty dict (not read anywhere in this class)
      formats      -- list of fido 'format' ElementTree elements
      pronom_files -- path of the zip file holding the PRONOM XML reports
    """

    def __init__(self, pronom_files, format_list=None):
        """Remember the PRONOM zip path and register any initial formats.

        @param pronom_files path of the zip with the PRONOM XML files
        @param format_list optional iterable of formats passed to add_format
        """
        self.info = {}
        self.formats = []
        self.pronom_files = pronom_files
        # None instead of a shared mutable [] default.
        # NOTE(review): add_format is not defined in this class as shown here,
        # so a non-empty format_list raises AttributeError -- confirm.
        for f in format_list or ():
            self.add_format(f)

    def save(self, dst):
        """Write the fido XML format definitions to @param dst
        """
        root = ET.Element('formats', {'version': '0.3',
                                      'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
                                      'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
                                      'xmlns:dc': "http://purl.org/dc/elements/1.1/",
                                      'xmlns:dcterms': "http://purl.org/dc/terms/"})
        for f in self.formats:
            # MdR: puids without a signature are written as well, because a
            # puid might still be matched on extension.
            root.append(f)
        self.indent(root)
        with open(dst, 'wb') as out:
            # Same bytes as the former "print >>out, ...": the serialized tree
            # followed by one newline.
            out.write(ET.tostring(root))
            out.write(b"\n")

    def indent(self, elem, level=0):
        """Add whitespace text/tail to ``elem`` and its descendants, in place,
        so the serialized XML is indented by nesting level.
        """
        i = "\n" + level * " "
        if len(elem):
            if not elem.text or not elem.text.strip():
                elem.text = i + " "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            child = None
            for child in elem:
                self.indent(child, level + 1)
            # The last child's tail goes back to this element's own level so
            # the closing tag lines up with the opening tag.
            if not child.tail or not child.tail.strip():
                child.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i

    def load_pronom_xml(self, puid_filter=None):
        """Load the pronom XML from self.pronom_files and convert it to fido XML.
        As a side-effect, set self.formats to a list of ElementTree.Element
        If a @param puid_filter is specified, only that one will be loaded.

        Writes a message to stderr and exits with status 1 when the zip file
        cannot be opened.
        """
        formats = []
        try:
            archive = zipfile.ZipFile(self.pronom_files, 'r')
        except Exception as e:
            # Report the real cause and fail with a non-zero status.
            sys.stderr.write("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e))
            sys.exit(1)
        try:
            for item in archive.infolist():
                # Open outside the inner try so a failed open cannot reach a
                # close() on an unbound (or stale) stream.
                stream = archive.open(item)
                try:
                    # Work is done here!
                    fido_format = self.parse_pronom_xml(stream, puid_filter)
                finally:
                    stream.close()
                if fido_format is not None:
                    formats.append(fido_format)
        finally:
            archive.close()
        # Replace the formatID with puids in has_priority_over
        self._resolve_priority_ids(formats)

        # NOTE(review): _sort_formats returns a new list which is discarded
        # here, so self.formats keeps the zip order. Kept as-is because the
        # FIXME on _sort_formats says the ordering is not trusted yet.
        self._sort_formats(formats)
        self.formats = formats

    @staticmethod
    def _resolve_priority_ids(formats):
        """Rewrite each 'has_priority_over' text from a PRONOM format id to a puid."""
        id_map = {}
        for element in formats:
            id_map[element.find('pronom_id').text] = element.find('puid').text
        for element in formats:
            for rel in element.findall('has_priority_over'):
                # Ids of formats that were not loaded (always the case when a
                # puid filter is active) are left untouched instead of raising
                # KeyError.
                rel.text = id_map.get(rel.text, rel.text)

    def parse_pronom_xml(self, source, puid_filter=None):
        """Read a pronom XML from @param source, convert it to fido XML and
        @return ET.ElementTree Element representing it.
        If a @param puid_filter is specified, None is returned for any other puid.
        """
        pronom_xml = ET.parse(source)
        pronom_root = pronom_xml.getroot()
        pronom_format = pronom_root.find(TNA('report_format_detail/FileFormat'))
        fido_format = ET.Element('format')
        identifiers = pronom_format.findall(TNA('FileFormatIdentifier'))
        # Bound up front so the incompatible-signature message below cannot
        # hit an unbound name when a report carries no PUID identifier.
        puid = None
        # Get the base Format information
        for identifier in identifiers:
            if get_text_tna(identifier, 'IdentifierType') == 'PUID':
                puid = get_text_tna(identifier, 'Identifier')
                ET.SubElement(fido_format, 'puid').text = puid
                if puid_filter is not None and puid != puid_filter:
                    return None
        # A bit clumsy. I want to have puid first, then mime, then container.
        for identifier in identifiers:
            id_type = get_text_tna(identifier, 'IdentifierType')
            if id_type == 'MIME':
                ET.SubElement(fido_format, 'mime').text = get_text_tna(identifier, 'Identifier')
            elif id_type == 'PUID':
                puid = get_text_tna(identifier, 'Identifier')
                if puid == 'x-fmt/263':
                    ET.SubElement(fido_format, 'container').text = 'zip'
                elif puid == 'x-fmt/265':
                    ET.SubElement(fido_format, 'container').text = 'tar'
        ET.SubElement(fido_format, 'name').text = get_text_tna(pronom_format, 'FormatName')
        ET.SubElement(fido_format, 'version').text = get_text_tna(pronom_format, 'FormatVersion')
        ET.SubElement(fido_format, 'alias').text = get_text_tna(pronom_format, 'FormatAliases')
        ET.SubElement(fido_format, 'pronom_id').text = get_text_tna(pronom_format, 'FormatID')
        # Get the extensions from the ExternalSignature
        for ext_sig in pronom_format.findall(TNA('ExternalSignature')):
            ET.SubElement(fido_format, 'extension').text = get_text_tna(ext_sig, 'Signature')
        for identifier in identifiers:
            if get_text_tna(identifier, 'IdentifierType') == 'Apple Uniform Type Identifier':
                ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(identifier, 'Identifier')
        # Handle the relationships
        related_formats = pronom_format.findall(TNA('RelatedFormat'))
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Has priority over':
                ET.SubElement(fido_format, 'has_priority_over').text = get_text_tna(related, 'RelatedFormatID')
        # Get the InternalSignature information
        for pronom_sig in pronom_format.findall(TNA('InternalSignature')):
            fido_sig = ET.SubElement(fido_format, 'signature')
            ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
            # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
            ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote').encode('UTF-8')
            for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
                fido_pat = ET.SubElement(fido_sig, 'pattern')
                pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
                byte_sequence = get_text_tna(pronom_pat, 'ByteSequenceValue')
                offset = get_text_tna(pronom_pat, 'Offset')
                max_offset = get_text_tna(pronom_pat, 'MaxOffset')
                regex = convert_to_regex(byte_sequence, 'Little', pos, offset, max_offset)
                if regex == "__INCOMPATIBLE_SIG__":
                    sys.stderr.write("Error: incompatible PRONOM signature found for puid {0} , skipping...\n".format(puid))
                    # remove the empty 'signature' nodes
                    # now that the signature is not compatible and thus "regex" is empty
                    # NOTE(review): this drops every signature collected so far
                    # for this format, not only the current one -- confirm.
                    for stale in fido_format.findall('signature'):
                        fido_format.remove(stale)
                    continue
                ET.SubElement(fido_pat, 'position').text = pos
                ET.SubElement(fido_pat, 'pronom_pattern').text = byte_sequence
                ET.SubElement(fido_pat, 'regex').text = regex
        # Get the format details
        fido_details = ET.SubElement(fido_format, 'details')
        ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription').encode('utf8')
        ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
        ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
        ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
        # Two passes so all supertype entries precede all subtype entries.
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Is supertype of':
                ET.SubElement(fido_details, 'is_supertype_of').text = get_text_tna(related, 'RelatedFormatID')
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Is subtype of':
                ET.SubElement(fido_details, 'is_subtype_of').text = get_text_tna(related, 'RelatedFormatID')
        ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
        # References
        for document in pronom_format.findall(TNA("Document")):
            ref = ET.SubElement(fido_details, 'reference')
            ET.SubElement(ref, 'dc:title').text = get_text_tna(document, 'TitleText')
            ET.SubElement(ref, 'dc:creator').text = get_text_tna(document, 'Author/AuthorCompoundName')
            ET.SubElement(ref, 'dc:publisher').text = get_text_tna(document, 'Publisher/PublisherCompoundName')
            ET.SubElement(ref, 'dcterms:available').text = get_text_tna(document, 'PublicationDate')
            for doc_id in document.findall(TNA('DocumentIdentifier')):
                id_type = get_text_tna(doc_id, 'IdentifierType')
                if id_type == 'URL':
                    ET.SubElement(ref, 'dc:identifier').text = "http://" + get_text_tna(doc_id, 'Identifier')
                else:
                    ET.SubElement(ref, 'dc:identifier').text = id_type + ":" + get_text_tna(doc_id, 'Identifier')
            ET.SubElement(ref, 'dc:description').text = get_text_tna(document, 'DocumentNote')
            ET.SubElement(ref, 'dc:type').text = get_text_tna(document, 'DocumentType')
            ET.SubElement(ref, 'dcterms:license').text = get_text_tna(document, 'AvailabilityDescription') + " " + get_text_tna(document, 'AvailabilityNote')
            ET.SubElement(ref, 'dc:rights').text = get_text_tna(document, 'DocumentIPR')
        # Examples
        for ref_file in pronom_format.findall(TNA("ReferenceFile")):
            example = ET.SubElement(fido_details, 'example_file')
            ET.SubElement(example, 'dc:title').text = get_text_tna(ref_file, 'ReferenceFileName')
            ET.SubElement(example, 'dc:description').text = get_text_tna(ref_file, 'ReferenceFileDescription')
            checksum = ""
            for file_id in ref_file.findall(TNA('ReferenceFileIdentifier')):
                id_type = get_text_tna(file_id, 'IdentifierType')
                if id_type == 'URL':
                    url = "http://" + get_text_tna(file_id, 'Identifier')
                    ET.SubElement(example, 'dc:identifier').text = url
                    # And calculate the checksum of this resource:
                    # (this downloads the example file over the network)
                    digest = hashlib.md5()
                    sock = urllib.urlopen(url)
                    try:
                        digest.update(sock.read())
                    finally:
                        # Close the connection even if the read fails.
                        sock.close()
                    checksum = digest.hexdigest()
                else:
                    ET.SubElement(example, 'dc:identifier').text = id_type + ":" + get_text_tna(file_id, 'Identifier')
            ET.SubElement(example, 'dcterms:license').text = ""
            ET.SubElement(example, 'dc:rights').text = get_text_tna(ref_file, 'ReferenceFileIPR')
            checksum_element = ET.SubElement(example, 'checksum')
            checksum_element.text = checksum
            checksum_element.attrib['type'] = "md5"
        # Record Metadata
        md = ET.SubElement(fido_details, 'record_metadata')
        ET.SubElement(md, 'status').text = 'unknown'
        ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
        ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
        ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
        ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription').encode('utf8')
        return fido_format

    #FIXME: I don't think that this quite works yet!
    def _sort_formats(self, formatlist):
        """Sort the format list based on their priority relationships so higher priority
        formats appear earlier in the list.

        @return a new sorted list; @param formatlist itself is not modified.
        """
        # cmp_to_key replaces the Python-2-only sorted(cmp=...) argument.
        from functools import cmp_to_key

        def compare_formats(f1, f2):
            f1ID = f1.find('puid').text
            f2ID = f2.find('puid').text
            for worse in f1.findall('has_priority_over'):
                if worse.text == f2ID:
                    return -1
            for worse in f2.findall('has_priority_over'):
                if worse.text == f1ID:
                    return 1
            # No direct priority relation: fall back to comparing the puids.
            if f1ID < f2ID:
                return -1
            elif f1ID == f2ID:
                return 0
            else:
                return 1
        return sorted(formatlist, key=cmp_to_key(compare_formats))
293
+
294
def fido_position(pronom_position):
    """Map a verbose pronom position name to its short code.

    @return 'BOF', 'EOF', 'VAR' or 'IFB'. An unrecognised name is reported
    on stderr and mapped to 'VAR' so FIDO does not crash (IFB aftermath).
    """
    short_codes = {
        'Absolute from BOF': 'BOF',
        'Absolute from EOF': 'EOF',
        'Variable': 'VAR',
        'Indirect From BOF': 'IFB',
    }
    if pronom_position in short_codes:
        return short_codes[pronom_position]
    sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
    return 'VAR'
308
+
309
+ def _convert_err_msg(msg, c, i, chars):
310
+ return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
311
+
312
def doByte(chars, i, littleendian):
    """Turn the two hex digits chars[i] and chars[i+1] into one escaped byte.

    With @param littleendian true the first digit is the high nibble,
    otherwise the second one is.
    @return a tuple (escaped byte, 2); the 2 is the number of chars consumed.
    Raises Exception when either char is not a hex digit.
    """
    digits = '0123456789ABCDEF'
    first = digits.find(chars[i].upper())
    second = digits.find(chars[i + 1].upper())
    if min(first, second) < 0:
        raise Exception(_convert_err_msg('bad byte sequence', chars[i:i + 2], i, chars))
    value = 16 * first + second if littleendian else first + 16 * second
    return (escape(chr(value)), 2)
325
+
326
+ # \a\b\n\r\t\v
327
+ # MdR: took out '<' and '>' out of _ordinary because they were converted to entities &lt;&gt;
328
+ # MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
329
+ _ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
330
+ _special = '$()*+.?![]^\\{|}'
331
+ _hex = '0123456789abcdef'
332
+ def _escape_char(c):
333
+ if c in '\n':
334
+ return '\\n'
335
+ elif c == '\r':
336
+ return '\\r'
337
+ elif c in _special:
338
+ return '\\' + c
339
+ else:
340
+ (high, low) = divmod(ord(c), 16)
341
+ return '\\x' + _hex[high] + _hex[low]
342
+
343
def escape(string):
    "Return string with every character outside _ordinary replaced by its regex escape."
    pieces = []
    for ch in string:
        pieces.append(ch if ch in _ordinary else _escape_char(ch))
    return ''.join(pieces)
346
+
347
def calculate_repetition(char, pos, offset, maxoffset):
    """Build the regex repetition for an offset/maxoffset pair.

    A single {m,n} repetition may not exceed 65535 (see
    bugs.python.org/issue13169), so larger values are split into a chain of
    repetitions that is computed recursively.

    @param char the pattern to repeat (convert_to_regex passes '.')
    @param pos 'BOF', 'EOF' or 'IFB'; any other value yields ''
    @param offset minimum offset as a decimal string
    @param maxoffset maximum offset as a decimal string, or None if absent
    @return the repetition expression, e.g. '.{5,10}', or ''
    """
    limit = 65535
    needs_more = False
    # Remainders handed to the recursive call. They use the same conventions
    # as the arguments ('0' = no offset, None = no max offset); the previous
    # int 0 defaults made "offset > 65535 with no max offset" fail with a
    # TypeError on ',' + 0.
    offset_rest = '0'
    maxoffset_rest = None

    if offset is not None and offset != '' and int(offset) > limit:
        offset_rest = str(int(offset) - limit)
        offset = str(limit)
        needs_more = True
    if maxoffset is not None and maxoffset != '' and int(maxoffset) > limit:
        maxoffset_rest = str(int(maxoffset) - limit)
        maxoffset = str(limit)
        needs_more = True

    parts = []
    if pos in ("BOF", "EOF", "IFB"):
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + maxoffset)
            parts.append('}')
            if pos == "IFB" and maxoffset is None:
                # NOTE(review): this emits a literal ',}' after the closed
                # repetition (e.g. '.{5},}'); kept unchanged -- confirm intent.
                parts.append(',}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if needs_more:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offset_rest, maxoffset_rest))

    return ''.join(parts)
397
+
398
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
    """Convert
    @param chars, a pronom bytesequence, into a
    @return regular expression.

    @param endianness selects the nibble order handed to doByte: any value
    containing 'Big' gives littleendian=False, everything else True.
    @param pos position code; 'BOF'/'IFB' anchor the regex at the start with
    \\A, 'EOF' anchors it at the end with \\Z.
    @param offset, @param maxoffset decimal strings; an empty offset counts
    as '0' and an empty maxoffset as "no maximum".

    Returns the string "__INCOMPATIBLE_SIG__" instead of a regex when a
    bracketed range is not of the form [xx:yy].
    Raises Exception on characters that are not allowed in the current state.
    """

    if 'Big' in endianness:
        littleendian = False
    else:
        littleendian = True
    if len(offset) == 0:
        offset = '0'
    if len(maxoffset) == 0:
        maxoffset = None
    # make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
    global buf
    buf = cStringIO.StringIO()
    buf.write("(?s)") #If a regex starts with (?s), it is equivalent to DOTALL.
    i = 0
    state = 'start'
    if 'BOF' in pos:
        buf.write('\\A') # start of regex
        buf.write(calculate_repetition('.', pos, offset, maxoffset))

    if 'IFB' in pos:
        buf.write('\\A')
        buf.write(calculate_repetition('.', pos, offset, maxoffset))

    # State machine over chars: 'start' only inspects chars[i] to choose the
    # next state; every other state consumes input, writes to buf and then
    # returns to 'start' (or to 'curly-after-bracket').
    while True:
        if i == len(chars):
            break
        #print _convert_err_msg(state,chars[i],i,chars)
        if state == 'start':
            if chars[i].isalnum():
                state = 'bytes'
            elif chars[i] == '[' and chars[i + 1] == '!':
                state = 'non-match'
            elif chars[i] == '[':
                state = 'bracket'
            elif chars[i] == '{':
                state = 'curly'
            elif chars[i] == '(':
                state = 'paren'
            elif chars[i] in '*+?':
                state = 'specials'
            else:
                raise Exception(_convert_err_msg('Illegal character in start', chars[i], i, chars))
        elif state == 'bytes':
            # one hex pair -> one escaped byte
            (byt, inc) = doByte(chars, i, littleendian)
            buf.write(byt)
            i += inc
            state = 'start'
        elif state == 'non-match':
            # [!xx...] is written out as (!xx...)
            # NOTE(review): '(!' is not a negation in Python regex syntax --
            # confirm how the consumer of these patterns treats it.
            buf.write('(!')
            i += 2
            while True:
                if chars[i].isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                elif chars[i] == ']':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in non-match', chars[i], i, chars))
            buf.write(')')
            i += 1
            state = 'start'

        elif state == 'bracket':
            # [xx:yy] becomes the character range [xx-yy]
            try:
                buf.write('[')
                i += 1
                (byt, inc) = doByte(chars, i, littleendian)
                buf.write(byt)
                i += inc
                #assert(chars[i] == ':')
                if chars[i] != ':':
                    # note: returns without closing buf
                    return "__INCOMPATIBLE_SIG__"
                buf.write('-')
                i += 1
                (byt, inc) = doByte(chars, i, littleendian)
                buf.write(byt)
                i += inc
                #assert(chars[i] == ']')
                if chars[i] != ']':
                    return "__INCOMPATIBLE_SIG__"
                buf.write(']')
                i += 1
            except Exception:
                # print the context, then let the original exception propagate
                print _convert_err_msg('Illegal character in bracket', chars[i], i, chars)
                raise
            if i < len(chars) and chars[i] == '{':
                state = 'curly-after-bracket'
            else:
                state = 'start'
        elif state == 'paren':
            # (aa|bb|...) becomes the non-capturing group (?:aa|bb|...)
            buf.write('(?:')
            i += 1
            while True:
                if chars[i].isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                elif chars[i] == '|':
                    buf.write('|')
                    i += 1
                elif chars[i] == ')':
                    break
                # START fix FIDO-20
                # a [xx:yy] range inside the group, handled like state 'bracket'
                elif chars[i] == '[':
                    buf.write('[')
                    i += 1
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                    #assert(chars[i] == ':')
                    if chars[i] != ':':
                        return "__INCOMPATIBLE_SIG__"
                    buf.write('-')
                    i += 1
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc

                    #assert(chars[i] == ']')
                    if chars[i] != ']':
                        return "__INCOMPATIBLE_SIG__"
                    buf.write(']')
                    i += 1
                else:
                    raise Exception(_convert_err_msg(('Current state = \'{0}\' : Illegal character in paren').format(state), chars[i], i, chars))
            buf.write(')')
            i += 1
            state = 'start'
            # END fix FIDO-20
        elif state in ['curly', 'curly-after-bracket']:
            # {nnnn} or {nnn-nnn} or {nnn-*}
            # {nnn} or {nnn,nnn} or {nnn,}
            # when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
            # The above, while sensible, appears to be incorrect. A '.' is always needed.
            # for droid equiv behavior
            #if state == 'curly':
            buf.write('.')
            buf.write('{')
            i += 1 # skip the opening '{'
            while True:
                if chars[i].isalnum():
                    buf.write(chars[i])
                    i += 1
                elif chars[i] == '-':
                    # pronom writes ranges as m-n, regex as m,n
                    buf.write(',')
                    i += 1
                elif chars[i] == '*': # skip the *
                    i += 1
                elif chars[i] == '}':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in curly', chars[i], i, chars))
            buf.write('}')
            i += 1 # skip the closing '}'
            state = 'start'
        elif state == 'specials':
            if chars[i] == '*':
                buf.write('.*')
                i += 1
            elif chars[i] == '+':
                buf.write('.+')
                i += 1
            elif chars[i] == '?':
                # '??' is the only accepted form; it is written as '.?'
                if chars[i + 1] != '?':
                    raise Exception(_convert_err_msg('Illegal character after ?', chars[i + 1], i + 1, chars))
                buf.write('.?')
                i += 2
            state = 'start'
        else:
            raise Exception('Illegal state {0}'.format(state))

    if 'EOF' in pos:
        buf.write(calculate_repetition('.', pos, offset, maxoffset))
        buf.write('\\Z')

    val = buf.getvalue()
    buf.close()
    return val
583
+
584
def main(arg=None):
    """Command-line entry point: convert the PRONOM zip into the fido format XML.

    @param arg list of command-line arguments; when None, sys.argv[1:] is used.

    Default input and output paths come from conf/versions.xml next to this
    file. Writes a message to stderr and exits with status 1 when that file
    cannot be loaded.
    """
    from argparselocal import ArgumentParser
    arglist = sys.argv[1:] if arg is None else arg
    mydir = os.path.abspath(os.path.dirname(__file__))
    # parse version file to fetch versions
    versionsFile = os.path.join(mydir, 'conf', 'versions.xml')
    try:
        versions = VET.parse(versionsFile)
    except Exception as e:
        sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
        # Non-zero status so callers and shell scripts can see the failure.
        sys.exit(1)
    xml_pronomSignature = os.path.join(mydir, 'conf', versions.find('pronomSignature').text)
    xml_pronomZipFile = os.path.join(mydir, 'conf', "pronom-xml-v{0}.zip".format(versions.find('pronomVersion').text))
    parser = ArgumentParser(description='Produce the fido format xml that is loaded at run-time')
    parser.add_argument('-input', default=xml_pronomZipFile, help='input file, a zip containing Pronom xml files')
    parser.add_argument('-output', default=xml_pronomSignature, help='output file')
    parser.add_argument('-puid', default=None, help='a particular PUID record to extract')
    # PROCESS ARGUMENTS
    args = parser.parse_args(arglist)
    info = FormatInfo(args.input)
    info.load_pronom_xml(args.puid)
    info.save(args.output)
    sys.stderr.write('Converted {0} PRONOM formats to FIDO signatures\n'.format(len(info.formats)))


if __name__ == '__main__':
    main()