libis-format 0.9.32 → 0.9.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/data/types.yml +30 -16
  3. data/lib/libis/format/config.rb +7 -18
  4. data/lib/libis/format/converter/image_converter.rb +6 -0
  5. data/lib/libis/format/droid.rb +82 -25
  6. data/lib/libis/format/extension_identification.rb +55 -0
  7. data/lib/libis/format/fido.rb +57 -72
  8. data/lib/libis/format/file_tool.rb +76 -0
  9. data/lib/libis/format/identification_tool.rb +174 -0
  10. data/lib/libis/format/identifier.rb +129 -117
  11. data/lib/libis/format/type_database.rb +36 -5
  12. data/lib/libis/format/version.rb +1 -1
  13. data/lib/libis/format.rb +3 -0
  14. data/libis-format.gemspec +2 -1
  15. data/spec/converter_spec.rb +6 -4
  16. data/spec/identifier_spec.rb +125 -34
  17. metadata +21 -126
  18. data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
  19. data/tools/droid/container-signature-20170330.xml +0 -3584
  20. data/tools/droid/droid-command-line-6.3.jar +0 -0
  21. data/tools/droid/droid.bat +0 -152
  22. data/tools/droid/droid.sh +0 -152
  23. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  24. data/tools/droid/lib/activation-1.1.jar +0 -0
  25. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  26. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  27. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  28. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  29. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  30. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  31. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  32. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  33. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  34. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  35. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  36. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  37. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  38. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  39. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  40. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  41. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  42. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  43. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  44. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  45. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  46. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  47. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  48. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  49. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  50. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  51. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  52. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  53. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  54. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  55. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  56. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  57. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  58. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  59. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  60. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  61. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  62. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  63. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  64. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  65. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  66. data/tools/droid/lib/droid-help-6.3.jar +0 -0
  67. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  68. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  69. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  70. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  71. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  72. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  73. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  74. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  75. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  76. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  77. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  78. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  79. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  80. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  81. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  82. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  83. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  84. data/tools/droid/lib/jta-1.1.jar +0 -0
  85. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  86. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  87. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  88. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  89. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  90. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  91. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  92. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  93. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  94. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  95. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  96. data/tools/droid/lib/poi-3.13.jar +0 -0
  97. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  98. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  99. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  100. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  101. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  102. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  103. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  104. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  105. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  106. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  107. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  108. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  109. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  110. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  111. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  112. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  113. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  114. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  115. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  116. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  117. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  118. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  119. data/tools/droid/lib/xz-1.0.jar +0 -0
  120. data/tools/fido/__init__.py +0 -50
  121. data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
  122. data/tools/fido/conf/container-signature-20170330.xml +0 -3584
  123. data/tools/fido/conf/dc.xsd +0 -119
  124. data/tools/fido/conf/dcmitype.xsd +0 -53
  125. data/tools/fido/conf/dcterms.xsd +0 -383
  126. data/tools/fido/conf/fido-formats.xsd +0 -173
  127. data/tools/fido/conf/format_extension_template.xml +0 -105
  128. data/tools/fido/conf/format_extensions.xml +0 -484
  129. data/tools/fido/conf/formats-v90.xml +0 -48877
  130. data/tools/fido/conf/pronom-xml-v90.zip +0 -0
  131. data/tools/fido/conf/versions.xml +0 -8
  132. data/tools/fido/fido.bat +0 -4
  133. data/tools/fido/fido.py +0 -884
  134. data/tools/fido/fido.sh +0 -5
  135. data/tools/fido/package.py +0 -96
  136. data/tools/fido/prepare.py +0 -645
  137. data/tools/fido/pronomutils.py +0 -200
  138. data/tools/fido/toxml.py +0 -60
  139. data/tools/fido/update_signatures.py +0 -183
@@ -1,645 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- """Format Identification for Digital Objects."""
5
-
6
- from __future__ import print_function
7
-
8
- from argparse import ArgumentParser
9
- import hashlib
10
- import sys
11
- from xml.dom import minidom
12
- from xml.etree import ElementTree as ET
13
- import zipfile
14
-
15
- from six.moves import cStringIO
16
- from six.moves.urllib.request import urlopen
17
- from six.moves.urllib.parse import urlparse
18
-
19
- from .pronomutils import get_local_pronom_versions
20
-
21
-
22
# Character tables consulted by escape() and _escape_char() when a PRONOM
# byte sequence is rendered as a regular expression.
#
# Original author notes:
# \a\b\n\r\t\v
# MdR: took '<' and '>' out of _ordinary because they were converted to
#      the entities &lt; and &gt;.
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the
#      regex world. At this time no regex in any signature has a negate set;
#      this was done to be on the safe side.

# Characters that escape() copies to the output unchanged.
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
# Characters that _escape_char() prefixes with a backslash.
_special = '$()*+.?![]^\\{|}'
# Digit table used by _escape_char() to build \xNN escapes.
_hex = '0123456789abcdef'
28
-
29
-
30
class NS:
    """
    Build namespace-qualified tag names for ElementTree lookups.

    Example: MYNS = NS("{http://some/uri}"), then MYNS("tag1/tag2") or
    MYNS.tag1.
    """

    def __init__(self, uri):
        """Remember the namespace prefix `uri` (including the braces)."""
        self.uri = uri

    def __getattr__(self, tag):
        """Return `tag` prefixed with the namespace URI.

        Only invoked for names that are not real attributes of the instance.
        """
        return self.uri + tag

    def __call__(self, path):
        """Qualify every '/'-separated step of `path` with the namespace."""
        qualified = []
        for step in path.split("/"):
            # getattr (not plain concatenation) so real attributes such as
            # 'uri' resolve exactly as they do through attribute access.
            qualified.append(getattr(self, step))
        return "/".join(qualified)
48
-
49
-
50
# Namespace helpers: calling one of these with a tag path returns the path
# with every step qualified by the namespace URI (see NS.__call__).
XHTML = NS("{http://www.w3.org/1999/xhtml}")  # XHTML namespace
TNA = NS("{http://pronom.nationalarchives.gov.uk}")  # TNA namespace
52
-
53
-
54
def get_text_tna(element, tag, default=''):
    """Return the stripped text found at `tag` (a TNA-namespace tag or path).

    `default` is returned when the element is missing or has no text.
    """
    node = element.find(TNA(tag))
    text = None if node is None else node.text
    return default if text is None else text.strip()
60
-
61
-
62
def prettify(elem):
    """Serialize `elem` and return it as a pretty-printed XML string."""
    serialized = ET.tostring(elem, 'UTF-8')
    document = minidom.parseString(serialized)
    return document.toprettyxml(indent=" ")
67
-
68
-
69
- class FormatInfo:
70
- """Convert PRONOM formats into FIDO signatures."""
71
-
72
- def __init__(self, pronom_files, format_list=[]):
73
- """Instantiate class, take a list of PRONOM files and an optional list of formats."""
74
- self.info = {}
75
- self.formats = []
76
- self.pronom_files = pronom_files
77
- for f in format_list:
78
- self.add_format(f) # FIXME: add_format is undefined!
79
-
80
- def save(self, dst=sys.stdout):
81
- """Write the fido XML format definitions to @param dst."""
82
- tree = ET.ElementTree(ET.Element('formats', {
83
- 'version': '0.3',
84
- 'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
85
- 'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
86
- 'xmlns:dc': "http://purl.org/dc/elements/1.1/",
87
- 'xmlns:dcterms': "http://purl.org/dc/terms/"
88
- }))
89
- root = tree.getroot()
90
- for f in self.formats:
91
- # MdR: this skipped puids without sig, but we want them ALL
92
- # because puid might be matched on extension
93
- # if f.find('signature'):
94
- root.append(f)
95
- self.indent(root)
96
- with open(dst, 'wb') as file_:
97
- # print >>out, ET.tostring(root,encoding='utf-8')
98
- print(ET.tostring(root), file=file_)
99
-
100
- def indent(self, elem, level=0):
101
- """Indent output."""
102
- i = "\n" + level * " "
103
- if len(elem):
104
- if not elem.text or not elem.text.strip():
105
- elem.text = i + " "
106
- if not elem.tail or not elem.tail.strip():
107
- elem.tail = i
108
- for elem in elem:
109
- self.indent(elem, level + 1)
110
- if not elem.tail or not elem.tail.strip():
111
- elem.tail = i
112
- else:
113
- if level and (not elem.tail or not elem.tail.strip()):
114
- elem.tail = i
115
-
116
- def load_pronom_xml(self, puid_filter=None):
117
- """
118
- Load the pronom XML from self.pronom_files and convert it to fido XML.
119
-
120
- As a side-effect, set self.formats to a list of ElementTree.Element.
121
- If a @param puid is specified, only that one will be loaded.
122
- """
123
- formats = []
124
- # for p in self.pronom_files:
125
- # print p
126
- # print self.pronom_files
127
- # exit()
128
- try:
129
- zip = zipfile.ZipFile(self.pronom_files, 'r')
130
- for item in zip.infolist():
131
- # print item.filename
132
- try:
133
- stream = zip.open(item)
134
- # Work is done here!
135
- # if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
136
- format_ = self.parse_pronom_xml(stream, puid_filter)
137
- if format_ is not None:
138
- formats.append(format_)
139
- finally:
140
- stream.close()
141
- finally:
142
- try:
143
- zip.close()
144
- except Exception as e:
145
- print("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e), file=sys.stderr)
146
- sys.exit()
147
- # Replace the formatID with puids in has_priority_over
148
- if puid_filter is None:
149
- id_map = {}
150
- for element in formats:
151
- puid = element.find('puid').text
152
- # print "working on puid:",puid
153
- pronom_id = element.find('pronom_id').text
154
- id_map[pronom_id] = puid
155
- for element in formats:
156
- for rel in element.findall('has_priority_over'):
157
- rel.text = id_map[rel.text]
158
-
159
- self._sort_formats(formats)
160
- self.formats = formats
161
-
162
- def parse_pronom_xml(self, source, puid_filter=None):
163
- """
164
- Parse PRONOM XML and convert into FIDO XML.
165
-
166
- If a @param puid is specified, only that one will be loaded.
167
- @return ET.ElementTree Element representing it.
168
- """
169
- pronom_xml = ET.parse(source)
170
- pronom_root = pronom_xml.getroot()
171
- pronom_format = pronom_root.find(TNA('report_format_detail/FileFormat'))
172
- fido_format = ET.Element('format')
173
- # Get the base Format information
174
- for id in pronom_format.findall(TNA('FileFormatIdentifier')):
175
- type = get_text_tna(id, 'IdentifierType')
176
- if type == 'PUID':
177
- puid = get_text_tna(id, 'Identifier')
178
- ET.SubElement(fido_format, 'puid').text = puid
179
- if puid_filter and puid != puid_filter:
180
- return None
181
- # A bit clumsy. I want to have puid first, then mime, then container.
182
- for id in pronom_format.findall(TNA('FileFormatIdentifier')):
183
- type = get_text_tna(id, 'IdentifierType')
184
- if type == 'MIME':
185
- ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
186
- elif type == 'PUID':
187
- puid = get_text_tna(id, 'Identifier')
188
- if puid == 'x-fmt/263':
189
- ET.SubElement(fido_format, 'container').text = 'zip'
190
- elif puid == 'x-fmt/265':
191
- ET.SubElement(fido_format, 'container').text = 'tar'
192
- ET.SubElement(fido_format, 'name').text = get_text_tna(pronom_format, 'FormatName')
193
- ET.SubElement(fido_format, 'version').text = get_text_tna(pronom_format, 'FormatVersion')
194
- ET.SubElement(fido_format, 'alias').text = get_text_tna(pronom_format, 'FormatAliases')
195
- ET.SubElement(fido_format, 'pronom_id').text = get_text_tna(pronom_format, 'FormatID')
196
- # Get the extensions from the ExternalSignature
197
- for x in pronom_format.findall(TNA('ExternalSignature')):
198
- ET.SubElement(fido_format, 'extension').text = get_text_tna(x, 'Signature')
199
- for id in pronom_format.findall(TNA('FileFormatIdentifier')):
200
- type = get_text_tna(id, 'IdentifierType')
201
- if type == 'Apple Uniform Type Identifier':
202
- ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
203
- # Handle the relationships
204
- for x in pronom_format.findall(TNA('RelatedFormat')):
205
- rel = get_text_tna(x, 'RelationshipType')
206
- if rel == 'Has priority over':
207
- ET.SubElement(fido_format, 'has_priority_over').text = get_text_tna(x, 'RelatedFormatID')
208
- # Get the InternalSignature information
209
- for pronom_sig in pronom_format.findall(TNA('InternalSignature')):
210
- fido_sig = ET.SubElement(fido_format, 'signature')
211
- ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
212
- # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
213
- ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
214
- for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
215
- fido_pat = ET.SubElement(fido_sig, 'pattern')
216
- pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
217
- bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
218
- offset = get_text_tna(pronom_pat, 'Offset')
219
- max_offset = get_text_tna(pronom_pat, 'MaxOffset')
220
- if not max_offset:
221
- pass
222
- # print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
223
- regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
224
- # print "done puid", puid
225
- if regex == "__INCOMPATIBLE_SIG__":
226
- print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
227
- # remove the empty 'signature' nodes
228
- # now that the signature is not compatible and thus "regex" is empty
229
- remove = fido_format.findall('signature')
230
- for r in remove:
231
- fido_format.remove(r)
232
- continue
233
- ET.SubElement(fido_pat, 'position').text = pos
234
- ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
235
- ET.SubElement(fido_pat, 'regex').text = regex
236
- # Get the format details
237
- fido_details = ET.SubElement(fido_format, 'details')
238
- ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
239
- ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
240
- ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
241
- ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
242
- for x in pronom_format.findall(TNA('RelatedFormat')):
243
- rel = get_text_tna(x, 'RelationshipType')
244
- if rel == 'Is supertype of':
245
- ET.SubElement(fido_details, 'is_supertype_of').text = get_text_tna(x, 'RelatedFormatID')
246
- for x in pronom_format.findall(TNA('RelatedFormat')):
247
- rel = get_text_tna(x, 'RelationshipType')
248
- if rel == 'Is subtype of':
249
- ET.SubElement(fido_details, 'is_subtype_of').text = get_text_tna(x, 'RelatedFormatID')
250
- ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
251
- # References
252
- for x in pronom_format.findall(TNA("Document")):
253
- r = ET.SubElement(fido_details, 'reference')
254
- ET.SubElement(r, 'dc:title').text = get_text_tna(x, 'TitleText')
255
- ET.SubElement(r, 'dc:creator').text = get_text_tna(x, 'Author/AuthorCompoundName')
256
- ET.SubElement(r, 'dc:publisher').text = get_text_tna(x, 'Publisher/PublisherCompoundName')
257
- ET.SubElement(r, 'dcterms:available').text = get_text_tna(x, 'PublicationDate')
258
- for id in x.findall(TNA('DocumentIdentifier')):
259
- type = get_text_tna(id, 'IdentifierType')
260
- if type == 'URL':
261
- ET.SubElement(r, 'dc:identifier').text = "http://" + get_text_tna(id, 'Identifier')
262
- else:
263
- ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
264
- ET.SubElement(r, 'dc:description').text = get_text_tna(x, 'DocumentNote')
265
- ET.SubElement(r, 'dc:type').text = get_text_tna(x, 'DocumentType')
266
- ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription') + " " + get_text_tna(x, 'AvailabilityNote')
267
- ET.SubElement(r, 'dc:rights').text = get_text_tna(x, 'DocumentIPR')
268
- # Examples
269
- for x in pronom_format.findall(TNA("ReferenceFile")):
270
- rf = ET.SubElement(fido_details, 'example_file')
271
- ET.SubElement(rf, 'dc:title').text = get_text_tna(x, 'ReferenceFileName')
272
- ET.SubElement(rf, 'dc:description').text = get_text_tna(x, 'ReferenceFileDescription')
273
- checksum = ""
274
- for id in x.findall(TNA('ReferenceFileIdentifier')):
275
- type = get_text_tna(id, 'IdentifierType')
276
- if type == 'URL':
277
- # Starting with PRONOM 89, some URLs contain http://
278
- # and others do not.
279
- url = get_text_tna(id, 'Identifier')
280
- if not urlparse(url).scheme:
281
- url = "http://" + url
282
- ET.SubElement(rf, 'dc:identifier').text = url
283
- # And calculate the checksum of this resource:
284
- m = hashlib.md5()
285
- sock = urlopen(url)
286
- m.update(sock.read())
287
- sock.close()
288
- checksum = m.hexdigest()
289
- else:
290
- ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
291
- ET.SubElement(rf, 'dcterms:license').text = ""
292
- ET.SubElement(rf, 'dc:rights').text = get_text_tna(x, 'ReferenceFileIPR')
293
- checksumElement = ET.SubElement(rf, 'checksum')
294
- checksumElement.text = checksum
295
- checksumElement.attrib['type'] = "md5"
296
- # Record Metadata
297
- md = ET.SubElement(fido_details, 'record_metadata')
298
- ET.SubElement(md, 'status').text = 'unknown'
299
- ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
300
- ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
301
- ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
302
- ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
303
- return fido_format
304
-
305
- # FIXME: I don't think that this quite works yet!
306
- def _sort_formats(self, formatlist):
307
- """Sort the format list based on their priority relationships so higher priority formats appear earlier in the list."""
308
- def compare_formats(f1, f2):
309
- f1ID = f1.find('puid').text
310
- f2ID = f2.find('puid').text
311
- for worse in f1.findall('has_priority_over'):
312
- if worse.text == f2ID:
313
- return - 1
314
- for worse in f2.findall('has_priority_over'):
315
- if worse.text == f1ID:
316
- return 1
317
- if f1ID < f2ID:
318
- return - 1
319
- elif f1ID == f2ID:
320
- return 0
321
- else:
322
- return 1
323
- return sorted(formatlist, cmp=compare_formats)
324
-
325
-
326
def fido_position(pronom_position):
    """Map a verbose PRONOM position name to FIDO's BOF/EOF/VAR/IFB code.

    Unknown names are reported on stderr and treated as 'VAR'.
    """
    short_codes = {
        'Absolute from BOF': 'BOF',
        'Absolute from EOF': 'EOF',
        'Variable': 'VAR',
        'Indirect From BOF': 'IFB',
    }
    if pronom_position in short_codes:
        return short_codes[pronom_position]
    # to make sure FIDO does not crash (IFB aftermath)
    sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
    return 'VAR'
339
-
340
-
341
- def _convert_err_msg(msg, c, i, chars):
342
- return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
343
-
344
-
345
def doByte(chars, i, littleendian):
    """
    Convert the two hex digits chars[i] and chars[i+1] into a byte.

    @param littleendian when true the first digit is the high nibble,
           otherwise the nibbles are swapped.
    @return a tuple (escaped byte, 2), the second item being the number of
            input characters consumed.
    @raise Exception with a 'bad byte sequence' message when either digit is
           not hexadecimal or the input ends before two digits are available
           (previously a bare IndexError).
    """
    digits = '0123456789ABCDEF'
    pair = chars[i:i + 2]
    if len(pair) < 2:
        raise Exception(_convert_err_msg('bad byte sequence', pair, i, chars))
    c1 = digits.find(pair[0].upper())
    c2 = digits.find(pair[1].upper())
    if c1 < 0 or c2 < 0:
        raise Exception(_convert_err_msg('bad byte sequence', pair, i, chars))
    if littleendian:
        val = chr(16 * c1 + c2)
    else:
        val = chr(c1 + 16 * c2)
    return (escape(val), 2)
360
-
361
-
362
def _escape_char(c):
    """Return the regex-safe escape sequence for the single character `c`."""
    if c in '\n':
        return '\\n'
    if c == '\r':
        return '\\r'
    if c in _special:
        # Regex metacharacter: a backslash makes it literal.
        return '\\' + c
    # Everything else becomes a two-digit \xNN escape.
    code = ord(c)
    return '\\x' + _hex[code // 16] + _hex[code % 16]
372
-
373
-
374
def escape(string):
    """Escape characters in pattern that are non-printable, non-ascii, or special for regexes.

    Characters listed in _ordinary are copied as-is; all others go through
    _escape_char().
    """
    pieces = []
    for c in string:
        pieces.append(c if c in _ordinary else _escape_char(c))
    return ''.join(pieces)
377
-
378
-
379
def calculate_repetition(char, pos, offset, maxoffset):
    """Build the `char{offset,maxoffset}` repetition for a signature position.

    Repetition counts above 65535 are split into a chain of repetitions by
    recursion, because larger counts are rejected by the regex engine.
    See: https://bugs.python.org/issue13169.

    @param char the regex atom to repeat (callers pass '.').
    @param pos 'BOF', 'EOF' or 'IFB'; any other value yields ''.
    @param offset minimum count as a decimal string.
    @param maxoffset maximum count as a decimal string, or None for none.
    @return the regex fragment (possibly empty).

    Fix: the remainders passed to the recursive call used to default to the
    integer 0, so an offset above 65535 without a max offset crashed on
    `',' + 0`. They now default to '0' and None, matching the types used by
    the rest of the function.
    """
    limit = 65535
    recurse = False
    offsetremain = '0'
    maxoffsetremain = None

    if offset is not None and int(offset) > limit:
        offsetremain = str(int(offset) - limit)
        offset = str(limit)
        recurse = True
    if maxoffset is not None and int(maxoffset) > limit:
        maxoffsetremain = str(int(maxoffset) - limit)
        maxoffset = str(limit)
        recurse = True

    parts = []
    if pos == "BOF" or pos == "EOF":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + str(maxoffset))
            parts.append('}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + str(maxoffset) + '}')

    if pos == "IFB":
        if offset != '0':
            parts.append(char + '{' + str(offset))
            if maxoffset is not None:
                parts.append(',' + str(maxoffset))
            parts.append('}')
            # NOTE(review): nesting reconstructed from a listing without
            # indentation; the extra ',}' after the closing brace is kept
            # from the original - confirm it is intended.
            if maxoffset is not None:
                parts.append(',}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + str(maxoffset) + '}')

    if recurse:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offsetremain, maxoffsetremain))

    return ''.join(parts)
422
-
423
-
424
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
    """
    Convert a PRONOM byte sequence into a regular expression.

    @param chars the PRONOM byte sequence.
    @param endianness only checked for the substring 'Big', which selects
           the nibble order handed to doByte().
    @param pos 'BOF', 'EOF', 'VAR' or 'IFB'; anchors and offset repetitions
           are added for BOF/IFB (front) and EOF (back).
    @param offset, maxoffset decimal strings; '' means not given.
    @return the regular expression, or "__INCOMPATIBLE_SIG__" when a
            bracket expression is not of the form [xx:yy].
    @raise Exception on characters that are illegal in the current state.
    """
    littleendian = 'Big' not in endianness
    if len(offset) == 0:
        offset = '0'
    if len(maxoffset) == 0 or maxoffset == '0':
        maxoffset = None
    # make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
    global buf
    buf = cStringIO()
    emit = buf.write
    emit("(?s)")  # If a regex starts with (?s), it is equivalent to DOTALL.
    i = 0
    state = 'start'
    if 'BOF' in pos:
        emit('\\A')  # start of regex
        emit(calculate_repetition('.', pos, offset, maxoffset))

    if 'IFB' in pos:
        emit('\\A')
        emit(calculate_repetition('.', pos, offset, maxoffset))

    # State machine: 'start' only inspects chars[i] to pick the next state;
    # every other state consumes input and returns to 'start'.
    while i != len(chars):
        current = chars[i]
        if state == 'start':
            if current.isalnum():
                state = 'bytes'
            elif current == '[' and chars[i + 1] == '!':
                state = 'non-match'
            elif current == '[':
                state = 'bracket'
            elif current == '{':
                state = 'curly'
            elif current == '(':
                state = 'paren'
            elif current in '*+?':
                state = 'specials'
            else:
                raise Exception(_convert_err_msg('Illegal character in start', current, i, chars))
        elif state == 'bytes':
            (byt, inc) = doByte(chars, i, littleendian)
            emit(byt)
            i += inc
            state = 'start'
        elif state == 'non-match':
            emit('(!')
            i += 2  # skip the '[!'
            while True:
                if chars[i].isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    emit(byt)
                    i += inc
                elif chars[i] == ']':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in non-match', chars[i], i, chars))
            emit(')')
            i += 1
            state = 'start'

        elif state == 'bracket':
            # [xx:yy] becomes the character range [\xXX-\xYY]
            try:
                emit('[')
                i += 1
                (byt, inc) = doByte(chars, i, littleendian)
                emit(byt)
                i += inc
                if chars[i] != ':':
                    return "__INCOMPATIBLE_SIG__"
                emit('-')
                i += 1
                (byt, inc) = doByte(chars, i, littleendian)
                emit(byt)
                i += inc
                if chars[i] != ']':
                    return "__INCOMPATIBLE_SIG__"
                emit(']')
                i += 1
            except Exception:
                print(_convert_err_msg('Illegal character in bracket', chars[i], i, chars))
                raise
            if i < len(chars) and chars[i] == '{':
                state = 'curly-after-bracket'
            else:
                state = 'start'
        elif state == 'paren':
            # (aa|bb|[cc:dd]) becomes a non-capturing alternation
            emit('(?:')
            i += 1
            while True:
                if chars[i].isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    emit(byt)
                    i += inc
                elif chars[i] == '|':
                    emit('|')
                    i += 1
                elif chars[i] == ')':
                    break
                # START fix FIDO-20
                elif chars[i] == '[':
                    emit('[')
                    i += 1
                    (byt, inc) = doByte(chars, i, littleendian)
                    emit(byt)
                    i += inc
                    if chars[i] != ':':
                        return "__INCOMPATIBLE_SIG__"
                    emit('-')
                    i += 1
                    (byt, inc) = doByte(chars, i, littleendian)
                    emit(byt)
                    i += inc

                    if chars[i] != ']':
                        return "__INCOMPATIBLE_SIG__"
                    emit(']')
                    i += 1
                else:
                    raise Exception(_convert_err_msg("Current state = '{0}' : Illegal character in paren".format(state), chars[i], i, chars))
            emit(')')
            i += 1
            state = 'start'
            # END fix FIDO-20
        elif state in ['curly', 'curly-after-bracket']:
            # {nnnn} or {nnn-nnn} or {nnn-*}
            # {nnn} or {nnn,nnn} or {nnn,}
            # when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
            # The above, while sensible, appears to be incorrect. A '.' is always needed.
            # for droid equiv behavior
            emit('.')
            emit('{')
            i += 1  # skip the opening '{'
            while True:
                if chars[i].isalnum():
                    emit(chars[i])
                    i += 1
                elif chars[i] == '-':
                    emit(',')
                    i += 1
                elif chars[i] == '*':  # skip the *
                    i += 1
                elif chars[i] == '}':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in curly', chars[i], i, chars))
            emit('}')
            i += 1  # skip the closing '}'
            state = 'start'
        elif state == 'specials':
            if current == '*':
                emit('.*')
                i += 1
            elif current == '+':
                emit('.+')
                i += 1
            elif current == '?':
                # PRONOM writes a single wildcard byte as '??'
                if chars[i + 1] != '?':
                    raise Exception(_convert_err_msg('Illegal character after ?', chars[i + 1], i + 1, chars))
                emit('.?')
                i += 2
            state = 'start'
        else:
            raise Exception('Illegal state {0}'.format(state))

    if 'EOF' in pos:
        emit(calculate_repetition('.', pos, offset, maxoffset))
        emit('\\Z')

    val = buf.getvalue()
    buf.close()
    return val
613
-
614
-
615
def run(input=None, output=None, puid=None):
    """Convert PRONOM formats into FIDO signatures.

    @param input zip archive of PRONOM XML; defaults to the archive reported
           by get_local_pronom_versions().
    @param output destination signature file; defaults to the signature
           file reported by get_local_pronom_versions().
    @param puid when given, only that PUID is converted.
    """
    versions = get_local_pronom_versions()
    source = versions.get_zip_file() if input is None else input
    target = versions.get_signature_file() if output is None else output

    converter = FormatInfo(source)
    converter.load_pronom_xml(puid)
    converter.save(target)
    summary = 'Converted {0} PRONOM formats to FIDO signatures'.format(len(converter.formats))
    print(summary, file=sys.stderr)
628
-
629
-
630
def main(args=None):
    """Main CLI entrypoint.

    @param args list of command-line arguments; defaults to sys.argv[1:].
           Recognised options are -input, -output and -puid, which are
           passed straight through to run().
    """
    if args is None:
        args = sys.argv[1:]

    parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
    parser.add_argument('-input', default=None, help='Input file, a Zip containing PRONOM XML files')
    # Help text typo fixed ('Ouptut' -> 'Output').
    parser.add_argument('-output', default=None, help='Output file')
    parser.add_argument('-puid', default=None, help='A particular PUID record to extract')
    args = parser.parse_args(args)

    run(input=args.input, output=args.output, puid=args.puid)
642
-
643
-
644
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()