libis-format 0.9.32 → 0.9.33

Sign up to get free protection for your applications and to get access to all the features.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/data/types.yml +30 -16
  3. data/lib/libis/format/config.rb +7 -18
  4. data/lib/libis/format/converter/image_converter.rb +6 -0
  5. data/lib/libis/format/droid.rb +82 -25
  6. data/lib/libis/format/extension_identification.rb +55 -0
  7. data/lib/libis/format/fido.rb +57 -72
  8. data/lib/libis/format/file_tool.rb +76 -0
  9. data/lib/libis/format/identification_tool.rb +174 -0
  10. data/lib/libis/format/identifier.rb +129 -117
  11. data/lib/libis/format/type_database.rb +36 -5
  12. data/lib/libis/format/version.rb +1 -1
  13. data/lib/libis/format.rb +3 -0
  14. data/libis-format.gemspec +2 -1
  15. data/spec/converter_spec.rb +6 -4
  16. data/spec/identifier_spec.rb +125 -34
  17. metadata +21 -126
  18. data/tools/droid/DROID_SignatureFile_V90.xml +0 -40182
  19. data/tools/droid/container-signature-20170330.xml +0 -3584
  20. data/tools/droid/droid-command-line-6.3.jar +0 -0
  21. data/tools/droid/droid.bat +0 -152
  22. data/tools/droid/droid.sh +0 -152
  23. data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
  24. data/tools/droid/lib/activation-1.1.jar +0 -0
  25. data/tools/droid/lib/aopalliance-1.0.jar +0 -0
  26. data/tools/droid/lib/asm-2.2.3.jar +0 -0
  27. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  28. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  29. data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
  30. data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
  31. data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
  32. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  33. data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
  34. data/tools/droid/lib/classmate-1.0.0.jar +0 -0
  35. data/tools/droid/lib/commons-cli-1.2.jar +0 -0
  36. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  37. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  38. data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
  39. data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
  40. data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
  41. data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
  42. data/tools/droid/lib/commons-io-2.4.jar +0 -0
  43. data/tools/droid/lib/commons-lang-2.6.jar +0 -0
  44. data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
  45. data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
  46. data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
  47. data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
  48. data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
  49. data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
  50. data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
  51. data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
  52. data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
  53. data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
  54. data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
  55. data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
  56. data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
  57. data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
  58. data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
  59. data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
  60. data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
  61. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  62. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  63. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  64. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  65. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  66. data/tools/droid/lib/droid-help-6.3.jar +0 -0
  67. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  68. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  69. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  70. data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
  71. data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
  72. data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
  73. data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
  74. data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
  75. data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
  76. data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
  77. data/tools/droid/lib/itext-2.0.8.jar +0 -0
  78. data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
  79. data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
  80. data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
  81. data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
  82. data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
  83. data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
  84. data/tools/droid/lib/jta-1.1.jar +0 -0
  85. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  86. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  87. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  88. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  89. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  90. data/tools/droid/lib/log4j-1.2.13.jar +0 -0
  91. data/tools/droid/lib/neethi-2.0.4.jar +0 -0
  92. data/tools/droid/lib/opencsv-2.3.jar +0 -0
  93. data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
  94. data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
  95. data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
  96. data/tools/droid/lib/poi-3.13.jar +0 -0
  97. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  98. data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
  99. data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
  100. data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
  101. data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
  102. data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
  103. data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
  104. data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
  105. data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
  106. data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
  107. data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
  108. data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
  109. data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
  110. data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
  111. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  112. data/tools/droid/lib/truezip-6.8.4.jar +0 -0
  113. data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
  114. data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
  115. data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
  116. data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
  117. data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
  118. data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
  119. data/tools/droid/lib/xz-1.0.jar +0 -0
  120. data/tools/fido/__init__.py +0 -50
  121. data/tools/fido/conf/DROID_SignatureFile-v90.xml +0 -2
  122. data/tools/fido/conf/container-signature-20170330.xml +0 -3584
  123. data/tools/fido/conf/dc.xsd +0 -119
  124. data/tools/fido/conf/dcmitype.xsd +0 -53
  125. data/tools/fido/conf/dcterms.xsd +0 -383
  126. data/tools/fido/conf/fido-formats.xsd +0 -173
  127. data/tools/fido/conf/format_extension_template.xml +0 -105
  128. data/tools/fido/conf/format_extensions.xml +0 -484
  129. data/tools/fido/conf/formats-v90.xml +0 -48877
  130. data/tools/fido/conf/pronom-xml-v90.zip +0 -0
  131. data/tools/fido/conf/versions.xml +0 -8
  132. data/tools/fido/fido.bat +0 -4
  133. data/tools/fido/fido.py +0 -884
  134. data/tools/fido/fido.sh +0 -5
  135. data/tools/fido/package.py +0 -96
  136. data/tools/fido/prepare.py +0 -645
  137. data/tools/fido/pronomutils.py +0 -200
  138. data/tools/fido/toxml.py +0 -60
  139. data/tools/fido/update_signatures.py +0 -183
@@ -1,645 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- """Format Identification for Digital Objects."""
5
-
6
- from __future__ import print_function
7
-
8
- from argparse import ArgumentParser
9
- import hashlib
10
- import sys
11
- from xml.dom import minidom
12
- from xml.etree import ElementTree as ET
13
- import zipfile
14
-
15
- from six.moves import cStringIO
16
- from six.moves.urllib.request import urlopen
17
- from six.moves.urllib.parse import urlparse
18
-
19
- from .pronomutils import get_local_pronom_versions
20
-
21
-
22
# \a\b\n\r\t\v
# Characters that escape() copies into a generated regex unchanged.
# MdR: '<' and '>' were taken out of this set because they were being
# converted to the entities &lt; and &gt;.
# MdR: '!' was moved from this set to _special because it means "NOT" in the
# regex world. No signature negated anything at the time; this is purely to
# be on the safe side.
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
# Characters that _escape_char() prefixes with a backslash.
_special = '$()*+.?![]^\\{|}'
# Lower-case hexadecimal digits, indexed by nibble value in _escape_char().
_hex = '0123456789abcdef'
28
-
29
-
30
class NS:
    """
    Prefix ElementTree tag names with an XML namespace.

    Create one with MYNS = NS("{http://some/uri}"); afterwards MYNS.tag gives
    the qualified tag and MYNS("tag1/tag2") qualifies every step of a path.
    """

    def __init__(self, uri):
        """Remember `uri`, the text that is put in front of every tag."""
        self.uri = uri

    def __getattr__(self, tag):
        """Return `tag` prefixed with the namespace URI.

        Only reached for names that are not real attributes, so `uri`
        itself still resolves to the stored prefix.
        """
        return self.uri + tag

    def __call__(self, path):
        """Qualify each '/'-separated step of `path` and re-join them."""
        qualified = [getattr(self, step) for step in path.split("/")]
        return "/".join(qualified)
48
-
49
-
50
# Namespace helpers shared by the parsing code in this module.
XHTML = NS("{http://www.w3.org/1999/xhtml}")  # XHTML namespace
TNA = NS("{http://pronom.nationalarchives.gov.uk}")  # TNA (PRONOM) namespace
52
-
53
-
54
def get_text_tna(element, tag, default=''):
    """Return the stripped text of `tag` below `element`, looked up in the TNA namespace.

    `tag` may be a single tag or a '/'-separated path. `default` is returned
    when the node does not exist or carries no text.
    """
    node = element.find(TNA(tag))
    text = None if node is None else node.text
    return default if text is None else text.strip()
60
-
61
-
62
def prettify(elem):
    """Serialize `elem` as UTF-8 and return minidom's pretty-printed rendering of it."""
    return minidom.parseString(ET.tostring(elem, 'UTF-8')).toprettyxml(indent=" ")
67
-
68
-
69
class FormatInfo:
    """Convert PRONOM formats into FIDO signatures."""

    def __init__(self, pronom_files, format_list=None):
        """Instantiate class.

        @param pronom_files: what load_pronom_xml() hands to zipfile.ZipFile,
            i.e. the Zip archive holding the PRONOM XML reports.
        @param format_list: optional iterable of already converted format
            elements used to seed self.formats.
        """
        self.info = {}
        # The original default was a shared mutable list and every entry was
        # passed to an `add_format` method that does not exist; seed the list
        # directly instead.
        self.formats = list(format_list) if format_list else []
        self.pronom_files = pronom_files

    def save(self, dst=sys.stdout):
        """Write the fido XML format definitions to @param dst.

        `dst` is either a file name or an object with a `write` method
        (binary or text).
        """
        root = ET.Element('formats', {
            'version': '0.3',
            'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
            'xmlns:dc': "http://purl.org/dc/elements/1.1/",
            'xmlns:dcterms': "http://purl.org/dc/terms/"
        })
        # MdR: formats without a signature are written too, because a puid
        # might still be matched on extension.
        for f in self.formats:
            root.append(f)
        self.indent(root)
        payload = ET.tostring(root)
        if not isinstance(payload, bytes):
            payload = payload.encode('ascii')
        payload += b'\n'
        if hasattr(dst, 'write'):
            # Already a stream (e.g. the sys.stdout default): open() on it
            # would raise. Text streams reject bytes, so fall back to text.
            try:
                dst.write(payload)
            except TypeError:
                dst.write(payload.decode('ascii'))
        else:
            with open(dst, 'wb') as file_:
                file_.write(payload)

    def indent(self, elem, level=0):
        """Indent output by rewriting blank text/tail whitespace in place."""
        def blank(text):
            return not text or not text.strip()

        pad = "\n" + level * " "
        children = list(elem)
        if children:
            if blank(elem.text):
                elem.text = pad + " "
            if blank(elem.tail):
                elem.tail = pad
            for child in children:
                self.indent(child, level + 1)
            # The last child closes this element, so it dedents to our level.
            if blank(children[-1].tail):
                children[-1].tail = pad
        elif level and blank(elem.tail):
            elem.tail = pad

    def load_pronom_xml(self, puid_filter=None):
        """
        Load the pronom XML from self.pronom_files and convert it to fido XML.

        As a side-effect, set self.formats to a list of ElementTree.Element.
        If a @param puid_filter is specified, only that one will be loaded.
        Exits the process with status 1 when the archive cannot be opened.
        """
        from contextlib import closing

        formats = []
        try:
            archive = zipfile.ZipFile(self.pronom_files, 'r')
        except (EnvironmentError, zipfile.BadZipfile, zipfile.LargeZipFile) as e:
            # Previously this was reported via an UnboundLocalError raised in
            # the cleanup code, and the process exited with status 0.
            print("An error occurred loading '{0}' (exception: {1})".format(self.pronom_files, e), file=sys.stderr)
            sys.exit(1)
        with closing(archive):
            for item in archive.infolist():
                with closing(archive.open(item)) as stream:
                    format_ = self.parse_pronom_xml(stream, puid_filter)
                if format_ is not None:
                    formats.append(format_)
        # Replace the formatID with puids in has_priority_over
        if puid_filter is None:
            id_map = {}
            for element in formats:
                id_map[element.find('pronom_id').text] = element.find('puid').text
            for element in formats:
                for rel in element.findall('has_priority_over'):
                    rel.text = id_map[rel.text]
        # NOTE(review): the original called self._sort_formats(formats) here
        # and threw the result away, so the stored order has always been the
        # archive order. That order is kept; decide separately whether the
        # (admittedly unfinished) sort should actually be applied.
        self.formats = formats

    def parse_pronom_xml(self, source, puid_filter=None):
        """
        Parse PRONOM XML and convert into FIDO XML.

        If a @param puid_filter is specified, only that one will be loaded.
        Reference-file URLs are downloaded to compute their MD5 checksum, so
        this method performs network access.
        @return ET.ElementTree Element representing it, or None when the
            record's PUID does not match puid_filter.
        """
        from contextlib import closing

        def add(parent, tag, text):
            ET.SubElement(parent, tag).text = text

        def with_scheme(url):
            # Starting with PRONOM 89, some URLs contain http:// and others
            # do not.
            return url if urlparse(url).scheme else "http://" + url

        pronom_root = ET.parse(source).getroot()
        pronom_format = pronom_root.find(TNA('report_format_detail/FileFormat'))
        fido_format = ET.Element('format')
        identifiers = pronom_format.findall(TNA('FileFormatIdentifier'))
        related = pronom_format.findall(TNA('RelatedFormat'))

        # Get the base Format information
        puid = None  # stays None when the record carries no PUID identifier
        for ident in identifiers:
            if get_text_tna(ident, 'IdentifierType') == 'PUID':
                puid = get_text_tna(ident, 'Identifier')
                add(fido_format, 'puid', puid)
                if puid_filter and puid != puid_filter:
                    return None
        # A bit clumsy. I want to have puid first, then mime, then container.
        containers = {'x-fmt/263': 'zip', 'x-fmt/265': 'tar'}
        for ident in identifiers:
            kind = get_text_tna(ident, 'IdentifierType')
            if kind == 'MIME':
                add(fido_format, 'mime', get_text_tna(ident, 'Identifier'))
            elif kind == 'PUID':
                container = containers.get(get_text_tna(ident, 'Identifier'))
                if container is not None:
                    add(fido_format, 'container', container)
        add(fido_format, 'name', get_text_tna(pronom_format, 'FormatName'))
        add(fido_format, 'version', get_text_tna(pronom_format, 'FormatVersion'))
        add(fido_format, 'alias', get_text_tna(pronom_format, 'FormatAliases'))
        add(fido_format, 'pronom_id', get_text_tna(pronom_format, 'FormatID'))
        # Get the extensions from the ExternalSignature
        for ext in pronom_format.findall(TNA('ExternalSignature')):
            add(fido_format, 'extension', get_text_tna(ext, 'Signature'))
        for ident in identifiers:
            if get_text_tna(ident, 'IdentifierType') == 'Apple Uniform Type Identifier':
                add(fido_format, 'apple_uid', get_text_tna(ident, 'Identifier'))
        # Handle the relationships
        for rel in related:
            if get_text_tna(rel, 'RelationshipType') == 'Has priority over':
                add(fido_format, 'has_priority_over', get_text_tna(rel, 'RelatedFormatID'))
        # Get the InternalSignature information
        for pronom_sig in pronom_format.findall(TNA('InternalSignature')):
            fido_sig = ET.SubElement(fido_format, 'signature')
            add(fido_sig, 'name', get_text_tna(pronom_sig, 'SignatureName'))
            # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
            add(fido_sig, 'note', get_text_tna(pronom_sig, 'SignatureNote'))
            for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
                fido_pat = ET.SubElement(fido_sig, 'pattern')
                pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
                sequence = get_text_tna(pronom_pat, 'ByteSequenceValue')
                offset = get_text_tna(pronom_pat, 'Offset')
                max_offset = get_text_tna(pronom_pat, 'MaxOffset')
                regex = convert_to_regex(sequence, 'Little', pos, offset, max_offset)
                if regex == "__INCOMPATIBLE_SIG__":
                    print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
                    # remove the empty 'signature' nodes
                    # now that the signature is not compatible and thus "regex" is empty
                    for stale in fido_format.findall('signature'):
                        fido_format.remove(stale)
                    continue
                add(fido_pat, 'position', pos)
                add(fido_pat, 'pronom_pattern', sequence)
                add(fido_pat, 'regex', regex)
        # Get the format details
        fido_details = ET.SubElement(fido_format, 'details')
        add(fido_details, 'dc:description', get_text_tna(pronom_format, 'FormatDescription'))
        add(fido_details, 'dcterms:available', get_text_tna(pronom_format, 'ReleaseDate'))
        add(fido_details, 'dc:creator', get_text_tna(pronom_format, 'Developers/DeveloperCompoundName'))
        add(fido_details, 'dcterms:publisher', get_text_tna(pronom_format, 'Developers/OrganisationName'))
        # All supertypes first, then all subtypes, as in the original output.
        for relationship, tag in (('Is supertype of', 'is_supertype_of'), ('Is subtype of', 'is_subtype_of')):
            for rel in related:
                if get_text_tna(rel, 'RelationshipType') == relationship:
                    add(fido_details, tag, get_text_tna(rel, 'RelatedFormatID'))
        add(fido_details, 'content_type', get_text_tna(pronom_format, 'FormatTypes'))
        # References
        for doc in pronom_format.findall(TNA("Document")):
            ref = ET.SubElement(fido_details, 'reference')
            add(ref, 'dc:title', get_text_tna(doc, 'TitleText'))
            add(ref, 'dc:creator', get_text_tna(doc, 'Author/AuthorCompoundName'))
            add(ref, 'dc:publisher', get_text_tna(doc, 'Publisher/PublisherCompoundName'))
            add(ref, 'dcterms:available', get_text_tna(doc, 'PublicationDate'))
            for ident in doc.findall(TNA('DocumentIdentifier')):
                kind = get_text_tna(ident, 'IdentifierType')
                value = get_text_tna(ident, 'Identifier')
                if kind == 'URL':
                    # Same scheme check as for reference files below, so an
                    # identifier that already has one is not double-prefixed.
                    add(ref, 'dc:identifier', with_scheme(value))
                else:
                    add(ref, 'dc:identifier', kind + ":" + value)
            add(ref, 'dc:description', get_text_tna(doc, 'DocumentNote'))
            add(ref, 'dc:type', get_text_tna(doc, 'DocumentType'))
            add(ref, 'dcterms:license', get_text_tna(doc, 'AvailabilityDescription') + " " + get_text_tna(doc, 'AvailabilityNote'))
            add(ref, 'dc:rights', get_text_tna(doc, 'DocumentIPR'))
        # Examples
        for example in pronom_format.findall(TNA("ReferenceFile")):
            rf = ET.SubElement(fido_details, 'example_file')
            add(rf, 'dc:title', get_text_tna(example, 'ReferenceFileName'))
            add(rf, 'dc:description', get_text_tna(example, 'ReferenceFileDescription'))
            checksum = ""
            for ident in example.findall(TNA('ReferenceFileIdentifier')):
                kind = get_text_tna(ident, 'IdentifierType')
                value = get_text_tna(ident, 'Identifier')
                if kind == 'URL':
                    url = with_scheme(value)
                    add(rf, 'dc:identifier', url)
                    # And calculate the checksum of this resource; closing()
                    # releases the connection even when read() fails.
                    with closing(urlopen(url)) as sock:
                        checksum = hashlib.md5(sock.read()).hexdigest()
                else:
                    add(rf, 'dc:identifier', kind + ":" + value)
            add(rf, 'dcterms:license', "")
            add(rf, 'dc:rights', get_text_tna(example, 'ReferenceFileIPR'))
            checksum_element = ET.SubElement(rf, 'checksum')
            checksum_element.text = checksum
            checksum_element.attrib['type'] = "md5"
        # Record Metadata
        md = ET.SubElement(fido_details, 'record_metadata')
        add(md, 'status', 'unknown')
        add(md, 'dc:creator', get_text_tna(pronom_format, 'ProvenanceName'))
        add(md, 'dcterms:created', get_text_tna(pronom_format, 'ProvenanceSourceDate'))
        add(md, 'dcterms:modified', get_text_tna(pronom_format, 'LastUpdatedDate'))
        add(md, 'dc:description', get_text_tna(pronom_format, 'ProvenanceDescription'))
        return fido_format

    # FIXME: I don't think that this quite works yet!
    def _sort_formats(self, formatlist):
        """Return a new list sorted on priority relationships so higher priority formats appear earlier.

        Formats without a direct priority relationship fall back to
        comparing their puid strings. The input list is not modified.
        """
        from functools import cmp_to_key

        def compare_formats(f1, f2):
            f1_id = f1.find('puid').text
            f2_id = f2.find('puid').text
            if any(worse.text == f2_id for worse in f1.findall('has_priority_over')):
                return -1
            if any(worse.text == f1_id for worse in f2.findall('has_priority_over')):
                return 1
            if f1_id < f2_id:
                return -1
            if f1_id == f2_id:
                return 0
            return 1

        # sorted(cmp=...) only exists on Python 2; cmp_to_key works on both.
        return sorted(formatlist, key=cmp_to_key(compare_formats))
324
-
325
-
326
def fido_position(pronom_position):
    """Map a verbose PRONOM position name onto FIDO's BOF/EOF/VAR/IFB code.

    Unrecognised names are reported on stderr and treated as 'VAR'.
    """
    short_codes = {
        'Absolute from BOF': 'BOF',
        'Absolute from EOF': 'EOF',
        'Variable': 'VAR',
        'Indirect From BOF': 'IFB',
    }
    if pronom_position in short_codes:
        return short_codes[pronom_position]
    # to make sure FIDO does not crash (IFB aftermath)
    sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
    return 'VAR'
339
-
340
-
341
def _convert_err_msg(msg, c, i, chars):
    """Build the diagnostic text used for signature conversion errors.

    @param msg: short description of the problem.
    @param c: the offending character(s).
    @param i: index of the problem in `chars`; also positions the caret.
    @param chars: the complete PRONOM byte sequence being converted.
    """
    # convert_to_regex() publishes its output buffer as the module global
    # `buf` for this message. It does not exist before the first conversion
    # and is closed after one, so read it defensively instead of letting the
    # diagnostic itself fail with NameError/ValueError.
    out = globals().get('buf')
    try:
        partial = out.getvalue() if out is not None else ''
    except ValueError:
        partial = ''
    return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', partial)
343
-
344
-
345
def doByte(chars, i, littleendian):
    """
    Convert two chars[i] and chars[i+1] into a byte.

    With `littleendian` true chars[i] is the high nibble, otherwise the low
    one.

    @return a tuple (byte, 2) where byte is the regex-escaped character.
    @raise Exception when the two characters are not hexadecimal digits or
        the sequence ends before the pair is complete.
    """
    digits = '0123456789ABCDEF'
    pair = chars[i:i + 2]
    if len(pair) == 2:
        c1 = digits.find(pair[0].upper())
        c2 = digits.find(pair[1].upper())
    else:
        # Truncated pair: report it like any other bad byte instead of
        # letting chars[i + 1] raise a bare IndexError.
        c1 = c2 = -1
    if c1 < 0 or c2 < 0:
        raise Exception(_convert_err_msg('bad byte sequence', pair, i, chars))
    if littleendian:
        val = chr(16 * c1 + c2)
    else:
        val = chr(c1 + 16 * c2)
    return (escape(val), 2)
360
-
361
-
362
def _escape_char(c):
    """Return the regex-safe spelling of `c`, a character escape() will not copy verbatim."""
    if c in '\n':
        return '\\n'
    if c == '\r':
        return '\\r'
    if c in _special:
        return '\\' + c
    # Anything else becomes a two-digit \xHH escape.
    high, low = divmod(ord(c), 16)
    return '\\x{0}{1}'.format(_hex[high], _hex[low])
372
-
373
-
374
def escape(string):
    """Return `string` with every character outside _ordinary replaced by its regex escape."""
    pieces = []
    for ch in string:
        pieces.append(ch if ch in _ordinary else _escape_char(ch))
    return ''.join(pieces)
377
-
378
-
379
def calculate_repetition(char, pos, offset, maxoffset):
    """Build the `char{m,n}` repetition for an offset/maxoffset pair.

    A single repetition count may not exceed 65535 (see
    https://bugs.python.org/issue13169), so larger values are split into a
    65535 chunk followed, recursively, by a repetition for the remainder.

    @param char: the regex atom to repeat (callers pass '.').
    @param pos: position code; only 'BOF', 'EOF' and 'IFB' produce output.
    @param offset: minimum count as a decimal string; None counts as '0'.
    @param maxoffset: maximum count as a decimal string, or None for none.
    @return the regex fragment, '' when nothing has to be skipped.
    """
    limit = 65535
    offset = '0' if offset is None else str(offset)
    if maxoffset is not None:
        maxoffset = str(maxoffset)

    # Remainders carried into the recursive call. They used to default to
    # the integer 0, which made the recursion fail with a TypeError on
    # "',' + 0" whenever only `offset` exceeded the limit.
    offset_rest = '0'
    maxoffset_rest = None
    overflow = False
    if int(offset) > limit:
        offset_rest = str(int(offset) - limit)
        offset = str(limit)
        overflow = True
    if maxoffset is not None and int(maxoffset) > limit:
        maxoffset_rest = str(int(maxoffset) - limit)
        maxoffset = str(limit)
        overflow = True

    parts = []
    if pos in ('BOF', 'EOF', 'IFB'):
        if offset != '0':
            bounds = offset if maxoffset is None else offset + ',' + maxoffset
            parts.append(char + '{' + bounds + '}')
            if pos == 'IFB' and maxoffset is not None:
                # NOTE(review): kept from the original; this emits a literal
                # ",}" after the quantifier, which looks unintended - confirm.
                parts.append(',}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if overflow:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offset_rest, maxoffset_rest))

    return ''.join(parts)
422
-
423
-
424
def _write_byte_range(out, chars, i, littleendian):
    """Write the regex class for a PRONOM ``[AA:BB]`` range starting at chars[i] == '['.

    @return the index just past the closing ']', or None when the bracket
        does not hold a plain two-byte range (':' or ']' missing).
    """
    out.write('[')
    i += 1
    (byt, inc) = doByte(chars, i, littleendian)
    out.write(byt)
    i += inc
    if chars[i:i + 1] != ':':
        return None
    out.write('-')
    i += 1
    (byt, inc) = doByte(chars, i, littleendian)
    out.write(byt)
    i += inc
    if chars[i:i + 1] != ']':
        return None
    out.write(']')
    return i + 1


def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
    """
    Convert a PRONOM byte sequence into a regular expression.

    @param chars: the PRONOM byte sequence.
    @param endianness: when it contains 'Big' the nibbles of every hex pair
        are swapped by doByte(); any other value keeps them in written order.
    @param pos: position code ('BOF', 'EOF', 'VAR' or 'IFB').
    @param offset: minimum offset as a decimal string; empty means '0'.
    @param maxoffset: maximum offset as a decimal string; empty or '0' means
        no maximum.
    @return the regular expression, or "__INCOMPATIBLE_SIG__" when a
        bracket expression is not a plain ``[AA:BB]`` range.
    @raise Exception on illegal characters or an unterminated group.
    """
    littleendian = 'Big' not in endianness
    if not offset:
        offset = '0'
    if not maxoffset or maxoffset == '0':
        maxoffset = None
    # make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
    global buf
    buf = cStringIO()
    buf.write("(?s)")  # If a regex starts with (?s), it is equivalent to DOTALL.
    for anchored in ('BOF', 'IFB'):
        if anchored in pos:
            buf.write('\\A')  # start of regex
            buf.write(calculate_repetition('.', pos, offset, maxoffset))

    i = 0
    end = len(chars)
    while i < end:
        c = chars[i]
        if c.isalnum():
            # a hex pair
            (byt, inc) = doByte(chars, i, littleendian)
            buf.write(byt)
            i += inc
        elif chars[i:i + 2] == '[!':
            # non-match group: [!AABB...]
            buf.write('(!')
            i += 2
            while True:
                # Slicing yields '' at end of input, so an unterminated
                # group reaches the error branch instead of an IndexError.
                c = chars[i:i + 1]
                if c.isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                elif c == ']':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in non-match', c, i, chars))
            buf.write(')')
            i += 1
        elif c == '[':
            # byte range: [AA:BB]
            try:
                after = _write_byte_range(buf, chars, i, littleendian)
            except Exception:
                print(_convert_err_msg('Illegal character in bracket', chars[i:i + 1], i, chars))
                raise
            if after is None:
                return "__INCOMPATIBLE_SIG__"
            # A '{' right after the range is picked up by the curly branch
            # on the next pass, exactly like a free-standing one.
            i = after
        elif c == '{':
            # {nnnn} or {nnn-nnn} or {nnn-*}
            # {nnn} or {nnn,nnn} or {nnn,}
            # A '.' is always needed, for droid equiv behavior.
            buf.write('.')
            buf.write('{')
            i += 1  # skip the {
            while True:
                c = chars[i:i + 1]
                if c.isalnum():
                    buf.write(c)
                    i += 1
                elif c == '-':
                    buf.write(',')
                    i += 1
                elif c == '*':  # skip the *
                    i += 1
                elif c == '}':
                    break
                else:
                    raise Exception(_convert_err_msg('Illegal character in curly', c, i, chars))
            buf.write('}')
            i += 1  # skip the }
        elif c == '(':
            # alternation: (AA|BB|[CC:DD])
            buf.write('(?:')
            i += 1
            while True:
                c = chars[i:i + 1]
                if c.isalnum():
                    (byt, inc) = doByte(chars, i, littleendian)
                    buf.write(byt)
                    i += inc
                elif c == '|':
                    buf.write('|')
                    i += 1
                elif c == ')':
                    break
                elif c == '[':
                    # fix FIDO-20: ranges inside an alternation
                    after = _write_byte_range(buf, chars, i, littleendian)
                    if after is None:
                        return "__INCOMPATIBLE_SIG__"
                    i = after
                else:
                    raise Exception(_convert_err_msg("Current state = 'paren' : Illegal character in paren", c, i, chars))
            buf.write(')')
            i += 1
        elif c == '*':
            buf.write('.*')
            i += 1
        elif c == '+':
            buf.write('.+')
            i += 1
        elif c == '?':
            follower = chars[i + 1:i + 2]
            if follower != '?':
                raise Exception(_convert_err_msg('Illegal character after ?', follower, i + 1, chars))
            buf.write('.?')
            i += 2
        else:
            raise Exception(_convert_err_msg('Illegal character in start', c, i, chars))

    if 'EOF' in pos:
        buf.write(calculate_repetition('.', pos, offset, maxoffset))
        buf.write('\\Z')

    # The buffer is left open on purpose: _convert_err_msg() may still read
    # the global afterwards, and an in-memory buffer holds no OS resource.
    return buf.getvalue()
613
-
614
-
615
def run(input=None, output=None, puid=None):
    """Convert PRONOM formats into FIDO signatures.

    `input` and `output` default to the zip file and signature file reported
    by get_local_pronom_versions(); `puid` is passed to load_pronom_xml() as
    its filter.
    """
    versions = get_local_pronom_versions()
    source = versions.get_zip_file() if input is None else input
    target = versions.get_signature_file() if output is None else output

    info = FormatInfo(source)
    info.load_pronom_xml(puid)
    info.save(target)
    converted = len(info.formats)
    print('Converted {0} PRONOM formats to FIDO signatures'.format(converted), file=sys.stderr)
628
-
629
-
630
def main(args=None):
    """Main CLI entrypoint.

    @param args: argument list to parse; defaults to sys.argv[1:].
    """
    if args is None:
        args = sys.argv[1:]

    parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
    parser.add_argument('-input', default=None, help='Input file, a Zip containing PRONOM XML files')
    # Help text used to read 'Ouptut file'.
    parser.add_argument('-output', default=None, help='Output file')
    parser.add_argument('-puid', default=None, help='A particular PUID record to extract')
    options = parser.parse_args(args)

    run(input=options.input, output=options.output, puid=options.puid)


if __name__ == '__main__':
    main()