libis-format 0.9.30 → 0.9.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/lib/libis/format/converter/image_converter.rb +2 -2
  3. data/lib/libis/format/office_to_pdf.rb +1 -1
  4. data/lib/libis/format/version.rb +1 -1
  5. data/spec/converter_spec.rb +43 -27
  6. data/spec/data/test-options.png +0 -0
  7. data/spec/data/test.pdf.tif +0 -0
  8. data/tools/droid/{DROID_SignatureFile_V82.xml → DROID_SignatureFile_V90.xml} +8202 -701
  9. data/tools/droid/{container-signature-20150307.xml → container-signature-20170330.xml} +3584 -2235
  10. data/tools/droid/droid-command-line-6.3.jar +0 -0
  11. data/tools/droid/droid.bat +152 -154
  12. data/tools/droid/droid.sh +30 -16
  13. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  14. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  15. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  16. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  17. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  18. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  19. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  20. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  21. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  22. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  23. data/tools/droid/lib/{droid-help-6.1.5.jar → droid-help-6.3.jar} +0 -0
  24. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  25. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  26. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  27. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  28. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  29. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  30. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  31. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  32. data/tools/droid/lib/poi-3.13.jar +0 -0
  33. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  34. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  35. data/tools/fido/__init__.py +50 -0
  36. data/tools/fido/conf/DROID_SignatureFile-v90.xml +2 -0
  37. data/tools/fido/conf/{container-signature-20150307.xml → container-signature-20170330.xml} +1487 -141
  38. data/tools/fido/conf/format_extensions.xml +0 -14
  39. data/tools/fido/conf/{formats-v81.xml → formats-v90.xml} +11409 -887
  40. data/tools/fido/conf/{pronom-xml-v81.zip → pronom-xml-v90.zip} +0 -0
  41. data/tools/fido/conf/versions.xml +6 -6
  42. data/tools/fido/fido.py +437 -407
  43. data/tools/fido/package.py +96 -0
  44. data/tools/fido/prepare.py +217 -188
  45. data/tools/fido/pronomutils.py +143 -58
  46. data/tools/fido/toxml.py +54 -46
  47. data/tools/fido/update_signatures.py +139 -127
  48. metadata +34 -40
  49. data/tools/droid/droid-command-line-6.1.5.jar +0 -0
  50. data/tools/droid/lib/antlr-2.7.7.jar +0 -0
  51. data/tools/droid/lib/antlr-3.2.jar +0 -0
  52. data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
  53. data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
  54. data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
  55. data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
  56. data/tools/droid/lib/commons-codec-1.4.jar +0 -0
  57. data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
  58. data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
  59. data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
  60. data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
  61. data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
  62. data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
  63. data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
  64. data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
  65. data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
  66. data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
  67. data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
  68. data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
  69. data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
  70. data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
  71. data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
  72. data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
  73. data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
  74. data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
  75. data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
  76. data/tools/droid/lib/poi-3.7.jar +0 -0
  77. data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
  78. data/tools/fido/argparselocal.py +0 -2355
  79. data/tools/fido/conf/DROID_SignatureFile-v81.xml +0 -2
@@ -0,0 +1,96 @@
1
+ """Support for containers."""
2
+
3
+ import re
4
+ import zipfile
5
+
6
+ import olefile
7
+ from six import iteritems
8
+
9
+
10
+ class Package(object):
11
+ """Base class for container support."""
12
+
13
+ def _process_puid_map(self, data, puid_map):
14
+ results = []
15
+ for puid, signatures in iteritems(puid_map):
16
+ results.extend(self._process_matches(data, puid, signatures))
17
+
18
+ return results
19
+
20
+ def _process_matches(self, data, puid, signatures):
21
+ results = []
22
+ for signature in signatures:
23
+ if re.search(signature["signature"], data):
24
+ results.append(puid)
25
+
26
+ return results
27
+
28
+
29
+ class OlePackage(Package):
30
+ """OlePackage supports OLE containers."""
31
+
32
+ def __init__(self, ole, signatures):
33
+ """Instantiate OlePackage object given the location of its file and signatures."""
34
+ self.ole = ole
35
+ self.signatures = signatures
36
+
37
+ def detect_formats(self):
38
+ """Detect available formats inside the OLE container."""
39
+ try:
40
+ ole = olefile.OleFileIO(self.ole)
41
+ except IOError:
42
+ return []
43
+
44
+ results = []
45
+ for path, puid_map in iteritems(self.signatures):
46
+ # Each OLE container signature lists the path of the file inside the OLE
47
+ # on which it operates; if the file is missing, there can be no match.
48
+ # This is not a precise match because the name of the stream may slightly
49
+ # differ; for example, \x01CompObj instead of CompObj
50
+ filepath = None
51
+ for paths in ole.listdir():
52
+ p = '/'.join(paths)
53
+ if p == path or p[1:] == path:
54
+ filepath = p
55
+ break
56
+
57
+ # Path to match isn't in the container at all
58
+ if filepath is None:
59
+ continue
60
+
61
+ with ole.openstream(filepath) as stream:
62
+ contents = stream.read()
63
+ results.extend(self._process_puid_map(contents, puid_map))
64
+
65
+ return results
66
+
67
+
68
+ class ZipPackage(Package):
69
+ """ZipPackage supports Zip containers."""
70
+
71
+ def __init__(self, zip_, signatures):
72
+ """Instantiate ZipPackage object given the location of its file and signatures."""
73
+ self.zip = zip_
74
+ self.signatures = signatures
75
+
76
+ def detect_formats(self):
77
+ """Detect available formats inside the ZIP container."""
78
+ try:
79
+ zip_ = zipfile.ZipFile(self.zip)
80
+ except (zipfile.BadZipfile, RuntimeError, UnicodeDecodeError):
81
+ return []
82
+
83
+ results = []
84
+ for path, puid_map in iteritems(self.signatures):
85
+ # Each ZIP container signature lists the path of the file inside the ZIP
86
+ # on which it operates; if the file is missing, there can be no match.
87
+ if path not in zip_.namelist():
88
+ continue
89
+
90
+ # Extract the requested file from the ZIP only once, and pass the same
91
+ # data to each signature that requires it.
92
+ with zip_.open(path) as id_file:
93
+ contents = id_file.read()
94
+ results.extend(self._process_puid_map(contents, puid_map))
95
+
96
+ return results
@@ -1,88 +1,112 @@
1
- #!python
1
+ #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
- # Format Identification for Digital Objects
4
3
 
5
- # MdR: 'reload(sys)' and 'setdefaultencoding("utf-8")' needed to fix utf-8 encoding errors
6
- # when converting from PRONOM to FIDO format
7
- import sys
8
- reload(sys)
9
- sys.setdefaultencoding("utf-8")
10
- import cStringIO, zipfile, os
4
+ """Format Identification for Digital Objects."""
5
+
6
+ from __future__ import print_function
7
+
8
+ from argparse import ArgumentParser
11
9
  import hashlib
12
- import urllib
10
+ import sys
11
+ from xml.dom import minidom
13
12
  from xml.etree import ElementTree as ET
14
- from xml.etree import ElementTree as VET # versions.xml
15
- # needed for debug
16
- # print_r: https://github.com/marcbelmont/python-print_r
17
- # from print_r import print_r
13
+ import zipfile
14
+
15
+ from six.moves import cStringIO
16
+ from six.moves.urllib.request import urlopen
17
+ from six.moves.urllib.parse import urlparse
18
+
19
+ from .pronomutils import get_local_pronom_versions
20
+
21
+
22
+ # \a\b\n\r\t\v
23
+ # MdR: took out '<' and '>' out of _ordinary because they were converted to entities &lt;&gt;
24
+ # MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
25
+ _ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
26
+ _special = '$()*+.?![]^\\{|}'
27
+ _hex = '0123456789abcdef'
28
+
18
29
 
19
30
  class NS:
20
- """Helper class for XML name spaces in ElementTree.
21
- Use like MYNS=NS("{http://some/uri}") and then
22
- MYNS(tag1/tag2).
23
31
  """
32
+ Helper class for XML name spaces in ElementTree.
33
+
34
+ Use like MYNS=NS("{http://some/uri}") and then MYNS(tag1/tag2).
35
+ """
36
+
24
37
  def __init__(self, uri):
38
+ """Instantiate class with `uri` argument."""
25
39
  self.uri = uri
40
+
26
41
  def __getattr__(self, tag):
42
+ """Append URI to the class attributes."""
27
43
  return self.uri + tag
44
+
28
45
  def __call__(self, path):
46
+ """Define behavior when the instant is used as a function."""
29
47
  return "/".join(getattr(self, tag) for tag in path.split("/"))
30
48
 
31
- # XHTML namespace
32
- XHTML = NS("{http://www.w3.org/1999/xhtml}")
33
- # TNA namespace
34
- TNA = NS("{http://pronom.nationalarchives.gov.uk}")
49
+
50
+ XHTML = NS("{http://www.w3.org/1999/xhtml}") # XHTML namespace
51
+ TNA = NS("{http://pronom.nationalarchives.gov.uk}") # TNA namespace
52
+
35
53
 
36
54
  def get_text_tna(element, tag, default=''):
37
- """Helper function to return the text for a tag or path using the TNA namespace.
38
- """
55
+ """Helper function to return the text for a tag or path using the TNA namespace."""
39
56
  part = element.find(TNA(tag))
40
- return part.text.strip() if part != None and part.text != None else default
57
+ if part is None or part.text is None:
58
+ return default
59
+ return part.text.strip()
60
+
41
61
 
42
62
  def prettify(elem):
43
- """Return a pretty-printed XML string for the Element.
44
- """
45
- from xml.dom import minidom
63
+ """Return a pretty-printed XML string for the Element."""
46
64
  rough_string = ET.tostring(elem, 'UTF-8')
47
65
  reparsed = minidom.parseString(rough_string)
48
66
  return reparsed.toprettyxml(indent=" ")
49
67
 
68
+
50
69
  class FormatInfo:
70
+ """Convert PRONOM formats into FIDO signatures."""
71
+
51
72
  def __init__(self, pronom_files, format_list=[]):
73
+ """Instantiate class, take a list of PRONOM files and an optional list of formats."""
52
74
  self.info = {}
53
75
  self.formats = []
54
76
  self.pronom_files = pronom_files
55
77
  for f in format_list:
56
- self.add_format(f)
57
-
58
- def save(self, dst):
59
- """Write the fido XML format definitions to @param dst
60
- """
61
- tree = ET.ElementTree(ET.Element('formats', {'version':'0.3',
62
- 'xmlns:xsi' : "http://www.w3.org/2001/XMLSchema-instance",
63
- 'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
64
- 'xmlns:dc': "http://purl.org/dc/elements/1.1/",
65
- 'xmlns:dcterms': "http://purl.org/dc/terms/"}))
78
+ self.add_format(f) # FIXME: add_format is undefined!
79
+
80
+ def save(self, dst=sys.stdout):
81
+ """Write the fido XML format definitions to @param dst."""
82
+ tree = ET.ElementTree(ET.Element('formats', {
83
+ 'version': '0.3',
84
+ 'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
85
+ 'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
86
+ 'xmlns:dc': "http://purl.org/dc/elements/1.1/",
87
+ 'xmlns:dcterms': "http://purl.org/dc/terms/"
88
+ }))
66
89
  root = tree.getroot()
67
90
  for f in self.formats:
68
91
  # MdR: this skipped puids without sig, but we want them ALL
69
92
  # because puid might be matched on extension
70
- #if f.find('signature'):
93
+ # if f.find('signature'):
71
94
  root.append(f)
72
95
  self.indent(root)
73
- with open(dst, 'wb') as out:
74
- #print >>out, ET.tostring(root,encoding='utf-8')
75
- print >>out, ET.tostring(root)
96
+ with open(dst, 'wb') as file_:
97
+ # print >>out, ET.tostring(root,encoding='utf-8')
98
+ print(ET.tostring(root), file=file_)
76
99
 
77
100
  def indent(self, elem, level=0):
78
- i = "\n" + level*" "
101
+ """Indent output."""
102
+ i = "\n" + level * " "
79
103
  if len(elem):
80
104
  if not elem.text or not elem.text.strip():
81
105
  elem.text = i + " "
82
106
  if not elem.tail or not elem.tail.strip():
83
107
  elem.tail = i
84
108
  for elem in elem:
85
- self.indent(elem, level+1)
109
+ self.indent(elem, level + 1)
86
110
  if not elem.tail or not elem.tail.strip():
87
111
  elem.tail = i
88
112
  else:
@@ -90,52 +114,57 @@ class FormatInfo:
90
114
  elem.tail = i
91
115
 
92
116
  def load_pronom_xml(self, puid_filter=None):
93
- """Load the pronom XML from self.pronom_files and convert it to fido XML.
94
- As a side-effect, set self.formats to a list of ElementTree.Element
95
- If a @param puid is specified, only that one will be loaded.
117
+ """
118
+ Load the pronom XML from self.pronom_files and convert it to fido XML.
119
+
120
+ As a side-effect, set self.formats to a list of ElementTree.Element.
121
+ If a @param puid is specified, only that one will be loaded.
96
122
  """
97
123
  formats = []
98
- #for p in self.pronom_files:
124
+ # for p in self.pronom_files:
99
125
  # print p
100
- #print self.pronom_files
101
- #exit()
126
+ # print self.pronom_files
127
+ # exit()
102
128
  try:
103
129
  zip = zipfile.ZipFile(self.pronom_files, 'r')
104
130
  for item in zip.infolist():
105
- #print item.filename
131
+ # print item.filename
106
132
  try:
107
133
  stream = zip.open(item)
108
134
  # Work is done here!
109
- #if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
110
- format = self.parse_pronom_xml(stream, puid_filter)
111
- if format != None:
112
- formats.append(format)
135
+ # if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
136
+ format_ = self.parse_pronom_xml(stream, puid_filter)
137
+ if format_ is not None:
138
+ formats.append(format_)
113
139
  finally:
114
140
  stream.close()
115
141
  finally:
116
142
  try:
117
143
  zip.close()
118
- except Exception, e:
119
- sys.stderr.write("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e))
144
+ except Exception as e:
145
+ print("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e), file=sys.stderr)
120
146
  sys.exit()
121
147
  # Replace the formatID with puids in has_priority_over
122
- id_map = {}
123
- for element in formats:
124
- puid = element.find('puid').text
125
- #print "working on puid:",puid
126
- pronom_id = element.find('pronom_id').text
127
- id_map[pronom_id] = puid
128
- for element in formats:
129
- for rel in element.findall('has_priority_over'):
130
- rel.text = id_map[rel.text]
148
+ if puid_filter is None:
149
+ id_map = {}
150
+ for element in formats:
151
+ puid = element.find('puid').text
152
+ # print "working on puid:",puid
153
+ pronom_id = element.find('pronom_id').text
154
+ id_map[pronom_id] = puid
155
+ for element in formats:
156
+ for rel in element.findall('has_priority_over'):
157
+ rel.text = id_map[rel.text]
131
158
 
132
159
  self._sort_formats(formats)
133
160
  self.formats = formats
134
-
161
+
135
162
  def parse_pronom_xml(self, source, puid_filter=None):
136
- """Read a pronom XML from @param source, convert it to fido XML and
137
- @return ET.ElementTree Element representing it.
138
- If a @param puid is specified, only that one will be loaded.
163
+ """
164
+ Parse PRONOM XML and convert into FIDO XML.
165
+
166
+ If a @param puid is specified, only that one will be loaded.
167
+ @return ET.ElementTree Element representing it.
139
168
  """
140
169
  pronom_xml = ET.parse(source)
141
170
  pronom_root = pronom_xml.getroot()
@@ -147,13 +176,13 @@ class FormatInfo:
147
176
  if type == 'PUID':
148
177
  puid = get_text_tna(id, 'Identifier')
149
178
  ET.SubElement(fido_format, 'puid').text = puid
150
- if puid_filter != None and puid != puid_filter:
179
+ if puid_filter and puid != puid_filter:
151
180
  return None
152
181
  # A bit clumsy. I want to have puid first, then mime, then container.
153
182
  for id in pronom_format.findall(TNA('FileFormatIdentifier')):
154
183
  type = get_text_tna(id, 'IdentifierType')
155
184
  if type == 'MIME':
156
- ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
185
+ ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
157
186
  elif type == 'PUID':
158
187
  puid = get_text_tna(id, 'Identifier')
159
188
  if puid == 'x-fmt/263':
@@ -170,7 +199,7 @@ class FormatInfo:
170
199
  for id in pronom_format.findall(TNA('FileFormatIdentifier')):
171
200
  type = get_text_tna(id, 'IdentifierType')
172
201
  if type == 'Apple Uniform Type Identifier':
173
- ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
202
+ ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
174
203
  # Handle the relationships
175
204
  for x in pronom_format.findall(TNA('RelatedFormat')):
176
205
  rel = get_text_tna(x, 'RelationshipType')
@@ -181,20 +210,20 @@ class FormatInfo:
181
210
  fido_sig = ET.SubElement(fido_format, 'signature')
182
211
  ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
183
212
  # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
184
- ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote').encode('UTF-8')
213
+ ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
185
214
  for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
186
215
  fido_pat = ET.SubElement(fido_sig, 'pattern')
187
216
  pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
188
217
  bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
189
218
  offset = get_text_tna(pronom_pat, 'Offset')
190
219
  max_offset = get_text_tna(pronom_pat, 'MaxOffset')
191
- if max_offset == None:
220
+ if not max_offset:
192
221
  pass
193
- #print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
222
+ # print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
194
223
  regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
195
- #print "done puid", puid
224
+ # print "done puid", puid
196
225
  if regex == "__INCOMPATIBLE_SIG__":
197
- print >> sys.stderr, "Error: incompatible PRONOM signature found for puid", puid, ", skipping..."
226
+ print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
198
227
  # remove the empty 'signature' nodes
199
228
  # now that the signature is not compatible and thus "regex" is empty
200
229
  remove = fido_format.findall('signature')
@@ -205,8 +234,8 @@ class FormatInfo:
205
234
  ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
206
235
  ET.SubElement(fido_pat, 'regex').text = regex
207
236
  # Get the format details
208
- fido_details = ET.SubElement(fido_format,'details')
209
- ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription').encode('utf8')
237
+ fido_details = ET.SubElement(fido_format, 'details')
238
+ ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
210
239
  ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
211
240
  ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
212
241
  ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
@@ -221,7 +250,7 @@ class FormatInfo:
221
250
  ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
222
251
  # References
223
252
  for x in pronom_format.findall(TNA("Document")):
224
- r = ET.SubElement(fido_details,'reference')
253
+ r = ET.SubElement(fido_details, 'reference')
225
254
  ET.SubElement(r, 'dc:title').text = get_text_tna(x, 'TitleText')
226
255
  ET.SubElement(r, 'dc:creator').text = get_text_tna(x, 'Author/AuthorCompoundName')
227
256
  ET.SubElement(r, 'dc:publisher').text = get_text_tna(x, 'Publisher/PublisherCompoundName')
@@ -229,51 +258,53 @@ class FormatInfo:
229
258
  for id in x.findall(TNA('DocumentIdentifier')):
230
259
  type = get_text_tna(id, 'IdentifierType')
231
260
  if type == 'URL':
232
- ET.SubElement(r, 'dc:identifier').text = "http://"+get_text_tna(id, 'Identifier')
261
+ ET.SubElement(r, 'dc:identifier').text = "http://" + get_text_tna(id, 'Identifier')
233
262
  else:
234
- ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
263
+ ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
235
264
  ET.SubElement(r, 'dc:description').text = get_text_tna(x, 'DocumentNote')
236
265
  ET.SubElement(r, 'dc:type').text = get_text_tna(x, 'DocumentType')
237
- ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription')+" "+get_text_tna(x, 'AvailabilityNote')
266
+ ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription') + " " + get_text_tna(x, 'AvailabilityNote')
238
267
  ET.SubElement(r, 'dc:rights').text = get_text_tna(x, 'DocumentIPR')
239
- # Examples
268
+ # Examples
240
269
  for x in pronom_format.findall(TNA("ReferenceFile")):
241
- rf = ET.SubElement(fido_details,'example_file')
270
+ rf = ET.SubElement(fido_details, 'example_file')
242
271
  ET.SubElement(rf, 'dc:title').text = get_text_tna(x, 'ReferenceFileName')
243
272
  ET.SubElement(rf, 'dc:description').text = get_text_tna(x, 'ReferenceFileDescription')
244
273
  checksum = ""
245
274
  for id in x.findall(TNA('ReferenceFileIdentifier')):
246
275
  type = get_text_tna(id, 'IdentifierType')
247
276
  if type == 'URL':
248
- url = "http://"+get_text_tna(id, 'Identifier')
249
- ET.SubElement(rf, 'dc:identifier').text = url
277
+ # Starting with PRONOM 89, some URLs contain http://
278
+ # and others do not.
279
+ url = get_text_tna(id, 'Identifier')
280
+ if not urlparse(url).scheme:
281
+ url = "http://" + url
282
+ ET.SubElement(rf, 'dc:identifier').text = url
250
283
  # And calculate the checksum of this resource:
251
284
  m = hashlib.md5()
252
- sock = urllib.urlopen(url)
285
+ sock = urlopen(url)
253
286
  m.update(sock.read())
254
287
  sock.close()
255
- checksum=m.hexdigest()
288
+ checksum = m.hexdigest()
256
289
  else:
257
- ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
290
+ ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
258
291
  ET.SubElement(rf, 'dcterms:license').text = ""
259
292
  ET.SubElement(rf, 'dc:rights').text = get_text_tna(x, 'ReferenceFileIPR')
260
293
  checksumElement = ET.SubElement(rf, 'checksum')
261
294
  checksumElement.text = checksum
262
295
  checksumElement.attrib['type'] = "md5"
263
296
  # Record Metadata
264
- md = ET.SubElement(fido_details,'record_metadata')
265
- ET.SubElement(md, 'status').text ='unknown'
297
+ md = ET.SubElement(fido_details, 'record_metadata')
298
+ ET.SubElement(md, 'status').text = 'unknown'
266
299
  ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
267
300
  ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
268
301
  ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
269
- ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription').encode('utf8')
270
- return fido_format
271
-
272
- #FIXME: I don't think that this quite works yet!
302
+ ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
303
+ return fido_format
304
+
305
+ # FIXME: I don't think that this quite works yet!
273
306
  def _sort_formats(self, formatlist):
274
- """Sort the format list based on their priority relationships so higher priority
275
- formats appear earlier in the list.
276
- """
307
+ """Sort the format list based on their priority relationships so higher priority formats appear earlier in the list."""
277
308
  def compare_formats(f1, f2):
278
309
  f1ID = f1.find('puid').text
279
310
  f2ID = f2.find('puid').text
@@ -291,9 +322,9 @@ class FormatInfo:
291
322
  return 1
292
323
  return sorted(formatlist, cmp=compare_formats)
293
324
 
325
+
294
326
  def fido_position(pronom_position):
295
- """@return BOF/EOF/VAR instead of the more verbose pronom position names.
296
- """
327
+ """Return BOF/EOF/VAR instead of the more verbose pronom position names."""
297
328
  if pronom_position == 'Absolute from BOF':
298
329
  return 'BOF'
299
330
  elif pronom_position == 'Absolute from EOF':
@@ -302,16 +333,20 @@ def fido_position(pronom_position):
302
333
  return 'VAR'
303
334
  elif pronom_position == 'Indirect From BOF':
304
335
  return 'IFB'
305
- else: # to make sure FIDO does not crash (IFB aftermath)
306
- sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
336
+ else: # to make sure FIDO does not crash (IFB aftermath)
337
+ sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
307
338
  return 'VAR'
308
339
 
340
+
309
341
  def _convert_err_msg(msg, c, i, chars):
310
342
  return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
311
343
 
344
+
312
345
  def doByte(chars, i, littleendian):
313
- """Convert two chars[i] and chars[i+1] into a byte.
314
- @return a tuple (byte, 2)
346
+ """
347
+ Convert two chars[i] and chars[i+1] into a byte.
348
+
349
+ @return a tuple (byte, 2)
315
350
  """
316
351
  c1 = '0123456789ABCDEF'.find(chars[i].upper())
317
352
  c2 = '0123456789ABCDEF'.find(chars[i + 1].upper())
@@ -323,12 +358,7 @@ def doByte(chars, i, littleendian):
323
358
  val = chr(c1 + 16 * c2)
324
359
  return (escape(val), 2)
325
360
 
326
- # \a\b\n\r\t\v
327
- # MdR: took out '<' and '>' out of _ordinary because they were converted to entities &lt;&gt;
328
- # MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
329
- _ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
330
- _special = '$()*+.?![]^\\{|}'
331
- _hex = '0123456789abcdef'
361
+
332
362
  def _escape_char(c):
333
363
  if c in '\n':
334
364
  return '\\n'
@@ -340,68 +370,66 @@ def _escape_char(c):
340
370
  (high, low) = divmod(ord(c), 16)
341
371
  return '\\x' + _hex[high] + _hex[low]
342
372
 
373
+
343
374
  def escape(string):
344
- "Escape characters in pattern that are non-printable, non-ascii, or special for regexes."
375
+ """Escape characters in pattern that are non-printable, non-ascii, or special for regexes."""
345
376
  return ''.join(c if c in _ordinary else _escape_char(c) for c in string)
346
377
 
378
+
347
379
  def calculate_repetition(char, pos, offset, maxoffset):
348
- """
349
- Recursively calculates offset/maxoffset repetition,
350
- when one or both offsets is greater than 65535 bytes (64KB)
351
- see: bugs.python.org/issue13169
352
- Otherwise it returns the {offset,maxoffset}
353
- """
354
- calcbuf = cStringIO.StringIO()
355
-
380
+ """Recursively calculates offset/maxoffset repetition, when one or both offsets is greater than 65535 bytes (64KB). See: https://bugs.python.org/issue13169."""
381
+ calcbuf = cStringIO()
382
+
356
383
  calcremain = False
357
384
  offsetremain = 0
358
385
  maxoffsetremain = 0
359
-
360
- if offset != None and offset != '':
361
- if int(offset) > 65535:
362
- offsetremain = str(int(offset) - 65535)
363
- offset = '65535'
364
- calcremain = True
365
- if maxoffset != None and maxoffset != '':
366
- if int(maxoffset) > 65535:
367
- maxoffsetremain = str(int(maxoffset) - 65535)
368
- maxoffset = '65535'
369
- calcremain = True
370
-
386
+
387
+ if offset is not None and int(offset) > 65535:
388
+ offsetremain = str(int(offset) - 65535)
389
+ offset = '65535'
390
+ calcremain = True
391
+ if maxoffset is not None and int(maxoffset) > 65535:
392
+ maxoffsetremain = str(int(maxoffset) - 65535)
393
+ maxoffset = '65535'
394
+ calcremain = True
395
+
371
396
  if pos == "BOF" or pos == "EOF":
372
397
  if offset != '0':
373
398
  calcbuf.write(char + '{' + str(offset))
374
- if maxoffset != None:
399
+ if maxoffset is not None:
375
400
  calcbuf.write(',' + maxoffset)
376
401
  calcbuf.write('}')
377
- elif maxoffset != None:
402
+ elif maxoffset is not None:
378
403
  calcbuf.write(char + '{0,' + maxoffset + '}')
379
404
 
380
405
  if pos == "IFB":
381
406
  if offset != '0':
382
407
  calcbuf.write(char + '{' + str(offset))
383
- if maxoffset != None:
408
+ if maxoffset is not None:
384
409
  calcbuf.write(',' + maxoffset)
385
410
  calcbuf.write('}')
386
- if maxoffset == None:
411
+ if maxoffset is not None:
387
412
  calcbuf.write(',}')
388
- elif maxoffset != None:
413
+ elif maxoffset is not None:
389
414
  calcbuf.write(char + '{0,' + maxoffset + '}')
390
415
 
391
- if calcremain: # recursion happens here
416
+ if calcremain: # recursion happens here
392
417
  calcbuf.write(calculate_repetition(char, pos, offsetremain, maxoffsetremain))
393
-
418
+
394
419
  val = calcbuf.getvalue()
395
420
  calcbuf.close()
396
421
  return val
397
422
 
423
+
398
424
  def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
399
- """Convert
400
- @param chars, a pronom bytesequence, into a
401
- @return regular expression.
402
- Endianness is not used.
403
425
  """
426
+ Convert to regular expression.
427
+
428
+ Endianness is not used.
404
429
 
430
+ @param chars, a pronom bytesequence, into a
431
+ @return regular expression.
432
+ """
405
433
  if 'Big' in endianness:
406
434
  littleendian = False
407
435
  else:
@@ -410,24 +438,26 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
410
438
  offset = '0'
411
439
  if len(maxoffset) == 0:
412
440
  maxoffset = None
441
+ if maxoffset == '0':
442
+ maxoffset = None
413
443
  # make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
414
444
  global buf
415
- buf = cStringIO.StringIO()
416
- buf.write("(?s)") #If a regex starts with (?s), it is equivalent to DOTALL.
445
+ buf = cStringIO()
446
+ buf.write("(?s)") # If a regex starts with (?s), it is equivalent to DOTALL.
417
447
  i = 0
418
448
  state = 'start'
419
449
  if 'BOF' in pos:
420
- buf.write('\\A') # start of regex
450
+ buf.write('\\A') # start of regex
421
451
  buf.write(calculate_repetition('.', pos, offset, maxoffset))
422
-
452
+
423
453
  if 'IFB' in pos:
424
454
  buf.write('\\A')
425
455
  buf.write(calculate_repetition('.', pos, offset, maxoffset))
426
-
456
+
427
457
  while True:
428
458
  if i == len(chars):
429
459
  break
430
- #print _convert_err_msg(state,chars[i],i,chars)
460
+ # print _convert_err_msg(state,chars[i],i,chars)
431
461
  if state == 'start':
432
462
  if chars[i].isalnum():
433
463
  state = 'bytes'
@@ -471,7 +501,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
471
501
  (byt, inc) = doByte(chars, i, littleendian)
472
502
  buf.write(byt)
473
503
  i += inc
474
- #assert(chars[i] == ':')
504
+ # assert(chars[i] == ':')
475
505
  if chars[i] != ':':
476
506
  return "__INCOMPATIBLE_SIG__"
477
507
  buf.write('-')
@@ -479,13 +509,13 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
479
509
  (byt, inc) = doByte(chars, i, littleendian)
480
510
  buf.write(byt)
481
511
  i += inc
482
- #assert(chars[i] == ']')
512
+ # assert(chars[i] == ']')
483
513
  if chars[i] != ']':
484
514
  return "__INCOMPATIBLE_SIG__"
485
515
  buf.write(']')
486
516
  i += 1
487
517
  except Exception:
488
- print _convert_err_msg('Illegal character in bracket', chars[i], i, chars)
518
+ print(_convert_err_msg('Illegal character in bracket', chars[i], i, chars))
489
519
  raise
490
520
  if i < len(chars) and chars[i] == '{':
491
521
  state = 'curly-after-bracket'
@@ -511,7 +541,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
511
541
  (byt, inc) = doByte(chars, i, littleendian)
512
542
  buf.write(byt)
513
543
  i += inc
514
- #assert(chars[i] == ':')
544
+ # assert(chars[i] == ':')
515
545
  if chars[i] != ':':
516
546
  return "__INCOMPATIBLE_SIG__"
517
547
  buf.write('-')
@@ -519,8 +549,8 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
519
549
  (byt, inc) = doByte(chars, i, littleendian)
520
550
  buf.write(byt)
521
551
  i += inc
522
-
523
- #assert(chars[i] == ']')
552
+
553
+ # assert(chars[i] == ']')
524
554
  if chars[i] != ']':
525
555
  return "__INCOMPATIBLE_SIG__"
526
556
  buf.write(']')
@@ -537,7 +567,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
537
567
  # when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
538
568
  # The above, while sensible, appears to be incorrect. A '.' is always needed.
539
569
  # for droid equiv behavior
540
- #if state == 'curly':
570
+ # if state == 'curly':
541
571
  buf.write('.')
542
572
  buf.write('{')
543
573
  i += 1 # skip the (
@@ -548,7 +578,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
548
578
  elif chars[i] == '-':
549
579
  buf.write(',')
550
580
  i += 1
551
- elif chars[i] == '*': # skip the *
581
+ elif chars[i] == '*': # skip the *
552
582
  i += 1
553
583
  elif chars[i] == '}':
554
584
  break
@@ -581,36 +611,35 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
581
611
  buf.close()
582
612
  return val
583
613
 
584
- def main(arg=None):
585
- import sys
586
- from argparselocal import ArgumentParser
587
- if arg != None:
588
- arglist = arg
589
- else:
590
- arglist = sys.argv[1:]
591
- # print arglist
592
- # exit()
593
- mydir = os.path.abspath(os.path.dirname(__file__))
594
- # parse version file to fetch versions
595
- versionsFile = os.path.join(mydir, 'conf', 'versions.xml')
596
- try:
597
- versions = VET.parse(versionsFile)
598
- except Exception, e:
599
- sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
600
- sys.exit()
601
- xml_pronomSignature = os.path.join(mydir, 'conf', versions.find('pronomSignature').text)
602
- xml_pronomZipFile = os.path.join(mydir, 'conf', "pronom-xml-v{0}.zip".format(versions.find('pronomVersion').text))
603
- parser = ArgumentParser(description='Produce the fido format xml that is loaded at run-time')
604
- parser.add_argument('-input', default=xml_pronomZipFile, help='input file, a zip containing Pronom xml files')
605
- parser.add_argument('-output', default=xml_pronomSignature, help='output file')
606
- parser.add_argument('-puid', default=None, help='a particular PUID record to extract')
607
- # PROCESS ARGUMENTS
608
- args = parser.parse_args(arglist)
609
- # print os.path.abspath(args.input), os.path.abspath(args.output)
610
- info = FormatInfo(args.input)
611
- info.load_pronom_xml(args.puid)
612
- info.save(args.output)
613
- print >> sys.stderr, 'Converted {0} PRONOM formats to FIDO signatures'.format(len(info.formats))
614
-
614
+
615
+ def run(input=None, output=None, puid=None):
616
+ """Convert PRONOM formats into FIDO signatures."""
617
+ versions = get_local_pronom_versions()
618
+
619
+ if input is None:
620
+ input = versions.get_zip_file()
621
+ if output is None:
622
+ output = versions.get_signature_file()
623
+
624
+ info = FormatInfo(input)
625
+ info.load_pronom_xml(puid)
626
+ info.save(output)
627
+ print('Converted {0} PRONOM formats to FIDO signatures'.format(len(info.formats)), file=sys.stderr)
628
+
629
+
630
+ def main(args=None):
631
+ """Main CLI entrypoint."""
632
+ if args is None:
633
+ args = sys.argv[1:]
634
+
635
+ parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
636
+ parser.add_argument('-input', default=None, help='Input file, a Zip containing PRONOM XML files')
637
+ parser.add_argument('-output', default=None, help='Ouptut file')
638
+ parser.add_argument('-puid', default=None, help='A particular PUID record to extract')
639
+ args = parser.parse_args(args)
640
+
641
+ run(input=args.input, output=args.output, puid=args.puid)
642
+
643
+
615
644
  if __name__ == '__main__':
616
- main()
645
+ main()