libis-format 0.9.30 → 0.9.32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/lib/libis/format/converter/image_converter.rb +2 -2
  3. data/lib/libis/format/office_to_pdf.rb +1 -1
  4. data/lib/libis/format/version.rb +1 -1
  5. data/spec/converter_spec.rb +43 -27
  6. data/spec/data/test-options.png +0 -0
  7. data/spec/data/test.pdf.tif +0 -0
  8. data/tools/droid/{DROID_SignatureFile_V82.xml → DROID_SignatureFile_V90.xml} +8202 -701
  9. data/tools/droid/{container-signature-20150307.xml → container-signature-20170330.xml} +3584 -2235
  10. data/tools/droid/droid-command-line-6.3.jar +0 -0
  11. data/tools/droid/droid.bat +152 -154
  12. data/tools/droid/droid.sh +30 -16
  13. data/tools/droid/lib/aspectjrt-1.8.7.jar +0 -0
  14. data/tools/droid/lib/aspectjweaver-1.8.7.jar +0 -0
  15. data/tools/droid/lib/byteseek-2.0.3.jar +0 -0
  16. data/tools/droid/lib/commons-codec-1.10.jar +0 -0
  17. data/tools/droid/lib/commons-collections-3.2.2.jar +0 -0
  18. data/tools/droid/lib/droid-container-6.3.jar +0 -0
  19. data/tools/droid/lib/droid-core-6.3.jar +0 -0
  20. data/tools/droid/lib/droid-core-interfaces-6.3.jar +0 -0
  21. data/tools/droid/lib/droid-export-6.3.jar +0 -0
  22. data/tools/droid/lib/droid-export-interfaces-6.3.jar +0 -0
  23. data/tools/droid/lib/{droid-help-6.1.5.jar → droid-help-6.3.jar} +0 -0
  24. data/tools/droid/lib/droid-report-6.3.jar +0 -0
  25. data/tools/droid/lib/droid-report-interfaces-6.3.jar +0 -0
  26. data/tools/droid/lib/droid-results-6.3.jar +0 -0
  27. data/tools/droid/lib/jwat-arc-1.0.3.jar +0 -0
  28. data/tools/droid/lib/jwat-archive-common-1.0.3.jar +0 -0
  29. data/tools/droid/lib/jwat-common-1.0.3.jar +0 -0
  30. data/tools/droid/lib/jwat-gzip-1.0.3.jar +0 -0
  31. data/tools/droid/lib/jwat-warc-1.0.2.jar +0 -0
  32. data/tools/droid/lib/poi-3.13.jar +0 -0
  33. data/tools/droid/lib/saaj-api-1.3.jar +0 -0
  34. data/tools/droid/lib/trove4j-3.0.3.jar +0 -0
  35. data/tools/fido/__init__.py +50 -0
  36. data/tools/fido/conf/DROID_SignatureFile-v90.xml +2 -0
  37. data/tools/fido/conf/{container-signature-20150307.xml → container-signature-20170330.xml} +1487 -141
  38. data/tools/fido/conf/format_extensions.xml +0 -14
  39. data/tools/fido/conf/{formats-v81.xml → formats-v90.xml} +11409 -887
  40. data/tools/fido/conf/{pronom-xml-v81.zip → pronom-xml-v90.zip} +0 -0
  41. data/tools/fido/conf/versions.xml +6 -6
  42. data/tools/fido/fido.py +437 -407
  43. data/tools/fido/package.py +96 -0
  44. data/tools/fido/prepare.py +217 -188
  45. data/tools/fido/pronomutils.py +143 -58
  46. data/tools/fido/toxml.py +54 -46
  47. data/tools/fido/update_signatures.py +139 -127
  48. metadata +34 -40
  49. data/tools/droid/droid-command-line-6.1.5.jar +0 -0
  50. data/tools/droid/lib/antlr-2.7.7.jar +0 -0
  51. data/tools/droid/lib/antlr-3.2.jar +0 -0
  52. data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
  53. data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
  54. data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
  55. data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
  56. data/tools/droid/lib/commons-codec-1.4.jar +0 -0
  57. data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
  58. data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
  59. data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
  60. data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
  61. data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
  62. data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
  63. data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
  64. data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
  65. data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
  66. data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
  67. data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
  68. data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
  69. data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
  70. data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
  71. data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
  72. data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
  73. data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
  74. data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
  75. data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
  76. data/tools/droid/lib/poi-3.7.jar +0 -0
  77. data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
  78. data/tools/fido/argparselocal.py +0 -2355
  79. data/tools/fido/conf/DROID_SignatureFile-v81.xml +0 -2
@@ -0,0 +1,96 @@
1
+ """Support for containers."""
2
+
3
+ import re
4
+ import zipfile
5
+
6
+ import olefile
7
+ from six import iteritems
8
+
9
+
10
+ class Package(object):
11
+ """Base class for container support."""
12
+
13
+ def _process_puid_map(self, data, puid_map):
14
+ results = []
15
+ for puid, signatures in iteritems(puid_map):
16
+ results.extend(self._process_matches(data, puid, signatures))
17
+
18
+ return results
19
+
20
+ def _process_matches(self, data, puid, signatures):
21
+ results = []
22
+ for signature in signatures:
23
+ if re.search(signature["signature"], data):
24
+ results.append(puid)
25
+
26
+ return results
27
+
28
+
29
+ class OlePackage(Package):
30
+ """OlePackage supports OLE containers."""
31
+
32
+ def __init__(self, ole, signatures):
33
+ """Instantiate OlePackage object given the location of its file and signatures."""
34
+ self.ole = ole
35
+ self.signatures = signatures
36
+
37
+ def detect_formats(self):
38
+ """Detect available formats inside the OLE container."""
39
+ try:
40
+ ole = olefile.OleFileIO(self.ole)
41
+ except IOError:
42
+ return []
43
+
44
+ results = []
45
+ for path, puid_map in iteritems(self.signatures):
46
+ # Each OLE container signature lists the path of the file inside the OLE
47
+ # on which it operates; if the file is missing, there can be no match.
48
+ # This is not a precise match because the name of the stream may slightly
49
+ # differ; for example, \x01CompObj instead of CompObj
50
+ filepath = None
51
+ for paths in ole.listdir():
52
+ p = '/'.join(paths)
53
+ if p == path or p[1:] == path:
54
+ filepath = p
55
+ break
56
+
57
+ # Path to match isn't in the container at all
58
+ if filepath is None:
59
+ continue
60
+
61
+ with ole.openstream(filepath) as stream:
62
+ contents = stream.read()
63
+ results.extend(self._process_puid_map(contents, puid_map))
64
+
65
+ return results
66
+
67
+
68
+ class ZipPackage(Package):
69
+ """ZipPackage supports Zip containers."""
70
+
71
+ def __init__(self, zip_, signatures):
72
+ """Instantiate ZipPackage object given the location of its file and signatures."""
73
+ self.zip = zip_
74
+ self.signatures = signatures
75
+
76
+ def detect_formats(self):
77
+ """Detect available formats inside the ZIP container."""
78
+ try:
79
+ zip_ = zipfile.ZipFile(self.zip)
80
+ except (zipfile.BadZipfile, RuntimeError, UnicodeDecodeError):
81
+ return []
82
+
83
+ results = []
84
+ for path, puid_map in iteritems(self.signatures):
85
+ # Each ZIP container signature lists the path of the file inside the ZIP
86
+ # on which it operates; if the file is missing, there can be no match.
87
+ if path not in zip_.namelist():
88
+ continue
89
+
90
+ # Extract the requested file from the ZIP only once, and pass the same
91
+ # data to each signature that requires it.
92
+ with zip_.open(path) as id_file:
93
+ contents = id_file.read()
94
+ results.extend(self._process_puid_map(contents, puid_map))
95
+
96
+ return results
@@ -1,88 +1,112 @@
1
- #!python
1
+ #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
- # Format Identification for Digital Objects
4
3
 
5
- # MdR: 'reload(sys)' and 'setdefaultencoding("utf-8")' needed to fix utf-8 encoding errors
6
- # when converting from PRONOM to FIDO format
7
- import sys
8
- reload(sys)
9
- sys.setdefaultencoding("utf-8")
10
- import cStringIO, zipfile, os
4
+ """Format Identification for Digital Objects."""
5
+
6
+ from __future__ import print_function
7
+
8
+ from argparse import ArgumentParser
11
9
  import hashlib
12
- import urllib
10
+ import sys
11
+ from xml.dom import minidom
13
12
  from xml.etree import ElementTree as ET
14
- from xml.etree import ElementTree as VET # versions.xml
15
- # needed for debug
16
- # print_r: https://github.com/marcbelmont/python-print_r
17
- # from print_r import print_r
13
+ import zipfile
14
+
15
+ from six.moves import cStringIO
16
+ from six.moves.urllib.request import urlopen
17
+ from six.moves.urllib.parse import urlparse
18
+
19
+ from .pronomutils import get_local_pronom_versions
20
+
21
+
22
+ # \a\b\n\r\t\v
23
+ # MdR: took out '<' and '>' out of _ordinary because they were converted to entities &lt;&gt;
24
+ # MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
25
+ _ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
26
+ _special = '$()*+.?![]^\\{|}'
27
+ _hex = '0123456789abcdef'
28
+
18
29
 
19
30
  class NS:
20
- """Helper class for XML name spaces in ElementTree.
21
- Use like MYNS=NS("{http://some/uri}") and then
22
- MYNS(tag1/tag2).
23
31
  """
32
+ Helper class for XML name spaces in ElementTree.
33
+
34
+ Use like MYNS=NS("{http://some/uri}") and then MYNS(tag1/tag2).
35
+ """
36
+
24
37
  def __init__(self, uri):
38
+ """Instantiate class with `uri` argument."""
25
39
  self.uri = uri
40
+
26
41
  def __getattr__(self, tag):
42
+ """Append URI to the class attributes."""
27
43
  return self.uri + tag
44
+
28
45
  def __call__(self, path):
46
+ """Define behavior when the instant is used as a function."""
29
47
  return "/".join(getattr(self, tag) for tag in path.split("/"))
30
48
 
31
- # XHTML namespace
32
- XHTML = NS("{http://www.w3.org/1999/xhtml}")
33
- # TNA namespace
34
- TNA = NS("{http://pronom.nationalarchives.gov.uk}")
49
+
50
+ XHTML = NS("{http://www.w3.org/1999/xhtml}") # XHTML namespace
51
+ TNA = NS("{http://pronom.nationalarchives.gov.uk}") # TNA namespace
52
+
35
53
 
36
54
  def get_text_tna(element, tag, default=''):
37
- """Helper function to return the text for a tag or path using the TNA namespace.
38
- """
55
+ """Helper function to return the text for a tag or path using the TNA namespace."""
39
56
  part = element.find(TNA(tag))
40
- return part.text.strip() if part != None and part.text != None else default
57
+ if part is None or part.text is None:
58
+ return default
59
+ return part.text.strip()
60
+
41
61
 
42
62
  def prettify(elem):
43
- """Return a pretty-printed XML string for the Element.
44
- """
45
- from xml.dom import minidom
63
+ """Return a pretty-printed XML string for the Element."""
46
64
  rough_string = ET.tostring(elem, 'UTF-8')
47
65
  reparsed = minidom.parseString(rough_string)
48
66
  return reparsed.toprettyxml(indent=" ")
49
67
 
68
+
50
69
  class FormatInfo:
70
+ """Convert PRONOM formats into FIDO signatures."""
71
+
51
72
  def __init__(self, pronom_files, format_list=[]):
73
+ """Instantiate class, take a list of PRONOM files and an optional list of formats."""
52
74
  self.info = {}
53
75
  self.formats = []
54
76
  self.pronom_files = pronom_files
55
77
  for f in format_list:
56
- self.add_format(f)
57
-
58
- def save(self, dst):
59
- """Write the fido XML format definitions to @param dst
60
- """
61
- tree = ET.ElementTree(ET.Element('formats', {'version':'0.3',
62
- 'xmlns:xsi' : "http://www.w3.org/2001/XMLSchema-instance",
63
- 'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
64
- 'xmlns:dc': "http://purl.org/dc/elements/1.1/",
65
- 'xmlns:dcterms': "http://purl.org/dc/terms/"}))
78
+ self.add_format(f) # FIXME: add_format is undefined!
79
+
80
+ def save(self, dst=sys.stdout):
81
+ """Write the fido XML format definitions to @param dst."""
82
+ tree = ET.ElementTree(ET.Element('formats', {
83
+ 'version': '0.3',
84
+ 'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
85
+ 'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
86
+ 'xmlns:dc': "http://purl.org/dc/elements/1.1/",
87
+ 'xmlns:dcterms': "http://purl.org/dc/terms/"
88
+ }))
66
89
  root = tree.getroot()
67
90
  for f in self.formats:
68
91
  # MdR: this skipped puids without sig, but we want them ALL
69
92
  # because puid might be matched on extension
70
- #if f.find('signature'):
93
+ # if f.find('signature'):
71
94
  root.append(f)
72
95
  self.indent(root)
73
- with open(dst, 'wb') as out:
74
- #print >>out, ET.tostring(root,encoding='utf-8')
75
- print >>out, ET.tostring(root)
96
+ with open(dst, 'wb') as file_:
97
+ # print >>out, ET.tostring(root,encoding='utf-8')
98
+ print(ET.tostring(root), file=file_)
76
99
 
77
100
  def indent(self, elem, level=0):
78
- i = "\n" + level*" "
101
+ """Indent output."""
102
+ i = "\n" + level * " "
79
103
  if len(elem):
80
104
  if not elem.text or not elem.text.strip():
81
105
  elem.text = i + " "
82
106
  if not elem.tail or not elem.tail.strip():
83
107
  elem.tail = i
84
108
  for elem in elem:
85
- self.indent(elem, level+1)
109
+ self.indent(elem, level + 1)
86
110
  if not elem.tail or not elem.tail.strip():
87
111
  elem.tail = i
88
112
  else:
@@ -90,52 +114,57 @@ class FormatInfo:
90
114
  elem.tail = i
91
115
 
92
116
  def load_pronom_xml(self, puid_filter=None):
93
- """Load the pronom XML from self.pronom_files and convert it to fido XML.
94
- As a side-effect, set self.formats to a list of ElementTree.Element
95
- If a @param puid is specified, only that one will be loaded.
117
+ """
118
+ Load the pronom XML from self.pronom_files and convert it to fido XML.
119
+
120
+ As a side-effect, set self.formats to a list of ElementTree.Element.
121
+ If a @param puid is specified, only that one will be loaded.
96
122
  """
97
123
  formats = []
98
- #for p in self.pronom_files:
124
+ # for p in self.pronom_files:
99
125
  # print p
100
- #print self.pronom_files
101
- #exit()
126
+ # print self.pronom_files
127
+ # exit()
102
128
  try:
103
129
  zip = zipfile.ZipFile(self.pronom_files, 'r')
104
130
  for item in zip.infolist():
105
- #print item.filename
131
+ # print item.filename
106
132
  try:
107
133
  stream = zip.open(item)
108
134
  # Work is done here!
109
- #if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
110
- format = self.parse_pronom_xml(stream, puid_filter)
111
- if format != None:
112
- formats.append(format)
135
+ # if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
136
+ format_ = self.parse_pronom_xml(stream, puid_filter)
137
+ if format_ is not None:
138
+ formats.append(format_)
113
139
  finally:
114
140
  stream.close()
115
141
  finally:
116
142
  try:
117
143
  zip.close()
118
- except Exception, e:
119
- sys.stderr.write("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e))
144
+ except Exception as e:
145
+ print("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e), file=sys.stderr)
120
146
  sys.exit()
121
147
  # Replace the formatID with puids in has_priority_over
122
- id_map = {}
123
- for element in formats:
124
- puid = element.find('puid').text
125
- #print "working on puid:",puid
126
- pronom_id = element.find('pronom_id').text
127
- id_map[pronom_id] = puid
128
- for element in formats:
129
- for rel in element.findall('has_priority_over'):
130
- rel.text = id_map[rel.text]
148
+ if puid_filter is None:
149
+ id_map = {}
150
+ for element in formats:
151
+ puid = element.find('puid').text
152
+ # print "working on puid:",puid
153
+ pronom_id = element.find('pronom_id').text
154
+ id_map[pronom_id] = puid
155
+ for element in formats:
156
+ for rel in element.findall('has_priority_over'):
157
+ rel.text = id_map[rel.text]
131
158
 
132
159
  self._sort_formats(formats)
133
160
  self.formats = formats
134
-
161
+
135
162
  def parse_pronom_xml(self, source, puid_filter=None):
136
- """Read a pronom XML from @param source, convert it to fido XML and
137
- @return ET.ElementTree Element representing it.
138
- If a @param puid is specified, only that one will be loaded.
163
+ """
164
+ Parse PRONOM XML and convert into FIDO XML.
165
+
166
+ If a @param puid is specified, only that one will be loaded.
167
+ @return ET.ElementTree Element representing it.
139
168
  """
140
169
  pronom_xml = ET.parse(source)
141
170
  pronom_root = pronom_xml.getroot()
@@ -147,13 +176,13 @@ class FormatInfo:
147
176
  if type == 'PUID':
148
177
  puid = get_text_tna(id, 'Identifier')
149
178
  ET.SubElement(fido_format, 'puid').text = puid
150
- if puid_filter != None and puid != puid_filter:
179
+ if puid_filter and puid != puid_filter:
151
180
  return None
152
181
  # A bit clumsy. I want to have puid first, then mime, then container.
153
182
  for id in pronom_format.findall(TNA('FileFormatIdentifier')):
154
183
  type = get_text_tna(id, 'IdentifierType')
155
184
  if type == 'MIME':
156
- ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
185
+ ET.SubElement(fido_format, 'mime').text = get_text_tna(id, 'Identifier')
157
186
  elif type == 'PUID':
158
187
  puid = get_text_tna(id, 'Identifier')
159
188
  if puid == 'x-fmt/263':
@@ -170,7 +199,7 @@ class FormatInfo:
170
199
  for id in pronom_format.findall(TNA('FileFormatIdentifier')):
171
200
  type = get_text_tna(id, 'IdentifierType')
172
201
  if type == 'Apple Uniform Type Identifier':
173
- ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
202
+ ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
174
203
  # Handle the relationships
175
204
  for x in pronom_format.findall(TNA('RelatedFormat')):
176
205
  rel = get_text_tna(x, 'RelationshipType')
@@ -181,20 +210,20 @@ class FormatInfo:
181
210
  fido_sig = ET.SubElement(fido_format, 'signature')
182
211
  ET.SubElement(fido_sig, 'name').text = get_text_tna(pronom_sig, 'SignatureName')
183
212
  # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
184
- ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote').encode('UTF-8')
213
+ ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
185
214
  for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
186
215
  fido_pat = ET.SubElement(fido_sig, 'pattern')
187
216
  pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
188
217
  bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
189
218
  offset = get_text_tna(pronom_pat, 'Offset')
190
219
  max_offset = get_text_tna(pronom_pat, 'MaxOffset')
191
- if max_offset == None:
220
+ if not max_offset:
192
221
  pass
193
- #print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
222
+ # print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
194
223
  regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
195
- #print "done puid", puid
224
+ # print "done puid", puid
196
225
  if regex == "__INCOMPATIBLE_SIG__":
197
- print >> sys.stderr, "Error: incompatible PRONOM signature found for puid", puid, ", skipping..."
226
+ print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
198
227
  # remove the empty 'signature' nodes
199
228
  # now that the signature is not compatible and thus "regex" is empty
200
229
  remove = fido_format.findall('signature')
@@ -205,8 +234,8 @@ class FormatInfo:
205
234
  ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
206
235
  ET.SubElement(fido_pat, 'regex').text = regex
207
236
  # Get the format details
208
- fido_details = ET.SubElement(fido_format,'details')
209
- ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription').encode('utf8')
237
+ fido_details = ET.SubElement(fido_format, 'details')
238
+ ET.SubElement(fido_details, 'dc:description').text = get_text_tna(pronom_format, 'FormatDescription')
210
239
  ET.SubElement(fido_details, 'dcterms:available').text = get_text_tna(pronom_format, 'ReleaseDate')
211
240
  ET.SubElement(fido_details, 'dc:creator').text = get_text_tna(pronom_format, 'Developers/DeveloperCompoundName')
212
241
  ET.SubElement(fido_details, 'dcterms:publisher').text = get_text_tna(pronom_format, 'Developers/OrganisationName')
@@ -221,7 +250,7 @@ class FormatInfo:
221
250
  ET.SubElement(fido_details, 'content_type').text = get_text_tna(pronom_format, 'FormatTypes')
222
251
  # References
223
252
  for x in pronom_format.findall(TNA("Document")):
224
- r = ET.SubElement(fido_details,'reference')
253
+ r = ET.SubElement(fido_details, 'reference')
225
254
  ET.SubElement(r, 'dc:title').text = get_text_tna(x, 'TitleText')
226
255
  ET.SubElement(r, 'dc:creator').text = get_text_tna(x, 'Author/AuthorCompoundName')
227
256
  ET.SubElement(r, 'dc:publisher').text = get_text_tna(x, 'Publisher/PublisherCompoundName')
@@ -229,51 +258,53 @@ class FormatInfo:
229
258
  for id in x.findall(TNA('DocumentIdentifier')):
230
259
  type = get_text_tna(id, 'IdentifierType')
231
260
  if type == 'URL':
232
- ET.SubElement(r, 'dc:identifier').text = "http://"+get_text_tna(id, 'Identifier')
261
+ ET.SubElement(r, 'dc:identifier').text = "http://" + get_text_tna(id, 'Identifier')
233
262
  else:
234
- ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
263
+ ET.SubElement(r, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
235
264
  ET.SubElement(r, 'dc:description').text = get_text_tna(x, 'DocumentNote')
236
265
  ET.SubElement(r, 'dc:type').text = get_text_tna(x, 'DocumentType')
237
- ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription')+" "+get_text_tna(x, 'AvailabilityNote')
266
+ ET.SubElement(r, 'dcterms:license').text = get_text_tna(x, 'AvailabilityDescription') + " " + get_text_tna(x, 'AvailabilityNote')
238
267
  ET.SubElement(r, 'dc:rights').text = get_text_tna(x, 'DocumentIPR')
239
- # Examples
268
+ # Examples
240
269
  for x in pronom_format.findall(TNA("ReferenceFile")):
241
- rf = ET.SubElement(fido_details,'example_file')
270
+ rf = ET.SubElement(fido_details, 'example_file')
242
271
  ET.SubElement(rf, 'dc:title').text = get_text_tna(x, 'ReferenceFileName')
243
272
  ET.SubElement(rf, 'dc:description').text = get_text_tna(x, 'ReferenceFileDescription')
244
273
  checksum = ""
245
274
  for id in x.findall(TNA('ReferenceFileIdentifier')):
246
275
  type = get_text_tna(id, 'IdentifierType')
247
276
  if type == 'URL':
248
- url = "http://"+get_text_tna(id, 'Identifier')
249
- ET.SubElement(rf, 'dc:identifier').text = url
277
+ # Starting with PRONOM 89, some URLs contain http://
278
+ # and others do not.
279
+ url = get_text_tna(id, 'Identifier')
280
+ if not urlparse(url).scheme:
281
+ url = "http://" + url
282
+ ET.SubElement(rf, 'dc:identifier').text = url
250
283
  # And calculate the checksum of this resource:
251
284
  m = hashlib.md5()
252
- sock = urllib.urlopen(url)
285
+ sock = urlopen(url)
253
286
  m.update(sock.read())
254
287
  sock.close()
255
- checksum=m.hexdigest()
288
+ checksum = m.hexdigest()
256
289
  else:
257
- ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType')+":"+get_text_tna(id, 'Identifier')
290
+ ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
258
291
  ET.SubElement(rf, 'dcterms:license').text = ""
259
292
  ET.SubElement(rf, 'dc:rights').text = get_text_tna(x, 'ReferenceFileIPR')
260
293
  checksumElement = ET.SubElement(rf, 'checksum')
261
294
  checksumElement.text = checksum
262
295
  checksumElement.attrib['type'] = "md5"
263
296
  # Record Metadata
264
- md = ET.SubElement(fido_details,'record_metadata')
265
- ET.SubElement(md, 'status').text ='unknown'
297
+ md = ET.SubElement(fido_details, 'record_metadata')
298
+ ET.SubElement(md, 'status').text = 'unknown'
266
299
  ET.SubElement(md, 'dc:creator').text = get_text_tna(pronom_format, 'ProvenanceName')
267
300
  ET.SubElement(md, 'dcterms:created').text = get_text_tna(pronom_format, 'ProvenanceSourceDate')
268
301
  ET.SubElement(md, 'dcterms:modified').text = get_text_tna(pronom_format, 'LastUpdatedDate')
269
- ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription').encode('utf8')
270
- return fido_format
271
-
272
- #FIXME: I don't think that this quite works yet!
302
+ ET.SubElement(md, 'dc:description').text = get_text_tna(pronom_format, 'ProvenanceDescription')
303
+ return fido_format
304
+
305
+ # FIXME: I don't think that this quite works yet!
273
306
  def _sort_formats(self, formatlist):
274
- """Sort the format list based on their priority relationships so higher priority
275
- formats appear earlier in the list.
276
- """
307
+ """Sort the format list based on their priority relationships so higher priority formats appear earlier in the list."""
277
308
  def compare_formats(f1, f2):
278
309
  f1ID = f1.find('puid').text
279
310
  f2ID = f2.find('puid').text
@@ -291,9 +322,9 @@ class FormatInfo:
291
322
  return 1
292
323
  return sorted(formatlist, cmp=compare_formats)
293
324
 
325
+
294
326
  def fido_position(pronom_position):
295
- """@return BOF/EOF/VAR instead of the more verbose pronom position names.
296
- """
327
+ """Return BOF/EOF/VAR instead of the more verbose pronom position names."""
297
328
  if pronom_position == 'Absolute from BOF':
298
329
  return 'BOF'
299
330
  elif pronom_position == 'Absolute from EOF':
@@ -302,16 +333,20 @@ def fido_position(pronom_position):
302
333
  return 'VAR'
303
334
  elif pronom_position == 'Indirect From BOF':
304
335
  return 'IFB'
305
- else: # to make sure FIDO does not crash (IFB aftermath)
306
- sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
336
+ else: # to make sure FIDO does not crash (IFB aftermath)
337
+ sys.stderr.write("Unknown pronom PositionType:" + pronom_position)
307
338
  return 'VAR'
308
339
 
340
+
309
341
  def _convert_err_msg(msg, c, i, chars):
310
342
  return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
311
343
 
344
+
312
345
  def doByte(chars, i, littleendian):
313
- """Convert two chars[i] and chars[i+1] into a byte.
314
- @return a tuple (byte, 2)
346
+ """
347
+ Convert two chars[i] and chars[i+1] into a byte.
348
+
349
+ @return a tuple (byte, 2)
315
350
  """
316
351
  c1 = '0123456789ABCDEF'.find(chars[i].upper())
317
352
  c2 = '0123456789ABCDEF'.find(chars[i + 1].upper())
@@ -323,12 +358,7 @@ def doByte(chars, i, littleendian):
323
358
  val = chr(c1 + 16 * c2)
324
359
  return (escape(val), 2)
325
360
 
326
- # \a\b\n\r\t\v
327
- # MdR: took out '<' and '>' out of _ordinary because they were converted to entities &lt;&gt;
328
- # MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
329
- _ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
330
- _special = '$()*+.?![]^\\{|}'
331
- _hex = '0123456789abcdef'
361
+
332
362
  def _escape_char(c):
333
363
  if c in '\n':
334
364
  return '\\n'
@@ -340,68 +370,66 @@ def _escape_char(c):
340
370
  (high, low) = divmod(ord(c), 16)
341
371
  return '\\x' + _hex[high] + _hex[low]
342
372
 
373
+
343
374
  def escape(string):
344
- "Escape characters in pattern that are non-printable, non-ascii, or special for regexes."
375
+ """Escape characters in pattern that are non-printable, non-ascii, or special for regexes."""
345
376
  return ''.join(c if c in _ordinary else _escape_char(c) for c in string)
346
377
 
378
+
347
379
  def calculate_repetition(char, pos, offset, maxoffset):
348
- """
349
- Recursively calculates offset/maxoffset repetition,
350
- when one or both offsets is greater than 65535 bytes (64KB)
351
- see: bugs.python.org/issue13169
352
- Otherwise it returns the {offset,maxoffset}
353
- """
354
- calcbuf = cStringIO.StringIO()
355
-
380
+ """Recursively calculates offset/maxoffset repetition, when one or both offsets is greater than 65535 bytes (64KB). See: https://bugs.python.org/issue13169."""
381
+ calcbuf = cStringIO()
382
+
356
383
  calcremain = False
357
384
  offsetremain = 0
358
385
  maxoffsetremain = 0
359
-
360
- if offset != None and offset != '':
361
- if int(offset) > 65535:
362
- offsetremain = str(int(offset) - 65535)
363
- offset = '65535'
364
- calcremain = True
365
- if maxoffset != None and maxoffset != '':
366
- if int(maxoffset) > 65535:
367
- maxoffsetremain = str(int(maxoffset) - 65535)
368
- maxoffset = '65535'
369
- calcremain = True
370
-
386
+
387
+ if offset is not None and int(offset) > 65535:
388
+ offsetremain = str(int(offset) - 65535)
389
+ offset = '65535'
390
+ calcremain = True
391
+ if maxoffset is not None and int(maxoffset) > 65535:
392
+ maxoffsetremain = str(int(maxoffset) - 65535)
393
+ maxoffset = '65535'
394
+ calcremain = True
395
+
371
396
  if pos == "BOF" or pos == "EOF":
372
397
  if offset != '0':
373
398
  calcbuf.write(char + '{' + str(offset))
374
- if maxoffset != None:
399
+ if maxoffset is not None:
375
400
  calcbuf.write(',' + maxoffset)
376
401
  calcbuf.write('}')
377
- elif maxoffset != None:
402
+ elif maxoffset is not None:
378
403
  calcbuf.write(char + '{0,' + maxoffset + '}')
379
404
 
380
405
  if pos == "IFB":
381
406
  if offset != '0':
382
407
  calcbuf.write(char + '{' + str(offset))
383
- if maxoffset != None:
408
+ if maxoffset is not None:
384
409
  calcbuf.write(',' + maxoffset)
385
410
  calcbuf.write('}')
386
- if maxoffset == None:
411
+ if maxoffset is not None:
387
412
  calcbuf.write(',}')
388
- elif maxoffset != None:
413
+ elif maxoffset is not None:
389
414
  calcbuf.write(char + '{0,' + maxoffset + '}')
390
415
 
391
- if calcremain: # recursion happens here
416
+ if calcremain: # recursion happens here
392
417
  calcbuf.write(calculate_repetition(char, pos, offsetremain, maxoffsetremain))
393
-
418
+
394
419
  val = calcbuf.getvalue()
395
420
  calcbuf.close()
396
421
  return val
397
422
 
423
+
398
424
  def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
399
- """Convert
400
- @param chars, a pronom bytesequence, into a
401
- @return regular expression.
402
- Endianness is not used.
403
425
  """
426
+ Convert to regular expression.
427
+
428
+ Endianness is not used.
404
429
 
430
+ @param chars, a pronom bytesequence, into a
431
+ @return regular expression.
432
+ """
405
433
  if 'Big' in endianness:
406
434
  littleendian = False
407
435
  else:
@@ -410,24 +438,26 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
410
438
  offset = '0'
411
439
  if len(maxoffset) == 0:
412
440
  maxoffset = None
441
+ if maxoffset == '0':
442
+ maxoffset = None
413
443
  # make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
414
444
  global buf
415
- buf = cStringIO.StringIO()
416
- buf.write("(?s)") #If a regex starts with (?s), it is equivalent to DOTALL.
445
+ buf = cStringIO()
446
+ buf.write("(?s)") # If a regex starts with (?s), it is equivalent to DOTALL.
417
447
  i = 0
418
448
  state = 'start'
419
449
  if 'BOF' in pos:
420
- buf.write('\\A') # start of regex
450
+ buf.write('\\A') # start of regex
421
451
  buf.write(calculate_repetition('.', pos, offset, maxoffset))
422
-
452
+
423
453
  if 'IFB' in pos:
424
454
  buf.write('\\A')
425
455
  buf.write(calculate_repetition('.', pos, offset, maxoffset))
426
-
456
+
427
457
  while True:
428
458
  if i == len(chars):
429
459
  break
430
- #print _convert_err_msg(state,chars[i],i,chars)
460
+ # print _convert_err_msg(state,chars[i],i,chars)
431
461
  if state == 'start':
432
462
  if chars[i].isalnum():
433
463
  state = 'bytes'
@@ -471,7 +501,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
471
501
  (byt, inc) = doByte(chars, i, littleendian)
472
502
  buf.write(byt)
473
503
  i += inc
474
- #assert(chars[i] == ':')
504
+ # assert(chars[i] == ':')
475
505
  if chars[i] != ':':
476
506
  return "__INCOMPATIBLE_SIG__"
477
507
  buf.write('-')
@@ -479,13 +509,13 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
479
509
  (byt, inc) = doByte(chars, i, littleendian)
480
510
  buf.write(byt)
481
511
  i += inc
482
- #assert(chars[i] == ']')
512
+ # assert(chars[i] == ']')
483
513
  if chars[i] != ']':
484
514
  return "__INCOMPATIBLE_SIG__"
485
515
  buf.write(']')
486
516
  i += 1
487
517
  except Exception:
488
- print _convert_err_msg('Illegal character in bracket', chars[i], i, chars)
518
+ print(_convert_err_msg('Illegal character in bracket', chars[i], i, chars))
489
519
  raise
490
520
  if i < len(chars) and chars[i] == '{':
491
521
  state = 'curly-after-bracket'
@@ -511,7 +541,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
511
541
  (byt, inc) = doByte(chars, i, littleendian)
512
542
  buf.write(byt)
513
543
  i += inc
514
- #assert(chars[i] == ':')
544
+ # assert(chars[i] == ':')
515
545
  if chars[i] != ':':
516
546
  return "__INCOMPATIBLE_SIG__"
517
547
  buf.write('-')
@@ -519,8 +549,8 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
519
549
  (byt, inc) = doByte(chars, i, littleendian)
520
550
  buf.write(byt)
521
551
  i += inc
522
-
523
- #assert(chars[i] == ']')
552
+
553
+ # assert(chars[i] == ']')
524
554
  if chars[i] != ']':
525
555
  return "__INCOMPATIBLE_SIG__"
526
556
  buf.write(']')
@@ -537,7 +567,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
537
567
  # when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
538
568
  # The above, while sensible, appears to be incorrect. A '.' is always needed.
539
569
  # for droid equiv behavior
540
- #if state == 'curly':
570
+ # if state == 'curly':
541
571
  buf.write('.')
542
572
  buf.write('{')
543
573
  i += 1 # skip the (
@@ -548,7 +578,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
548
578
  elif chars[i] == '-':
549
579
  buf.write(',')
550
580
  i += 1
551
- elif chars[i] == '*': # skip the *
581
+ elif chars[i] == '*': # skip the *
552
582
  i += 1
553
583
  elif chars[i] == '}':
554
584
  break
@@ -581,36 +611,35 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
581
611
  buf.close()
582
612
  return val
583
613
 
584
- def main(arg=None):
585
- import sys
586
- from argparselocal import ArgumentParser
587
- if arg != None:
588
- arglist = arg
589
- else:
590
- arglist = sys.argv[1:]
591
- # print arglist
592
- # exit()
593
- mydir = os.path.abspath(os.path.dirname(__file__))
594
- # parse version file to fetch versions
595
- versionsFile = os.path.join(mydir, 'conf', 'versions.xml')
596
- try:
597
- versions = VET.parse(versionsFile)
598
- except Exception, e:
599
- sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
600
- sys.exit()
601
- xml_pronomSignature = os.path.join(mydir, 'conf', versions.find('pronomSignature').text)
602
- xml_pronomZipFile = os.path.join(mydir, 'conf', "pronom-xml-v{0}.zip".format(versions.find('pronomVersion').text))
603
- parser = ArgumentParser(description='Produce the fido format xml that is loaded at run-time')
604
- parser.add_argument('-input', default=xml_pronomZipFile, help='input file, a zip containing Pronom xml files')
605
- parser.add_argument('-output', default=xml_pronomSignature, help='output file')
606
- parser.add_argument('-puid', default=None, help='a particular PUID record to extract')
607
- # PROCESS ARGUMENTS
608
- args = parser.parse_args(arglist)
609
- # print os.path.abspath(args.input), os.path.abspath(args.output)
610
- info = FormatInfo(args.input)
611
- info.load_pronom_xml(args.puid)
612
- info.save(args.output)
613
- print >> sys.stderr, 'Converted {0} PRONOM formats to FIDO signatures'.format(len(info.formats))
614
-
614
+
615
+ def run(input=None, output=None, puid=None):
616
+ """Convert PRONOM formats into FIDO signatures."""
617
+ versions = get_local_pronom_versions()
618
+
619
+ if input is None:
620
+ input = versions.get_zip_file()
621
+ if output is None:
622
+ output = versions.get_signature_file()
623
+
624
+ info = FormatInfo(input)
625
+ info.load_pronom_xml(puid)
626
+ info.save(output)
627
+ print('Converted {0} PRONOM formats to FIDO signatures'.format(len(info.formats)), file=sys.stderr)
628
+
629
+
630
+ def main(args=None):
631
+ """Main CLI entrypoint."""
632
+ if args is None:
633
+ args = sys.argv[1:]
634
+
635
+ parser = ArgumentParser(description='Produce the FIDO format XML that is loaded at run-time')
636
+ parser.add_argument('-input', default=None, help='Input file, a Zip containing PRONOM XML files')
637
+ parser.add_argument('-output', default=None, help='Ouptut file')
638
+ parser.add_argument('-puid', default=None, help='A particular PUID record to extract')
639
+ args = parser.parse_args(args)
640
+
641
+ run(input=args.input, output=args.output, puid=args.puid)
642
+
643
+
615
644
  if __name__ == '__main__':
616
- main()
645
+ main()