libis-format 0.9.5-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +2 -0
- data/.gitignore +18 -0
- data/.travis.yml +41 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +39 -0
- data/Rakefile +8 -0
- data/bin/droid +15 -0
- data/bin/fido +12 -0
- data/bin/pdf_copy +13 -0
- data/data/ISOcoated_v2_eci.icc +0 -0
- data/data/PDFA_def.ps +40 -0
- data/data/ead.xsd +2728 -0
- data/data/eciRGB_v2.icc +0 -0
- data/data/lias_formats.xml +106 -0
- data/data/types.yml +217 -0
- data/lib/libis/format/config.rb +35 -0
- data/lib/libis/format/converter/base.rb +101 -0
- data/lib/libis/format/converter/chain.rb +167 -0
- data/lib/libis/format/converter/image_converter.rb +214 -0
- data/lib/libis/format/converter/office_converter.rb +50 -0
- data/lib/libis/format/converter/pdf_converter.rb +139 -0
- data/lib/libis/format/converter/repository.rb +98 -0
- data/lib/libis/format/converter.rb +11 -0
- data/lib/libis/format/droid.rb +45 -0
- data/lib/libis/format/fido.rb +102 -0
- data/lib/libis/format/identifier.rb +189 -0
- data/lib/libis/format/office_to_pdf.rb +52 -0
- data/lib/libis/format/pdf_copy.rb +40 -0
- data/lib/libis/format/pdf_merge.rb +41 -0
- data/lib/libis/format/pdf_split.rb +39 -0
- data/lib/libis/format/pdf_to_pdfa.rb +76 -0
- data/lib/libis/format/pdfa_validator.rb +61 -0
- data/lib/libis/format/type_database.rb +170 -0
- data/lib/libis/format/version.rb +5 -0
- data/lib/libis/format.rb +23 -0
- data/lib/libis-format.rb +1 -0
- data/libis-format.gemspec +34 -0
- data/spec/converter_spec.rb +212 -0
- data/spec/data/Cevennes2.bmp +0 -0
- data/spec/data/Cevennes2.jp2 +0 -0
- data/spec/data/Cevennes2.ppm +22492 -0
- data/spec/data/test-ead.xml +392 -0
- data/spec/data/test-jpg.tif +0 -0
- data/spec/data/test-lzw.tif +0 -0
- data/spec/data/test-options.jpg +0 -0
- data/spec/data/test.bmp +0 -0
- data/spec/data/test.doc +0 -0
- data/spec/data/test.docx +0 -0
- data/spec/data/test.gif +0 -0
- data/spec/data/test.jpg +0 -0
- data/spec/data/test.ods +0 -0
- data/spec/data/test.odt +0 -0
- data/spec/data/test.pdf +0 -0
- data/spec/data/test.pdf.tif +0 -0
- data/spec/data/test.png +0 -0
- data/spec/data/test.ps +8631 -0
- data/spec/data/test.psd +0 -0
- data/spec/data/test.rtf +1455 -0
- data/spec/data/test.tif +0 -0
- data/spec/data/test.txt +12 -0
- data/spec/data/test.xcf +0 -0
- data/spec/data/test.xls +0 -0
- data/spec/data/test.xlsx +0 -0
- data/spec/data/test.xml +4 -0
- data/spec/data/test_pdfa.pdf +0 -0
- data/spec/identifier_spec.rb +60 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/test_types.yml +12 -0
- data/spec/type_database_spec.rb +140 -0
- data/tools/PdfTool.jar +0 -0
- data/tools/bcpkix-jdk15on-1.49.jar +0 -0
- data/tools/bcprov-jdk15on-1.49.jar +0 -0
- data/tools/droid/DROID_SignatureFile_V82.xml +32681 -0
- data/tools/droid/container-signature-20150307.xml +2235 -0
- data/tools/droid/droid-command-line-6.1.5.jar +0 -0
- data/tools/droid/droid.bat +154 -0
- data/tools/droid/droid.sh +138 -0
- data/tools/droid/lib/XmlSchema-1.4.7.jar +0 -0
- data/tools/droid/lib/activation-1.1.jar +0 -0
- data/tools/droid/lib/antlr-2.7.7.jar +0 -0
- data/tools/droid/lib/antlr-3.2.jar +0 -0
- data/tools/droid/lib/antlr-runtime-3.2.jar +0 -0
- data/tools/droid/lib/aopalliance-1.0.jar +0 -0
- data/tools/droid/lib/asm-2.2.3.jar +0 -0
- data/tools/droid/lib/aspectjrt-1.7.2.jar +0 -0
- data/tools/droid/lib/aspectjweaver-1.7.2.jar +0 -0
- data/tools/droid/lib/bcmail-jdk14-138.jar +0 -0
- data/tools/droid/lib/bcprov-jdk14-138.jar +0 -0
- data/tools/droid/lib/beansbinding-1.2.1.jar +0 -0
- data/tools/droid/lib/byteseek-1.1.1.jar +0 -0
- data/tools/droid/lib/cglib-nodep-2.2.2.jar +0 -0
- data/tools/droid/lib/classmate-1.0.0.jar +0 -0
- data/tools/droid/lib/commons-cli-1.2.jar +0 -0
- data/tools/droid/lib/commons-codec-1.4.jar +0 -0
- data/tools/droid/lib/commons-collections-3.2.1.jar +0 -0
- data/tools/droid/lib/commons-compress-1.4.1.jar +0 -0
- data/tools/droid/lib/commons-configuration-1.8.jar +0 -0
- data/tools/droid/lib/commons-dbcp-1.4.jar +0 -0
- data/tools/droid/lib/commons-httpclient-3.1.jar +0 -0
- data/tools/droid/lib/commons-io-2.4.jar +0 -0
- data/tools/droid/lib/commons-lang-2.6.jar +0 -0
- data/tools/droid/lib/commons-logging-1.1.1.jar +0 -0
- data/tools/droid/lib/commons-pool-1.5.4.jar +0 -0
- data/tools/droid/lib/cxf-api-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-schemas-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-common-utilities-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-soap-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-bindings-xml-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-core-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-databinding-jaxb-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-jaxws-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-frontend-simple-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-transports-http-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-rt-ws-addr-2.2.12.jar +0 -0
- data/tools/droid/lib/cxf-tools-common-2.2.12.jar +0 -0
- data/tools/droid/lib/de.huxhorn.lilith.3rdparty.flyingsaucer.core-renderer-8RC1.jar +0 -0
- data/tools/droid/lib/derby-10.10.2.0.jar +0 -0
- data/tools/droid/lib/dom4j-1.6.1.jar +0 -0
- data/tools/droid/lib/droid-container-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-core-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-export-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-help-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-report-interfaces-6.1.5.jar +0 -0
- data/tools/droid/lib/droid-results-6.1.5.jar +0 -0
- data/tools/droid/lib/ejb3-persistence-1.0.2.GA.jar +0 -0
- data/tools/droid/lib/geronimo-activation_1.1_spec-1.0.2.jar +0 -0
- data/tools/droid/lib/geronimo-annotation_1.0_spec-1.1.1.jar +0 -0
- data/tools/droid/lib/geronimo-javamail_1.4_spec-1.6.jar +0 -0
- data/tools/droid/lib/geronimo-jaxws_2.1_spec-1.0.jar +0 -0
- data/tools/droid/lib/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/tools/droid/lib/geronimo-ws-metadata_2.0_spec-1.1.2.jar +0 -0
- data/tools/droid/lib/hibernate-commons-annotations-4.0.4.Final.jar +0 -0
- data/tools/droid/lib/hibernate-core-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-entitymanager-4.3.5.Final.jar +0 -0
- data/tools/droid/lib/hibernate-jpa-2.1-api-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/hibernate-validator-5.1.0.Final.jar +0 -0
- data/tools/droid/lib/itext-2.0.8.jar +0 -0
- data/tools/droid/lib/jandex-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/javahelp-2.0.05.jar +0 -0
- data/tools/droid/lib/javassist-3.18.1-GA.jar +0 -0
- data/tools/droid/lib/jaxb-api-2.1.jar +0 -0
- data/tools/droid/lib/jaxb-impl-2.1.13.jar +0 -0
- data/tools/droid/lib/jboss-logging-3.1.3.GA.jar +0 -0
- data/tools/droid/lib/jboss-logging-annotations-1.2.0.Beta1.jar +0 -0
- data/tools/droid/lib/jboss-transaction-api_1.2_spec-1.0.0.Final.jar +0 -0
- data/tools/droid/lib/joda-time-1.6.2.jar +0 -0
- data/tools/droid/lib/jra-1.0-alpha-4.jar +0 -0
- data/tools/droid/lib/jta-1.1.jar +0 -0
- data/tools/droid/lib/log4j-1.2.13.jar +0 -0
- data/tools/droid/lib/neethi-2.0.4.jar +0 -0
- data/tools/droid/lib/opencsv-2.3.jar +0 -0
- data/tools/droid/lib/org-netbeans-swing-outline-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-7.2.jar +0 -0
- data/tools/droid/lib/org-openide-util-lookup-7.2.jar +0 -0
- data/tools/droid/lib/poi-3.7.jar +0 -0
- data/tools/droid/lib/saaj-api-1.3.jar +0 -0
- data/tools/droid/lib/saaj-impl-1.3.2.jar +0 -0
- data/tools/droid/lib/slf4j-api-1.4.2.jar +0 -0
- data/tools/droid/lib/slf4j-log4j12-1.4.2.jar +0 -0
- data/tools/droid/lib/spring-aop-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-beans-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-context-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-core-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-expression-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-jdbc-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-orm-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-tx-4.0.3.RELEASE.jar +0 -0
- data/tools/droid/lib/spring-web-2.5.6.jar +0 -0
- data/tools/droid/lib/stax-api-1.0-2.jar +0 -0
- data/tools/droid/lib/stringtemplate-3.2.jar +0 -0
- data/tools/droid/lib/truezip-6.8.4.jar +0 -0
- data/tools/droid/lib/validation-api-1.1.0.Final.jar +0 -0
- data/tools/droid/lib/wsdl4j-1.6.2.jar +0 -0
- data/tools/droid/lib/wstx-asl-3.2.9.jar +0 -0
- data/tools/droid/lib/xercesImpl-2.9.1.jar +0 -0
- data/tools/droid/lib/xml-apis-1.3.04.jar +0 -0
- data/tools/droid/lib/xml-resolver-1.2.jar +0 -0
- data/tools/droid/lib/xz-1.0.jar +0 -0
- data/tools/fido/__init__.py +0 -0
- data/tools/fido/argparselocal.py +2355 -0
- data/tools/fido/conf/DROID_SignatureFile-v81.xml +2 -0
- data/tools/fido/conf/container-signature-20150307.xml +2238 -0
- data/tools/fido/conf/dc.xsd +119 -0
- data/tools/fido/conf/dcmitype.xsd +53 -0
- data/tools/fido/conf/dcterms.xsd +383 -0
- data/tools/fido/conf/fido-formats.xsd +173 -0
- data/tools/fido/conf/format_extension_template.xml +105 -0
- data/tools/fido/conf/format_extensions.xml +498 -0
- data/tools/fido/conf/formats-v81.xml +38355 -0
- data/tools/fido/conf/pronom-xml-v81.zip +0 -0
- data/tools/fido/conf/versions.xml +8 -0
- data/tools/fido/fido.bat +4 -0
- data/tools/fido/fido.py +854 -0
- data/tools/fido/fido.sh +5 -0
- data/tools/fido/prepare.py +616 -0
- data/tools/fido/pronomutils.py +115 -0
- data/tools/fido/toxml.py +52 -0
- data/tools/fido/update_signatures.py +171 -0
- data/tools/pdfbox/pdfbox-app-1.8.10.jar +0 -0
- data/tools/pdfbox/preflight-app-1.8.10.jar +0 -0
- metadata +396 -0
data/tools/fido/prepare.py
ADDED
|
@@ -0,0 +1,616 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Format Identification for Digital Objects
|
|
4
|
+
|
|
5
|
+
# MdR: 'reload(sys)' and 'setdefaultencoding("utf-8")' needed to fix utf-8 encoding errors
|
|
6
|
+
# when converting from PRONOM to FIDO format
|
|
7
|
+
import sys
|
|
8
|
+
reload(sys)
|
|
9
|
+
sys.setdefaultencoding("utf-8")
|
|
10
|
+
import cStringIO, zipfile, os
|
|
11
|
+
import hashlib
|
|
12
|
+
import urllib
|
|
13
|
+
from xml.etree import ElementTree as ET
|
|
14
|
+
from xml.etree import ElementTree as VET # versions.xml
|
|
15
|
+
# needed for debug
|
|
16
|
+
# print_r: https://github.com/marcbelmont/python-print_r
|
|
17
|
+
# from print_r import print_r
|
|
18
|
+
|
|
19
|
+
class NS(object):
    """Helper for building namespace-qualified ElementTree tag names.

    Create one instance per namespace, e.g. ``MYNS = NS("{http://some/uri}")``.
    ``MYNS.tag`` then evaluates to ``"{http://some/uri}tag"`` and
    ``MYNS("tag1/tag2")`` qualifies every step of a '/'-separated path.
    """

    def __init__(self, uri):
        # uri is the namespace prefix in ElementTree notation, braces included.
        self.uri = uri

    def __getattr__(self, tag):
        # __getattr__ only runs for names that normal lookup did not find.
        # Special-method probes (copy, pickle, hasattr(obj, '__x__')) and a
        # not-yet-assigned 'uri' must fail like ordinary attribute misses;
        # answering them would hand back a bogus tag string or recurse forever.
        if tag == 'uri' or (tag.startswith('__') and tag.endswith('__')):
            raise AttributeError(tag)
        return self.uri + tag

    def __call__(self, path):
        # Qualify each step directly instead of going through getattr(), so a
        # step that happens to match a real attribute name (e.g. 'uri') is
        # still treated as a tag.
        return "/".join(self.uri + step for step in path.split("/"))


# XHTML namespace
XHTML = NS("{http://www.w3.org/1999/xhtml}")
# TNA (PRONOM) namespace
TNA = NS("{http://pronom.nationalarchives.gov.uk}")
|
|
35
|
+
|
|
36
|
+
def get_text_tna(element, tag, default=''):
    """Return the stripped text of the first descendant matching a TNA tag.

    @param element ElementTree element to search in.
    @param tag     tag name or '/'-separated path; every step is qualified
                   with the TNA namespace before the lookup.
    @param default value returned when nothing matches or the match has no text.
    """
    part = element.find(TNA(tag))
    # Test against None by identity: an Element without children is falsy, so
    # a plain truth test would wrongly treat a found leaf element as missing.
    if part is None or part.text is None:
        return default
    return part.text.strip()
|
|
41
|
+
|
|
42
|
+
def prettify(elem):
    """Serialize *elem* and return it as an indented XML document string.

    The element is first dumped as UTF-8 by ElementTree and then re-read by
    minidom, whose pretty-printer produces the final text.
    """
    from xml.dom import minidom

    serialized = ET.tostring(elem, 'UTF-8')
    document = minidom.parseString(serialized)
    pretty = document.toprettyxml(indent=" ")
    return pretty
|
|
49
|
+
|
|
50
|
+
class FormatInfo:
    """Convert PRONOM format reports into FIDO format definitions.

    The PRONOM reports are read from the zip archive named by ``pronom_files``;
    every report becomes one ``<format>`` element kept in ``self.formats``,
    which ``save()`` writes out as a FIDO formats XML file.
    """

    def __init__(self, pronom_files, format_list=None):
        """@param pronom_files path of the zip archive holding the PRONOM reports.
        @param format_list  optional formats to register immediately.
        """
        self.info = {}
        self.formats = []
        self.pronom_files = pronom_files
        # A None default avoids sharing one list object between all instances.
        # NOTE(review): add_format() is not defined in the part of the class
        # visible here, so a non-empty format_list presumably fails -- confirm.
        for f in (format_list or []):
            self.add_format(f)

    def save(self, dst):
        """Write the fido XML format definitions to @param dst
        """
        root = ET.Element('formats', {
            'version': '0.3',
            'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:noNamespaceSchemaLocation': "fido-formats.xsd",
            'xmlns:dc': "http://purl.org/dc/elements/1.1/",
            'xmlns:dcterms': "http://purl.org/dc/terms/"})
        # MdR: formats without a signature are written too, because a puid
        # might still be matched on extension.
        for f in self.formats:
            root.append(f)
        self.indent(root)
        with open(dst, 'wb') as out:
            out.write(ET.tostring(root))
            out.write(b"\n")

    def indent(self, elem, level=0):
        """Add whitespace to *elem* and its descendants, in place, so that the
        serialized tree is indented by nesting *level*.
        """
        i = "\n" + level * " "
        children = list(elem)
        if children:
            if not elem.text or not elem.text.strip():
                elem.text = i + " "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            for child in children:
                self.indent(child, level + 1)
            # The last child's tail sits right before the parent's closing
            # tag, so it is pulled back to the parent's indentation.
            last = children[-1]
            if not last.tail or not last.tail.strip():
                last.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i

    def load_pronom_xml(self, puid_filter=None):
        """Load the pronom XML from self.pronom_files and convert it to fido XML.
        As a side-effect, set self.formats to a list of ElementTree.Element
        If a @param puid_filter is specified, only that puid will be loaded.
        Exits the process (status 1) when the archive cannot be opened.
        """
        formats = []
        try:
            archive = zipfile.ZipFile(self.pronom_files, 'r')
        except Exception as e:
            # Report the real cause and signal failure through the exit status.
            sys.stderr.write("An error occured loading '{0}' (exception: {1})".format(self.pronom_files, e))
            sys.exit(1)
        try:
            for item in archive.infolist():
                # Open outside the inner try so a failing open() is not
                # followed by close() on an unbound or stale stream.
                stream = archive.open(item)
                try:
                    # Work is done here!
                    fido_format = self.parse_pronom_xml(stream, puid_filter)
                    if fido_format is not None:
                        formats.append(fido_format)
                finally:
                    stream.close()
        finally:
            archive.close()

        # Replace the formatID with puids in has_priority_over
        id_map = {}
        for element in formats:
            puid = element.find('puid').text
            pronom_id = element.find('pronom_id').text
            id_map[pronom_id] = puid
        for element in formats:
            for rel in element.findall('has_priority_over'):
                # A related format may not have been loaded (e.g. when
                # puid_filter is used); keep its PRONOM id in that case
                # instead of failing with KeyError.
                rel.text = id_map.get(rel.text, rel.text)

        # NOTE(review): _sort_formats() returns a new list that is discarded
        # here, so self.formats keeps the archive order. Left as is because
        # changing it would reorder the generated file -- confirm intent.
        self._sort_formats(formats)
        self.formats = formats

    def parse_pronom_xml(self, source, puid_filter=None):
        """Read a pronom XML from @param source, convert it to fido XML and
        @return ET.ElementTree Element representing it.
        If a @param puid_filter is specified, None is returned for any report
        whose puid differs from it.
        """
        def add(parent, tag, text):
            # Append <tag>text</tag> to parent and return the new element.
            child = ET.SubElement(parent, tag)
            child.text = text
            return child

        pronom_root = ET.parse(source).getroot()
        pronom_format = pronom_root.find(TNA('report_format_detail/FileFormat'))
        fido_format = ET.Element('format')
        identifiers = pronom_format.findall(TNA('FileFormatIdentifier'))
        related_formats = pronom_format.findall(TNA('RelatedFormat'))
        # Pre-set so the incompatible-signature message below cannot hit an
        # unbound name when a report carries no PUID identifier.
        puid = ''

        # Get the base Format information
        for ident in identifiers:
            if get_text_tna(ident, 'IdentifierType') == 'PUID':
                puid = get_text_tna(ident, 'Identifier')
                add(fido_format, 'puid', puid)
                if puid_filter is not None and puid != puid_filter:
                    return None
        # A bit clumsy. I want to have puid first, then mime, then container.
        for ident in identifiers:
            ident_type = get_text_tna(ident, 'IdentifierType')
            if ident_type == 'MIME':
                add(fido_format, 'mime', get_text_tna(ident, 'Identifier'))
            elif ident_type == 'PUID':
                puid = get_text_tna(ident, 'Identifier')
                if puid == 'x-fmt/263':
                    add(fido_format, 'container', 'zip')
                elif puid == 'x-fmt/265':
                    add(fido_format, 'container', 'tar')
        add(fido_format, 'name', get_text_tna(pronom_format, 'FormatName'))
        add(fido_format, 'version', get_text_tna(pronom_format, 'FormatVersion'))
        add(fido_format, 'alias', get_text_tna(pronom_format, 'FormatAliases'))
        add(fido_format, 'pronom_id', get_text_tna(pronom_format, 'FormatID'))
        # Get the extensions from the ExternalSignature
        for ext_sig in pronom_format.findall(TNA('ExternalSignature')):
            add(fido_format, 'extension', get_text_tna(ext_sig, 'Signature'))
        for ident in identifiers:
            if get_text_tna(ident, 'IdentifierType') == 'Apple Uniform Type Identifier':
                add(fido_format, 'apple_uid', get_text_tna(ident, 'Identifier'))
        # Handle the relationships
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Has priority over':
                add(fido_format, 'has_priority_over', get_text_tna(related, 'RelatedFormatID'))

        # Get the InternalSignature information
        for pronom_sig in pronom_format.findall(TNA('InternalSignature')):
            fido_sig = ET.SubElement(fido_format, 'signature')
            add(fido_sig, 'name', get_text_tna(pronom_sig, 'SignatureName'))
            # There are some funny chars in the notes (it is a unicode string),
            # hence the explicit encoding.
            add(fido_sig, 'note', get_text_tna(pronom_sig, 'SignatureNote').encode('UTF-8'))
            for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
                fido_pat = ET.SubElement(fido_sig, 'pattern')
                pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
                byte_sequence = get_text_tna(pronom_pat, 'ByteSequenceValue')
                offset = get_text_tna(pronom_pat, 'Offset')
                max_offset = get_text_tna(pronom_pat, 'MaxOffset')
                regex = convert_to_regex(byte_sequence, 'Little', pos, offset, max_offset)
                if regex == "__INCOMPATIBLE_SIG__":
                    sys.stderr.write("Error: incompatible PRONOM signature found for puid {0} , skipping...\n".format(puid))
                    # The signature cannot be expressed as a regex: drop every
                    # 'signature' node added to this format so far.
                    for stale in fido_format.findall('signature'):
                        fido_format.remove(stale)
                    continue
                add(fido_pat, 'position', pos)
                add(fido_pat, 'pronom_pattern', byte_sequence)
                add(fido_pat, 'regex', regex)

        # Get the format details
        fido_details = ET.SubElement(fido_format, 'details')
        add(fido_details, 'dc:description', get_text_tna(pronom_format, 'FormatDescription').encode('utf8'))
        add(fido_details, 'dcterms:available', get_text_tna(pronom_format, 'ReleaseDate'))
        add(fido_details, 'dc:creator', get_text_tna(pronom_format, 'Developers/DeveloperCompoundName'))
        add(fido_details, 'dcterms:publisher', get_text_tna(pronom_format, 'Developers/OrganisationName'))
        # Two passes keep all supertype entries ahead of all subtype entries.
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Is supertype of':
                add(fido_details, 'is_supertype_of', get_text_tna(related, 'RelatedFormatID'))
        for related in related_formats:
            if get_text_tna(related, 'RelationshipType') == 'Is subtype of':
                add(fido_details, 'is_subtype_of', get_text_tna(related, 'RelatedFormatID'))
        add(fido_details, 'content_type', get_text_tna(pronom_format, 'FormatTypes'))

        # References
        for doc in pronom_format.findall(TNA("Document")):
            ref = ET.SubElement(fido_details, 'reference')
            add(ref, 'dc:title', get_text_tna(doc, 'TitleText'))
            add(ref, 'dc:creator', get_text_tna(doc, 'Author/AuthorCompoundName'))
            add(ref, 'dc:publisher', get_text_tna(doc, 'Publisher/PublisherCompoundName'))
            add(ref, 'dcterms:available', get_text_tna(doc, 'PublicationDate'))
            for ident in doc.findall(TNA('DocumentIdentifier')):
                ident_type = get_text_tna(ident, 'IdentifierType')
                if ident_type == 'URL':
                    add(ref, 'dc:identifier', "http://" + get_text_tna(ident, 'Identifier'))
                else:
                    add(ref, 'dc:identifier', ident_type + ":" + get_text_tna(ident, 'Identifier'))
            add(ref, 'dc:description', get_text_tna(doc, 'DocumentNote'))
            add(ref, 'dc:type', get_text_tna(doc, 'DocumentType'))
            add(ref, 'dcterms:license', get_text_tna(doc, 'AvailabilityDescription') + " " + get_text_tna(doc, 'AvailabilityNote'))
            add(ref, 'dc:rights', get_text_tna(doc, 'DocumentIPR'))

        # Examples
        for ref_file in pronom_format.findall(TNA("ReferenceFile")):
            rf = ET.SubElement(fido_details, 'example_file')
            add(rf, 'dc:title', get_text_tna(ref_file, 'ReferenceFileName'))
            add(rf, 'dc:description', get_text_tna(ref_file, 'ReferenceFileDescription'))
            checksum = ""
            for ident in ref_file.findall(TNA('ReferenceFileIdentifier')):
                ident_type = get_text_tna(ident, 'IdentifierType')
                if ident_type == 'URL':
                    url = "http://" + get_text_tna(ident, 'Identifier')
                    add(rf, 'dc:identifier', url)
                    # And calculate the checksum of this resource.
                    # NOTE: this downloads the example file over the network.
                    digest = hashlib.md5()
                    sock = urllib.urlopen(url)
                    try:
                        digest.update(sock.read())
                    finally:
                        sock.close()
                    checksum = digest.hexdigest()
                else:
                    add(rf, 'dc:identifier', ident_type + ":" + get_text_tna(ident, 'Identifier'))
            add(rf, 'dcterms:license', "")
            add(rf, 'dc:rights', get_text_tna(ref_file, 'ReferenceFileIPR'))
            checksum_element = add(rf, 'checksum', checksum)
            checksum_element.attrib['type'] = "md5"

        # Record Metadata
        md = ET.SubElement(fido_details, 'record_metadata')
        add(md, 'status', 'unknown')
        add(md, 'dc:creator', get_text_tna(pronom_format, 'ProvenanceName'))
        add(md, 'dcterms:created', get_text_tna(pronom_format, 'ProvenanceSourceDate'))
        add(md, 'dcterms:modified', get_text_tna(pronom_format, 'LastUpdatedDate'))
        add(md, 'dc:description', get_text_tna(pronom_format, 'ProvenanceDescription').encode('utf8'))
        return fido_format

    #FIXME: I don't think that this quite works yet!
    def _sort_formats(self, formatlist):
        """Return a copy of *formatlist* sorted on priority relationships so
        higher priority formats appear earlier; ties are broken by puid.
        The input list itself is not modified.
        """
        def compare_formats(f1, f2):
            f1ID = f1.find('puid').text
            f2ID = f2.find('puid').text
            # A format that has priority over the other one sorts first.
            for worse in f1.findall('has_priority_over'):
                if worse.text == f2ID:
                    return -1
            for worse in f2.findall('has_priority_over'):
                if worse.text == f1ID:
                    return 1
            if f1ID < f2ID:
                return -1
            elif f1ID == f2ID:
                return 0
            else:
                return 1
        return sorted(formatlist, cmp=compare_formats)
|
|
293
|
+
|
|
294
|
+
def fido_position(pronom_position):
    """@return BOF/EOF/VAR/IFB instead of the more verbose pronom position names.

    Any value that is not a known PositionType (including None) is reported
    on stderr and mapped to 'VAR', so that FIDO does not crash on it.
    """
    short_names = {
        'Absolute from BOF': 'BOF',
        'Absolute from EOF': 'EOF',
        'Variable': 'VAR',
        'Indirect From BOF': 'IFB',
    }
    for verbose, short in short_names.items():
        if pronom_position == verbose:
            return short
    # to make sure FIDO does not crash (IFB aftermath); format() also copes
    # with None, and the newline keeps consecutive warnings apart.
    sys.stderr.write("Unknown pronom PositionType: {0}\n".format(pronom_position))
    return 'VAR'
|
|
308
|
+
|
|
309
|
+
def _convert_err_msg(msg, c, i, chars):
|
|
310
|
+
return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
|
|
311
|
+
|
|
312
|
+
def doByte(chars, i, littleendian):
    """Convert the two hex digits chars[i] and chars[i+1] into a byte.

    With littleendian true the first digit is the high nibble, otherwise the
    second one is.
    @return a tuple (regex-escaped byte, 2)
    Raises Exception('bad byte sequence ...') when either character is not a
    hex digit or the sequence ends before the second digit.
    """
    hex_digits = '0123456789ABCDEF'
    pair = chars[i:i + 2]
    # A truncated pair would otherwise surface as a bare IndexError without
    # any hint about which signature text was at fault.
    if len(pair) < 2:
        raise Exception(_convert_err_msg('bad byte sequence', pair, i, chars))
    c1 = hex_digits.find(pair[0].upper())
    c2 = hex_digits.find(pair[1].upper())
    if c1 < 0 or c2 < 0:
        raise Exception(_convert_err_msg('bad byte sequence', pair, i, chars))
    if littleendian:
        val = chr(16 * c1 + c2)
    else:
        val = chr(c1 + 16 * c2)
    return (escape(val), 2)
|
|
325
|
+
|
|
326
|
+
# \a\b\n\r\t\v
|
|
327
|
+
# MdR: took out '<' and '>' out of _ordinary because they were converted to entities <>
|
|
328
|
+
# MdR: moved '!' from _ordinary to _special because it means "NOT" in the regex world. At this time no regex in any sig has a negate set, did this to be on the safe side
|
|
329
|
+
_ordinary = frozenset(' "#%&\',-/0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')
|
|
330
|
+
_special = '$()*+.?![]^\\{|}'
|
|
331
|
+
_hex = '0123456789abcdef'
|
|
332
|
+
def _escape_char(c):
|
|
333
|
+
if c in '\n':
|
|
334
|
+
return '\\n'
|
|
335
|
+
elif c == '\r':
|
|
336
|
+
return '\\r'
|
|
337
|
+
elif c in _special:
|
|
338
|
+
return '\\' + c
|
|
339
|
+
else:
|
|
340
|
+
(high, low) = divmod(ord(c), 16)
|
|
341
|
+
return '\\x' + _hex[high] + _hex[low]
|
|
342
|
+
|
|
343
|
+
def escape(string):
|
|
344
|
+
"Escape characters in pattern that are non-printable, non-ascii, or special for regexes."
|
|
345
|
+
return ''.join(c if c in _ordinary else _escape_char(c) for c in string)
|
|
346
|
+
|
|
347
|
+
def calculate_repetition(char, pos, offset, maxoffset):
    """Return the regex repetition that skips offset..maxoffset occurrences
    of char.

    The quantifier is split recursively into chunks of at most 65535 when one
    or both offsets is greater than 65535 bytes (64KB),
    see: bugs.python.org/issue13169

    @param char      regex atom to repeat (e.g. '.').
    @param pos       'BOF', 'EOF' or 'IFB'; any other position yields ''.
    @param offset    minimum count as a decimal string; None or '' means '0'.
    @param maxoffset maximum count as a decimal string; None or '' means
                     "not given".
    @return '' when nothing has to be skipped; otherwise char{offset} or
            char{offset,maxoffset} for BOF/EOF and char{offset,} or
            char{offset,maxoffset} for IFB, followed by the chunks for
            whatever exceeded 65535.
    """
    limit = 65535

    # Normalise the inputs the same way convert_to_regex() does, so that
    # None, '' and integer values (as passed on recursion) behave alike.
    offset = '0' if offset is None or offset == '' else str(offset)
    maxoffset = None if maxoffset is None or maxoffset == '' else str(maxoffset)

    # The remainders handed to the recursive call must stay strings (or None
    # for "no maximum"); integer placeholders used to break the string
    # concatenation below when only one of the two values overflowed.
    offset_rest = '0'
    maxoffset_rest = None if maxoffset is None else '0'
    overflow = False
    if int(offset) > limit:
        offset_rest = str(int(offset) - limit)
        offset = str(limit)
        overflow = True
    if maxoffset is not None and int(maxoffset) > limit:
        maxoffset_rest = str(int(maxoffset) - limit)
        maxoffset = str(limit)
        overflow = True

    parts = []
    if pos == "BOF" or pos == "EOF":
        if offset != '0':
            if maxoffset is None:
                parts.append(char + '{' + offset + '}')
            else:
                parts.append(char + '{' + offset + ',' + maxoffset + '}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')
    elif pos == "IFB":
        if offset != '0':
            if maxoffset is None:
                # Indirect offsets without a maximum are open-ended.
                parts.append(char + '{' + offset + ',}')
            else:
                parts.append(char + '{' + offset + ',' + maxoffset + '}')
        elif maxoffset is not None:
            parts.append(char + '{0,' + maxoffset + '}')

    if overflow:  # recursion happens here
        parts.append(calculate_repetition(char, pos, offset_rest, maxoffset_rest))

    return ''.join(parts)
|
|
397
|
+
|
|
398
|
+
def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
|
|
399
|
+
"""Convert
|
|
400
|
+
@param chars, a pronom bytesequence, into a
|
|
401
|
+
@return regular expression.
|
|
402
|
+
Endianness is not used.
|
|
403
|
+
"""
|
|
404
|
+
|
|
405
|
+
if 'Big' in endianness:
|
|
406
|
+
littleendian = False
|
|
407
|
+
else:
|
|
408
|
+
littleendian = True
|
|
409
|
+
if len(offset) == 0:
|
|
410
|
+
offset = '0'
|
|
411
|
+
if len(maxoffset) == 0:
|
|
412
|
+
maxoffset = None
|
|
413
|
+
# make buf global so we can print it @'_convert_err_msg' while debugging (MdR)
|
|
414
|
+
global buf
|
|
415
|
+
buf = cStringIO.StringIO()
|
|
416
|
+
buf.write("(?s)") #If a regex starts with (?s), it is equivalent to DOTALL.
|
|
417
|
+
i = 0
|
|
418
|
+
state = 'start'
|
|
419
|
+
if 'BOF' in pos:
|
|
420
|
+
buf.write('\\A') # start of regex
|
|
421
|
+
buf.write(calculate_repetition('.', pos, offset, maxoffset))
|
|
422
|
+
|
|
423
|
+
if 'IFB' in pos:
|
|
424
|
+
buf.write('\\A')
|
|
425
|
+
buf.write(calculate_repetition('.', pos, offset, maxoffset))
|
|
426
|
+
|
|
427
|
+
while True:
|
|
428
|
+
if i == len(chars):
|
|
429
|
+
break
|
|
430
|
+
#print _convert_err_msg(state,chars[i],i,chars)
|
|
431
|
+
if state == 'start':
|
|
432
|
+
if chars[i].isalnum():
|
|
433
|
+
state = 'bytes'
|
|
434
|
+
elif chars[i] == '[' and chars[i + 1] == '!':
|
|
435
|
+
state = 'non-match'
|
|
436
|
+
elif chars[i] == '[':
|
|
437
|
+
state = 'bracket'
|
|
438
|
+
elif chars[i] == '{':
|
|
439
|
+
state = 'curly'
|
|
440
|
+
elif chars[i] == '(':
|
|
441
|
+
state = 'paren'
|
|
442
|
+
elif chars[i] in '*+?':
|
|
443
|
+
state = 'specials'
|
|
444
|
+
else:
|
|
445
|
+
raise Exception(_convert_err_msg('Illegal character in start', chars[i], i, chars))
|
|
446
|
+
elif state == 'bytes':
|
|
447
|
+
(byt, inc) = doByte(chars, i, littleendian)
|
|
448
|
+
buf.write(byt)
|
|
449
|
+
i += inc
|
|
450
|
+
state = 'start'
|
|
451
|
+
elif state == 'non-match':
|
|
452
|
+
buf.write('(!')
|
|
453
|
+
i += 2
|
|
454
|
+
while True:
|
|
455
|
+
if chars[i].isalnum():
|
|
456
|
+
(byt, inc) = doByte(chars, i, littleendian)
|
|
457
|
+
buf.write(byt)
|
|
458
|
+
i += inc
|
|
459
|
+
elif chars[i] == ']':
|
|
460
|
+
break
|
|
461
|
+
else:
|
|
462
|
+
raise Exception(_convert_err_msg('Illegal character in non-match', chars[i], i, chars))
|
|
463
|
+
buf.write(')')
|
|
464
|
+
i += 1
|
|
465
|
+
state = 'start'
|
|
466
|
+
|
|
467
|
+
elif state == 'bracket':
|
|
468
|
+
try:
|
|
469
|
+
buf.write('[')
|
|
470
|
+
i += 1
|
|
471
|
+
(byt, inc) = doByte(chars, i, littleendian)
|
|
472
|
+
buf.write(byt)
|
|
473
|
+
i += inc
|
|
474
|
+
#assert(chars[i] == ':')
|
|
475
|
+
if chars[i] != ':':
|
|
476
|
+
return "__INCOMPATIBLE_SIG__"
|
|
477
|
+
buf.write('-')
|
|
478
|
+
i += 1
|
|
479
|
+
(byt, inc) = doByte(chars, i, littleendian)
|
|
480
|
+
buf.write(byt)
|
|
481
|
+
i += inc
|
|
482
|
+
#assert(chars[i] == ']')
|
|
483
|
+
if chars[i] != ']':
|
|
484
|
+
return "__INCOMPATIBLE_SIG__"
|
|
485
|
+
buf.write(']')
|
|
486
|
+
i += 1
|
|
487
|
+
except Exception:
|
|
488
|
+
print _convert_err_msg('Illegal character in bracket', chars[i], i, chars)
|
|
489
|
+
raise
|
|
490
|
+
if i < len(chars) and chars[i] == '{':
|
|
491
|
+
state = 'curly-after-bracket'
|
|
492
|
+
else:
|
|
493
|
+
state = 'start'
|
|
494
|
+
elif state == 'paren':
|
|
495
|
+
buf.write('(?:')
|
|
496
|
+
i += 1
|
|
497
|
+
while True:
|
|
498
|
+
if chars[i].isalnum():
|
|
499
|
+
(byt, inc) = doByte(chars, i, littleendian)
|
|
500
|
+
buf.write(byt)
|
|
501
|
+
i += inc
|
|
502
|
+
elif chars[i] == '|':
|
|
503
|
+
buf.write('|')
|
|
504
|
+
i += 1
|
|
505
|
+
elif chars[i] == ')':
|
|
506
|
+
break
|
|
507
|
+
# START fix FIDO-20
|
|
508
|
+
elif chars[i] == '[':
|
|
509
|
+
buf.write('[')
|
|
510
|
+
i += 1
|
|
511
|
+
(byt, inc) = doByte(chars, i, littleendian)
|
|
512
|
+
buf.write(byt)
|
|
513
|
+
i += inc
|
|
514
|
+
#assert(chars[i] == ':')
|
|
515
|
+
if chars[i] != ':':
|
|
516
|
+
return "__INCOMPATIBLE_SIG__"
|
|
517
|
+
buf.write('-')
|
|
518
|
+
i += 1
|
|
519
|
+
(byt, inc) = doByte(chars, i, littleendian)
|
|
520
|
+
buf.write(byt)
|
|
521
|
+
i += inc
|
|
522
|
+
|
|
523
|
+
#assert(chars[i] == ']')
|
|
524
|
+
if chars[i] != ']':
|
|
525
|
+
return "__INCOMPATIBLE_SIG__"
|
|
526
|
+
buf.write(']')
|
|
527
|
+
i += 1
|
|
528
|
+
else:
|
|
529
|
+
raise Exception(_convert_err_msg(('Current state = \'{0}\' : Illegal character in paren').format(state), chars[i], i, chars))
|
|
530
|
+
buf.write(')')
|
|
531
|
+
i += 1
|
|
532
|
+
state = 'start'
|
|
533
|
+
# END fix FIDO-20
|
|
534
|
+
elif state in ['curly', 'curly-after-bracket']:
|
|
535
|
+
# {nnnn} or {nnn-nnn} or {nnn-*}
|
|
536
|
+
# {nnn} or {nnn,nnn} or {nnn,}
|
|
537
|
+
# when there is a curly-after-bracket, then the {m,n} applies to the bracketed item
|
|
538
|
+
# The above, while sensible, appears to be incorrect. A '.' is always needed.
|
|
539
|
+
# for droid equiv behavior
|
|
540
|
+
#if state == 'curly':
|
|
541
|
+
buf.write('.')
|
|
542
|
+
buf.write('{')
|
|
543
|
+
i += 1 # skip the (
|
|
544
|
+
while True:
|
|
545
|
+
if chars[i].isalnum():
|
|
546
|
+
buf.write(chars[i])
|
|
547
|
+
i += 1
|
|
548
|
+
elif chars[i] == '-':
|
|
549
|
+
buf.write(',')
|
|
550
|
+
i += 1
|
|
551
|
+
elif chars[i] == '*': # skip the *
|
|
552
|
+
i += 1
|
|
553
|
+
elif chars[i] == '}':
|
|
554
|
+
break
|
|
555
|
+
else:
|
|
556
|
+
raise Exception(_convert_err_msg('Illegal character in curly', chars[i], i, chars))
|
|
557
|
+
buf.write('}')
|
|
558
|
+
i += 1 # skip the )
|
|
559
|
+
state = 'start'
|
|
560
|
+
elif state == 'specials':
|
|
561
|
+
if chars[i] == '*':
|
|
562
|
+
buf.write('.*')
|
|
563
|
+
i += 1
|
|
564
|
+
elif chars[i] == '+':
|
|
565
|
+
buf.write('.+')
|
|
566
|
+
i += 1
|
|
567
|
+
elif chars[i] == '?':
|
|
568
|
+
if chars[i + 1] != '?':
|
|
569
|
+
raise Exception(_convert_err_msg('Illegal character after ?', chars[i + 1], i + 1, chars))
|
|
570
|
+
buf.write('.?')
|
|
571
|
+
i += 2
|
|
572
|
+
state = 'start'
|
|
573
|
+
else:
|
|
574
|
+
raise Exception('Illegal state {0}'.format(state))
|
|
575
|
+
|
|
576
|
+
if 'EOF' in pos:
|
|
577
|
+
buf.write(calculate_repetition('.', pos, offset, maxoffset))
|
|
578
|
+
buf.write('\\Z')
|
|
579
|
+
|
|
580
|
+
val = buf.getvalue()
|
|
581
|
+
buf.close()
|
|
582
|
+
return val
|
|
583
|
+
|
|
584
|
+
def main(arg=None):
    """Produce the FIDO format XML from a zip of PRONOM XML files.

    Default input and output locations are derived from ``conf/versions.xml``
    in the directory of this module; both can be overridden on the command
    line with ``-input`` and ``-output``. ``-puid`` is passed through to
    ``FormatInfo.load_pronom_xml``.

    arg -- optional list of command-line arguments. When it is None,
           ``sys.argv[1:]`` is used instead.

    Exits the process with status 1 when ``versions.xml`` cannot be parsed.
    """
    import sys
    from argparselocal import ArgumentParser

    # Identity test on purpose: an explicitly supplied empty list is a valid
    # argument list and must not be replaced by sys.argv.
    arglist = arg if arg is not None else sys.argv[1:]

    mydir = os.path.abspath(os.path.dirname(__file__))

    # Parse the versions file to find out which PRONOM release to use.
    versionsFile = os.path.join(mydir, 'conf', 'versions.xml')
    try:
        versions = VET.parse(versionsFile)
    except Exception as e:
        sys.stderr.write("An error occured loading versions.xml:\n{0}".format(e))
        # A bare sys.exit() would report success (status 0) to the caller.
        sys.exit(1)

    xml_pronomSignature = os.path.join(mydir, 'conf', versions.find('pronomSignature').text)
    xml_pronomZipFile = os.path.join(
        mydir, 'conf', "pronom-xml-v{0}.zip".format(versions.find('pronomVersion').text))

    parser = ArgumentParser(description='Produce the fido format xml that is loaded at run-time')
    parser.add_argument('-input', default=xml_pronomZipFile,
                        help='input file, a zip containing Pronom xml files')
    parser.add_argument('-output', default=xml_pronomSignature, help='output file')
    parser.add_argument('-puid', default=None, help='a particular PUID record to extract')

    # PROCESS ARGUMENTS
    args = parser.parse_args(arglist)

    info = FormatInfo(args.input)
    info.load_pronom_xml(args.puid)
    info.save(args.output)

    # The summary goes to stderr, as before; the newline is written explicitly.
    sys.stderr.write('Converted {0} PRONOM formats to FIDO signatures\n'.format(len(info.formats)))
|
|
614
|
+
|
|
615
|
+
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|