fstdtools 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fstdtools/__init__.py +0 -0
- fstdtools/__main__.py +2 -0
- fstdtools/cli.py +130 -0
- fstdtools/convert.py +77 -0
- fstdtools/mdict/__init__.py +0 -0
- fstdtools/mdict/lzo.py +246 -0
- fstdtools/mdict/pureSalsa20.py +365 -0
- fstdtools/mdict/readmdict.py +802 -0
- fstdtools/mdict/ripemd128.py +130 -0
- fstdtools/mdict/writemdict.py +673 -0
- fstdtools-0.0.1.dist-info/METADATA +15 -0
- fstdtools-0.0.1.dist-info/RECORD +16 -0
- fstdtools-0.0.1.dist-info/WHEEL +5 -0
- fstdtools-0.0.1.dist-info/entry_points.txt +2 -0
- fstdtools-0.0.1.dist-info/licenses/LICENSE +21 -0
- fstdtools-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,802 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# readmdict.py
|
|
4
|
+
# Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
|
|
5
|
+
#
|
|
6
|
+
# Copyright (C) 2012, 2013, 2015, 2022, 2023 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
|
|
7
|
+
#
|
|
8
|
+
# This program is a free software; you can redistribute it and/or modify
|
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
|
10
|
+
# the Free Software Foundation, version 3 of the License.
|
|
11
|
+
#
|
|
12
|
+
# You can get a copy of GNU General Public License along this program
|
|
13
|
+
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
|
|
14
|
+
#
|
|
15
|
+
# This program is distributed in the hope that it will be useful,
|
|
16
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
18
|
+
# GNU General Public License for more details.
|
|
19
|
+
|
|
20
|
+
from struct import pack, unpack
|
|
21
|
+
from io import BytesIO
|
|
22
|
+
import re
|
|
23
|
+
import sys
|
|
24
|
+
|
|
25
|
+
from .ripemd128 import ripemd128
|
|
26
|
+
from .pureSalsa20 import Salsa20
|
|
27
|
+
|
|
28
|
+
# zlib compression is used for engine version >=2.0
|
|
29
|
+
import zlib
|
|
30
|
+
# LZO compression is used for engine version < 2.0
|
|
31
|
+
try:
|
|
32
|
+
import lzo
|
|
33
|
+
except ImportError:
|
|
34
|
+
lzo = None
|
|
35
|
+
|
|
36
|
+
# xxhash is used for engine version >= 3.0
|
|
37
|
+
try:
|
|
38
|
+
import xxhash
|
|
39
|
+
except ImportError:
|
|
40
|
+
xxhash = None
|
|
41
|
+
|
|
42
|
+
# 2x3 compatible
|
|
43
|
+
if sys.hexversion >= 0x03000000:
|
|
44
|
+
unicode = str
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _unescape_entities(text):
|
|
48
|
+
"""
|
|
49
|
+
unescape offending tags < > " &
|
|
50
|
+
"""
|
|
51
|
+
text = text.replace(b'<', b'<')
|
|
52
|
+
text = text.replace(b'>', b'>')
|
|
53
|
+
text = text.replace(b'"', b'"')
|
|
54
|
+
text = text.replace(b'&', b'&')
|
|
55
|
+
return text
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _fast_decrypt(data, key):
|
|
59
|
+
"""
|
|
60
|
+
XOR decryption
|
|
61
|
+
"""
|
|
62
|
+
b = bytearray(data)
|
|
63
|
+
key = bytearray(key)
|
|
64
|
+
previous = 0x36
|
|
65
|
+
for i in range(len(b)):
|
|
66
|
+
t = (b[i] >> 4 | b[i] << 4) & 0xff
|
|
67
|
+
t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]
|
|
68
|
+
previous = b[i]
|
|
69
|
+
b[i] = t
|
|
70
|
+
return bytes(b)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _salsa_decrypt(ciphertext, encrypt_key):
    """
    salsa20 (8 rounds) decryption

    Runs *ciphertext* through ``Salsa20.encryptBytes`` with an all-zero
    8-byte IV and returns the result.

    :param ciphertext: bytes to decrypt
    :param encrypt_key: key handed to the Salsa20 constructor
    """
    zero_iv = b"\x00" * 8
    cipher = Salsa20(key=encrypt_key, IV=zero_iv, rounds=8)
    plaintext = cipher.encryptBytes(ciphertext)
    return plaintext
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _decrypt_regcode_by_userid(reg_code, userid):
    """
    Recover the encryption key from a registration code.

    The ripemd128 digest of *userid* keys a Salsa20 (8 rounds, zero IV)
    cipher, and *reg_code* is run through it.

    :param reg_code: registration code bytes
    :param userid: user id (email or device id) as bytes
    :return: the resulting key bytes
    """
    cipher = Salsa20(key=ripemd128(userid), IV=b"\x00"*8, rounds=8)
    return cipher.encryptBytes(reg_code)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class MDict(object):
    """
    Base class which reads in header and key block.

    It is the code sharing base of the MDX and MDD readers.  Construction
    parses the header and the complete key list; records are decoded lazily
    when iterating over ``items()``.

    Public interface: ``len(obj)``, ``iter(obj)`` / ``keys()`` and ``items()``.
    """
    def __init__(self, fname, encoding='', passcode=None):
        """
        :param fname: path of the dictionary file
        :param encoding: key/record text encoding; empty means "take it from
                         the header"
        :param passcode: optional ``(regcode, userid)`` tuple used to derive
                         the encryption key
        :raises RuntimeError: for a 3.0 file carrying a UUID when the xxhash
                              module is not installed
        """
        self._fname = fname
        self._encoding = encoding.upper()
        self._encrypted_key = None

        self.header = self._read_header()

        # decrypt regcode to get the encrypted key
        if passcode is not None:
            regcode, userid = passcode
            if isinstance(userid, unicode):
                userid = userid.encode('utf8')
            self._encrypted_key = _decrypt_regcode_by_userid(regcode, userid)
        # MDict 3.0 encryption key derives from UUID if present
        elif self._version >= 3.0:
            uuid = self.header.get(b'UUID')
            if uuid:
                if xxhash is None:
                    raise RuntimeError('xxhash module is needed to read MDict 3.0 format')
                mid = (len(uuid) + 1) // 2
                self._encrypted_key = xxhash.xxh64_digest(uuid[:mid]) + xxhash.xxh64_digest(uuid[mid:])

        self._key_list = self._read_keys()

    def __len__(self):
        return self._num_entries

    def __iter__(self):
        return self.keys()

    def keys(self):
        """
        Return an iterator over dictionary keys.
        """
        return (key_value for key_id, key_value in self._key_list)

    def _read_number(self, f):
        """Read one big-endian number of the width used by this file version."""
        return unpack(self._number_format, f.read(self._number_width))[0]

    def _read_int32(self, f):
        """Read one big-endian unsigned 32-bit integer."""
        return unpack('>I', f.read(4))[0]

    def _parse_header(self, header):
        """
        extract attributes from <Dict attr="value" ... >

        :param header: header text as bytes
        :return: dict mapping attribute name (bytes) to unescaped value (bytes)
        """
        taglist = re.findall(rb'(\w+)="(.*?)"', header, re.DOTALL)
        return {key: _unescape_entities(value) for key, value in taglist}

    def _decode_block(self, block, decompressed_size):
        """
        Decrypt and decompress one data block.

        Block layout: 4 bytes info (little endian: compression method,
        encryption method, encrypted size), 4 bytes adler32 (big endian),
        then the payload.

        :param block: raw block bytes including the 8-byte prefix
        :param decompressed_size: expected size, only needed by LZO
        :return: the decoded payload bytes
        :raises Exception: on unsupported encryption or compression method
        :raises RuntimeError: when LZO is needed but not available
        :raises AssertionError: on checksum mismatch
        """
        # block info: compression, encryption
        info = unpack('<L', block[:4])[0]
        compression_method = info & 0xf
        encryption_method = (info >> 4) & 0xf
        encryption_size = (info >> 8) & 0xff

        adler32 = unpack('>I', block[4:8])[0]

        # block data
        data = block[8:]

        # decrypt
        if encryption_method == 0:
            decrypted_block = data
        elif encryption_method in (1, 2):
            # adler checksum of the block data used as the encryption key if none given;
            # derived only here so unencrypted blocks do not pay for the digest
            encrypted_key = self._encrypted_key
            if encrypted_key is None:
                encrypted_key = ripemd128(block[4:8])
            decrypt = _fast_decrypt if encryption_method == 1 else _salsa_decrypt
            # only the first encryption_size bytes are encrypted
            decrypted_block = decrypt(data[:encryption_size], encrypted_key) + data[encryption_size:]
        else:
            raise Exception('encryption method %d not supported' % encryption_method)

        # check adler checksum over decrypted data
        if self._version >= 3:
            assert adler32 == (zlib.adler32(decrypted_block) & 0xffffffff), \
                'block checksum mismatch'

        # decompress
        if compression_method == 0:
            decompressed_block = decrypted_block
        elif compression_method == 1:
            if lzo is None:
                raise RuntimeError("LZO compression is not supported")
            header = b'\xf0' + pack('>I', decompressed_size)
            decompressed_block = lzo.decompress(header + decrypted_block)
        elif compression_method == 2:
            decompressed_block = zlib.decompress(decrypted_block)
        else:
            raise Exception('compression method %d not supported' % compression_method)

        # check adler checksum over decompressed data
        if self._version < 3:
            assert adler32 == (zlib.adler32(decompressed_block) & 0xffffffff), \
                'block checksum mismatch'

        return decompressed_block

    def _decode_key_block_info(self, key_block_info_compressed):
        """
        Decode the key block info section.

        :param key_block_info_compressed: raw section bytes (zlib compressed
               and possibly encrypted for version >= 2, plain otherwise)
        :return: list of ``(compressed_size, decompressed_size)``, one tuple
                 per key block
        """
        if self._version >= 2:
            # zlib compression
            assert key_block_info_compressed[:4] == b'\x02\x00\x00\x00'
            # decrypt if needed
            if self._encrypt & 0x02:
                key = ripemd128(key_block_info_compressed[4:8] + pack('<L', 0x3695))
                key_block_info_compressed = key_block_info_compressed[:8] + \
                    _fast_decrypt(key_block_info_compressed[8:], key)
            # decompress
            key_block_info = zlib.decompress(key_block_info_compressed[8:])
            # adler checksum
            adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
            assert adler32 == (zlib.adler32(key_block_info) & 0xffffffff)
            byte_format, byte_width, text_term = '>H', 2, 1
        else:
            # no compression
            key_block_info = key_block_info_compressed
            byte_format, byte_width, text_term = '>B', 1, 0

        # text sizes are counted in characters; UTF-16 uses two bytes each
        char_width = 2 if self._encoding == 'UTF-16' else 1
        number_format = self._number_format
        number_width = self._number_width

        # decode
        key_block_info_list = []
        i = 0
        while i < len(key_block_info):
            # number of entries in current key block (not needed, skipped)
            i += number_width
            # text head size, then the text head itself
            text_head_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
            i += byte_width
            i += (text_head_size + text_term) * char_width
            # text tail size, then the text tail itself
            text_tail_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
            i += byte_width
            i += (text_tail_size + text_term) * char_width
            # key block compressed size
            key_block_compressed_size = unpack(number_format, key_block_info[i:i+number_width])[0]
            i += number_width
            # key block decompressed size
            key_block_decompressed_size = unpack(number_format, key_block_info[i:i+number_width])[0]
            i += number_width
            key_block_info_list.append((key_block_compressed_size, key_block_decompressed_size))

        return key_block_info_list

    def _decode_key_block(self, key_block_compressed, key_block_info_list):
        """
        Decode all key blocks.

        :param key_block_compressed: the concatenated raw key blocks
        :param key_block_info_list: sizes as returned by _decode_key_block_info
        :return: list of ``(record_offset, key_text)``
        """
        key_list = []
        i = 0
        for compressed_size, decompressed_size in key_block_info_list:
            key_block = self._decode_block(key_block_compressed[i:i+compressed_size], decompressed_size)
            # extract one single key block into a key list
            key_list += self._split_key_block(key_block)
            i += compressed_size
        return key_list

    def _split_key_block(self, key_block):
        """
        Split one decoded key block into ``(record_offset, key_text)`` tuples.

        Each entry is a number (the record's offset in the record block)
        followed by the key text ending with a NUL character.  A final key
        without terminator extends to the end of the block.

        :return: list of ``(key_id, key_text)``; key_text is UTF-8 bytes
        """
        # key text ends with '\x00'
        if self._encoding == 'UTF-16':
            delimiter, width = b'\x00\x00', 2
        else:
            delimiter, width = b'\x00', 1
        number_width = self._number_width
        block_size = len(key_block)

        key_list = []
        key_start_index = 0
        while key_start_index < block_size:
            # the corresponding record's offset in record block
            key_id = unpack(self._number_format,
                            key_block[key_start_index:key_start_index+number_width])[0]
            text_start = key_start_index + number_width
            # first delimiter aligned to the character width
            key_end_index = key_block.find(delimiter, text_start)
            while key_end_index != -1 and (key_end_index - text_start) % width:
                key_end_index = key_block.find(delimiter, key_end_index + 1)
            if key_end_index == -1:
                # unterminated key: take the rest of the block instead of
                # reusing the end index of the previous key
                key_end_index = block_size
            key_text = key_block[text_start:key_end_index]\
                .decode(self._encoding, errors='ignore').encode('utf-8').strip()
            key_start_index = key_end_index + width
            key_list.append((key_id, key_text))
        return key_list

    def _read_header(self):
        """
        Read and parse the file header.

        Sets ``_key_block_offset``, ``_encoding`` (if not given by the
        caller), ``_encrypt``, ``_stylesheet``, ``_version``,
        ``_number_width`` and ``_number_format``.

        :return: dict of header attributes (bytes keys and values)
        """
        with open(self._fname, 'rb') as f:
            # number of bytes of header text
            header_bytes_size = unpack('>I', f.read(4))[0]
            header_bytes = f.read(header_bytes_size)
            # 4 bytes: adler32 checksum of header, in little endian
            adler32 = unpack('<I', f.read(4))[0]
            assert adler32 == (zlib.adler32(header_bytes) & 0xffffffff)
            # mark down key block offset
            self._key_block_offset = f.tell()

        # header text in utf-16 encoding ending with '\x00\x00'
        if header_bytes[-2:] == b'\x00\x00':
            header_text = header_bytes[:-2].decode('utf-16').encode('utf-8')
        else:
            header_text = header_bytes[:-1]
        header_tag = self._parse_header(header_text)

        if not self._encoding:
            encoding = header_tag.get(b'Encoding', b'utf-8')
            if isinstance(encoding, bytes):
                encoding = encoding.decode('utf-8')
            # normalise case: the code compares against upper-case names
            encoding = encoding.upper()
            # GB18030 > GBK > GB2312
            if encoding in ('GBK', 'GB2312'):
                encoding = 'GB18030'
            self._encoding = encoding

        # encryption flag
        #   0x00 - no encryption, "Allow export to text" is checked in MdxBuilder 3.
        #   0x01 - encrypt record block, "Encryption Key" is given in MdxBuilder 3.
        #   0x02 - encrypt key info block, "Allow export to text" is unchecked in MdxBuilder 3.
        encrypted = header_tag.get(b'Encrypted', b'No')
        if encrypted == b'No':
            self._encrypt = 0
        elif encrypted == b'Yes':
            self._encrypt = 1
        else:
            self._encrypt = int(encrypted)

        # stylesheet attribute if present takes form of:
        #   style_number # 1-255
        #   style_begin  # or ''
        #   style_end    # or ''
        # store stylesheet in dict in the form of
        # {'number' : ('style_begin', 'style_end')}
        self._stylesheet = {}
        if header_tag.get(b'StyleSheet'):
            lines = header_tag[b'StyleSheet'].splitlines()
            # a trailing empty style_begin/style_end is lost by splitlines();
            # pad so the last entry is a complete triple
            lines += [b''] * (-len(lines) % 3)
            for i in range(0, len(lines), 3):
                self._stylesheet[lines[i]] = (lines[i+1], lines[i+2])

        # before version 2.0, number is 4 bytes integer
        # version 2.0 and above uses 8 bytes
        self._version = float(header_tag[b'GeneratedByEngineVersion'])
        if self._version < 2.0:
            self._number_width = 4
            self._number_format = '>I'
        else:
            self._number_width = 8
            self._number_format = '>Q'
            # version 3.0 uses UTF-8 only
            if self._version >= 3:
                self._encoding = 'UTF-8'

        return header_tag

    def _read_keys(self):
        """Read the key list with the reader matching version and encryption."""
        if self._version >= 3:
            return self._read_keys_v3()
        # if no regcode is given, try brutal force (only for engine <= 2)
        if (self._encrypt & 0x01) and self._encrypted_key is None:
            print("Try Brutal Force on Encrypted Key Blocks")
            return self._read_keys_brutal()
        return self._read_keys_v1v2()

    def _read_keys_v3(self):
        """
        Read the key list of a version 3 file.

        Records the offsets of the record data, record index, key data and
        key index blocks and sets ``_num_entries``.

        :raises RuntimeError: on an unknown block type
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)

            # find all blocks offset
            while True:
                block_type = self._read_int32(f)
                block_size = self._read_number(f)
                block_offset = f.tell()
                if block_type == 0x01000000:
                    # record data
                    self._record_block_offset = block_offset
                elif block_type == 0x02000000:
                    # record index
                    self._record_index_offset = block_offset
                elif block_type == 0x03000000:
                    # key data
                    self._key_data_offset = block_offset
                elif block_type == 0x04000000:
                    # key index
                    self._key_index_offset = block_offset
                else:
                    raise RuntimeError("Unknown block type %d" % block_type)
                f.seek(block_size, 1)
                # test the end of file
                if f.read(4):
                    f.seek(-4, 1)
                else:
                    break

            # read key data
            f.seek(self._key_data_offset)
            number = self._read_int32(f)
            self._read_number(f)    # total size, not needed
            key_list = []
            for _ in range(number):
                decompressed_size = self._read_int32(f)
                compressed_size = self._read_int32(f)
                block_data = f.read(compressed_size)
                decompressed_block_data = self._decode_block(block_data, decompressed_size)
                key_list.extend(self._split_key_block(decompressed_block_data))

        self._num_entries = len(key_list)
        return key_list

    def _read_keys_v1v2(self):
        """
        Read the key list of a version 1.x/2.x file.

        Sets ``_num_entries`` and ``_record_block_offset``.
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)

            # the following numbers could be encrypted
            if self._version >= 2.0:
                num_bytes = 8 * 5
            else:
                num_bytes = 4 * 4
            block = f.read(num_bytes)

            if self._encrypt & 1:
                block = _salsa_decrypt(block, self._encrypted_key)

            # decode this block
            sf = BytesIO(block)
            # number of key blocks
            num_key_blocks = self._read_number(sf)
            # number of entries
            self._num_entries = self._read_number(sf)
            # number of bytes of key block info after decompression (skipped)
            if self._version >= 2.0:
                self._read_number(sf)
            # number of bytes of key block info
            key_block_info_size = self._read_number(sf)
            # number of bytes of key block
            key_block_size = self._read_number(sf)

            # 4 bytes: adler checksum of previous 5 numbers
            if self._version >= 2.0:
                adler32 = unpack('>I', f.read(4))[0]
                assert adler32 == (zlib.adler32(block) & 0xffffffff)

            # read key block info, which indicates key block's compressed and decompressed size
            key_block_info = f.read(key_block_info_size)
            key_block_info_list = self._decode_key_block_info(key_block_info)
            assert num_key_blocks == len(key_block_info_list)

            # read key block
            key_block_compressed = f.read(key_block_size)
            # extract key block
            key_list = self._decode_key_block(key_block_compressed, key_block_info_list)

            self._record_block_offset = f.tell()

        return key_list

    def _read_keys_brutal(self):
        """
        Read the key list without knowing the (encrypted) section sizes.

        The end of the key block info is located by scanning for the 4-byte
        marker that starts the first key block.  Sets ``_num_entries`` and
        ``_record_block_offset``.

        :raises RuntimeError: when the marker is not found before end of file
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)

            # the following numbers could be encrypted, disregard them!
            if self._version >= 2.0:
                num_bytes = 8 * 5 + 4
                key_block_type = b'\x02\x00\x00\x00'
            else:
                num_bytes = 4 * 4
                key_block_type = b'\x01\x00\x00\x00'
            f.read(num_bytes)

            # key block info
            # 4 bytes '\x02\x00\x00\x00'
            # 4 bytes adler32 checksum
            # unknown number of bytes follows until '\x02\x00\x00\x00' which marks the beginning of key block
            key_block_info = f.read(8)
            if self._version >= 2.0:
                assert key_block_info[:4] == b'\x02\x00\x00\x00'

            search_base = f.tell()
            scanned = b''
            while True:
                chunk = f.read(1024)
                if not chunk:
                    # without this check the scan would never terminate
                    raise RuntimeError('key block marker not found')
                # step back so a marker straddling two chunks is found too
                resume = max(len(scanned) - len(key_block_type) + 1, 0)
                scanned += chunk
                index = scanned.find(key_block_type, resume)
                if index != -1:
                    break
            key_block_info += scanned[:index]
            f.seek(search_base + index)

            key_block_info_list = self._decode_key_block_info(key_block_info)
            key_block_size = sum(compressed_size for compressed_size, _ in key_block_info_list)

            # read key block
            key_block_compressed = f.read(key_block_size)
            # extract key block
            key_list = self._decode_key_block(key_block_compressed, key_block_info_list)

            self._record_block_offset = f.tell()

        self._num_entries = len(key_list)
        return key_list

    def items(self):
        """Return a generator which in turn produce tuples in the form of (filename, content)
        """
        return self._read_records()

    def _read_records(self):
        """Yield ``(key, record)`` tuples using the reader matching the version."""
        if self._version >= 3:
            yield from self._read_records_v3()
        else:
            yield from self._read_records_v1v2()

    def _split_record_block(self, record_block, offset, i):
        """
        Yield the records of one decoded record block.

        :param record_block: decoded record block bytes
        :param offset: offset of this block within all decoded record data
        :param i: index into the key list of the first key to consider
        :return: (generator return value) index of the first key that lies
                 beyond this block
        """
        key_list = self._key_list
        block_size = len(record_block)
        while i < len(key_list):
            record_start, key_text = key_list[i]
            # reach the end of current record block
            if record_start - offset >= block_size:
                break
            # record end index
            if i < len(key_list) - 1:
                record_end = key_list[i+1][0]
            else:
                record_end = block_size + offset
            i += 1
            data = record_block[record_start-offset:record_end-offset]
            yield key_text, self._treat_record_data(data)
        return i

    def _read_records_v3(self):
        """Yield ``(key, record)`` tuples from a version 3 file."""
        # record index has redundant information about block compressed/decompressed size
        record_index = self._read_record_index()

        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)

            offset = 0
            i = 0

            num_record_blocks = self._read_int32(f)
            self._read_number(f)    # number of bytes, not needed
            for j in range(num_record_blocks):
                decompressed_size = self._read_int32(f)
                compressed_size = self._read_int32(f)

                # check against the record index information
                if (compressed_size + 8, decompressed_size) != record_index[j]:
                    # skip to the next block
                    # NOTE(review): offset and key index are not advanced for the
                    # skipped block, so later records may be misaligned - confirm
                    compressed_size = record_index[j][0] - 8
                    print('Skip (potentially) damaged record block')
                    f.read(compressed_size)
                    continue

                record_block = self._decode_block(f.read(compressed_size), decompressed_size)

                # split record block according to the offset info from key block
                i = yield from self._split_record_block(record_block, offset, i)
                offset += len(record_block)

    def _read_records_v1v2(self):
        """Yield ``(key, record)`` tuples from a version 1.x/2.x file."""
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)

            num_record_blocks = self._read_number(f)
            num_entries = self._read_number(f)
            assert num_entries == self._num_entries
            record_block_info_size = self._read_number(f)
            record_block_size = self._read_number(f)

            # record block info section
            record_block_info_list = []
            for _ in range(num_record_blocks):
                compressed_size = self._read_number(f)
                decompressed_size = self._read_number(f)
                record_block_info_list.append((compressed_size, decompressed_size))
            assert len(record_block_info_list) * self._number_width * 2 == record_block_info_size

            # actual record block
            offset = 0
            i = 0
            size_counter = 0
            for compressed_size, decompressed_size in record_block_info_list:
                record_block = self._decode_block(f.read(compressed_size), decompressed_size)

                # split record block according to the offset info from key block
                i = yield from self._split_record_block(record_block, offset, i)
                offset += len(record_block)
                size_counter += compressed_size
            assert size_counter == record_block_size

    def _read_record_index(self):
        """
        Read the record index of a version 3 file.

        :return: list of ``(block_size, decompressed_size)`` tuples
        :raises Exception: when an index block is not a multiple of 16 bytes
        """
        record_index = []
        with open(self._fname, 'rb') as f:
            f.seek(self._record_index_offset)
            num_record_blocks = self._read_int32(f)
            self._read_number(f)    # number of bytes, not needed

            for _ in range(num_record_blocks):
                decompressed_size = self._read_int32(f)
                compressed_size = self._read_int32(f)
                record_block = self._decode_block(f.read(compressed_size), decompressed_size)
                if len(record_block) % 16 != 0:
                    raise Exception('record index block has invalid size %d' % len(record_block))

                # each entry is two big-endian 64-bit numbers
                for j in range(0, len(record_block), 16):
                    record_index.append(unpack('>QQ', record_block[j:j+16]))
        return record_index

    def _treat_record_data(self, data):
        """Hook for subclasses to post-process record bytes; returns them unchanged here."""
        return data
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
class MDD(MDict):
    """
    Reader for MDict resource files (*.MDD).

    Keys are file names and values are the raw file contents.

    >>> mdd = MDD('example.mdd')
    >>> len(mdd)
    208
    >>> for filename, content in mdd.items():
    ...     print(filename, content[:10])
    """
    def __init__(self, fname, passcode=None):
        """
        :param fname: path of the .mdd file
        :param passcode: optional ``(regcode, userid)`` tuple
        """
        # the encoding is passed explicitly as UTF-16 instead of being
        # taken from the header
        MDict.__init__(self, fname, 'UTF-16', passcode)
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
class MDX(MDict):
    """
    MDict dictionary file format (*.MDX) reader.

    Keys are the headwords and values the entry texts, both as UTF-8 bytes.

    >>> mdx = MDX('example.mdx')
    >>> len(mdx)
    42481
    >>> for key, value in mdx.items():
    ...     print(key, value[:10])
    """
    def __init__(self, fname, encoding='', substyle=False, passcode=None):
        """
        :param fname: path of the .mdx file
        :param encoding: text encoding; empty means "take it from the header"
        :param substyle: substitute stylesheet references in the records
        :param passcode: optional ``(regcode, userid)`` tuple
        """
        MDict.__init__(self, fname, encoding, passcode)
        self._substyle = substyle

    def _substitute_stylesheet(self, txt):
        """
        Substitute stylesheet references of the form `number` in a record.

        The text following each reference is wrapped in the style's begin and
        end strings.  A segment ending with a newline has its trailing
        whitespace replaced by a single CR LF placed after the end string.

        :param txt: record text as bytes
        :return: styled record text as bytes
        """
        # substitute stylesheet definition
        txt_list = re.split(rb'`\d+`', txt)
        txt_tag = re.findall(rb'`\d+`', txt)
        styled = [txt_list[0]]
        for tag, segment in zip(txt_tag, txt_list[1:]):
            # a reference to an undefined style leaves the text unstyled
            style_begin, style_end = self._stylesheet.get(tag[1:-1], (b'', b''))
            # txt is bytes, so all comparisons and literals must be bytes too
            if segment.endswith(b'\n'):
                styled.extend((style_begin, segment.rstrip(), style_end, b'\r\n'))
            else:
                styled.extend((style_begin, segment, style_end))
        return b''.join(styled)

    def _treat_record_data(self, data):
        """Convert a raw record to UTF-8 bytes and optionally apply the stylesheet."""
        # convert to utf-8
        data = data.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
        # substitute styles
        if self._substyle and self._stylesheet:
            data = self._substitute_stylesheet(data)
        return data
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
if __name__ == '__main__':
    # Command line front end: print a summary of an mdx file and its
    # companion mdd file(s) and optionally extract their contents.
    import sys
    import os
    import os.path
    import argparse
    import codecs

    def passcode(s):
        """argparse type for -p: parse 'regcode,userid' into (bytes, str)."""
        try:
            regcode, userid = s.split(',')
        except ValueError:
            raise argparse.ArgumentTypeError("Passcode must be regcode,userid")
        try:
            regcode = codecs.decode(regcode, 'hex')
        except (ValueError, TypeError):
            raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string")
        return regcode, userid

    def print_summary(fname, mdict):
        """Print file name, number of entries and header attributes."""
        bfname = fname.encode('utf-8') if isinstance(fname, unicode) else fname
        print('======== %s ========' % bfname)
        print('  Number of Entries : %d' % len(mdict))
        for key, value in mdict.header.items():
            print('  %s : %s' % (key, value))

    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--extract', action="store_true",
                        help='extract mdx to source format and extract files from mdd')
    parser.add_argument('-s', '--substyle', action="store_true",
                        help='substitute style definition if present')
    parser.add_argument('-d', '--datafolder', default="data",
                        help='folder to extract data files from mdd')
    parser.add_argument('-e', '--encoding', default="",
                        help='override the encoding specified in the mdx file')
    parser.add_argument('-p', '--passcode', default=None, type=passcode,
                        help='register_code,email_or_deviceid')
    parser.add_argument("filename", nargs='?', help="mdx file name")
    args = parser.parse_args()

    # use GUI to select file, default to extract
    if not args.filename:
        if sys.hexversion < 0x03000000:
            import Tkinter
            import tkFileDialog
        else:
            import tkinter as Tkinter
            import tkinter.filedialog as tkFileDialog
        root = Tkinter.Tk()
        root.withdraw()
        args.filename = tkFileDialog.askopenfilename(parent=root)
        args.extract = True

    if not args.filename or not os.path.exists(args.filename):
        print("Please specify a valid MDX/MDD file")
        # nothing useful can be done without an input file
        sys.exit(1)

    base, ext = os.path.splitext(args.filename)

    # read mdx file
    if ext.lower() == os.path.extsep + 'mdx':
        mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
        print_summary(args.filename, mdx)
    else:
        mdx = None

    # find companion mdd file(s): base.mdd, base.1.mdd, base.2.mdd, ...
    i = 0
    mdds = []
    while True:
        extra = '' if i == 0 else '.%d' % i
        mdd_filename = ''.join([base, extra, os.path.extsep, 'mdd'])
        if not os.path.exists(mdd_filename):
            break
        mdd = MDD(mdd_filename, args.passcode)
        print_summary(mdd_filename, mdd)
        mdds.append(mdd)
        i += 1

    if args.extract:
        # write out glos
        # (compare with None: an MDX with zero entries is falsy via __len__)
        if mdx is not None:
            output_fname = ''.join([base, os.path.extsep, 'txt'])
            with open(output_fname, 'wb') as tf:
                for key, value in mdx.items():
                    tf.write(key)
                    tf.write(b'\r\n')
                    tf.write(value)
                    if not value.endswith(b'\n'):
                        tf.write(b'\r\n')
                    tf.write(b'</>\r\n')
            # write out style
            if mdx.header.get(b'StyleSheet'):
                style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
                with open(style_fname, 'wb') as sf:
                    sf.write(b'\r\n'.join(mdx.header[b'StyleSheet'].splitlines()))
        # write out optional data files
        if mdds:
            datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder)
            os.makedirs(datafolder, exist_ok=True)
            data_root = os.path.abspath(datafolder)
            if data_root.endswith(os.path.sep):
                data_prefix = data_root
            else:
                data_prefix = data_root + os.path.sep
            for mdd in mdds:
                for key, value in mdd.items():
                    fname = key.decode('utf-8').replace('\\', os.path.sep)
                    dfname = os.path.abspath(os.path.join(data_root, fname.lstrip('\\/')))
                    # resource names come from the file; never write outside
                    # the data folder (e.g. names containing '..')
                    if not dfname.startswith(data_prefix):
                        print('Skip resource outside of data folder: %r' % fname)
                        continue
                    os.makedirs(os.path.dirname(dfname), exist_ok=True)
                    with open(dfname, 'wb') as df:
                        df.write(value)
|