fstdtools 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,802 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # readmdict.py
4
+ # Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
5
+ #
6
+ # Copyright (C) 2012, 2013, 2015, 2022, 2023 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
7
+ #
8
+ # This program is a free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, version 3 of the License.
11
+ #
12
+ # You can get a copy of GNU General Public License along this program
13
+ # But you can always get it from http://www.gnu.org/licenses/gpl.txt
14
+ #
15
+ # This program is distributed in the hope that it will be useful,
16
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ # GNU General Public License for more details.
19
+
20
+ from struct import pack, unpack
21
+ from io import BytesIO
22
+ import re
23
+ import sys
24
+
25
+ from .ripemd128 import ripemd128
26
+ from .pureSalsa20 import Salsa20
27
+
28
+ # zlib compression is used for engine version >=2.0
29
+ import zlib
30
+ # LZO compression is used for engine version < 2.0
31
+ try:
32
+ import lzo
33
+ except ImportError:
34
+ lzo = None
35
+
36
+ # xxhash is used for engine version >= 3.0
37
+ try:
38
+ import xxhash
39
+ except ImportError:
40
+ xxhash = None
41
+
42
# Python 2/3 compatibility: Python 3 has no ``unicode`` builtin, so alias it to ``str``
if sys.version_info[0] >= 3:
    unicode = str
45
+
46
+
47
+ def _unescape_entities(text):
48
+ """
49
+ unescape offending tags < > " &
50
+ """
51
+ text = text.replace(b'&lt;', b'<')
52
+ text = text.replace(b'&gt;', b'>')
53
+ text = text.replace(b'&quot;', b'"')
54
+ text = text.replace(b'&amp;', b'&')
55
+ return text
56
+
57
+
58
+ def _fast_decrypt(data, key):
59
+ """
60
+ XOR decryption
61
+ """
62
+ b = bytearray(data)
63
+ key = bytearray(key)
64
+ previous = 0x36
65
+ for i in range(len(b)):
66
+ t = (b[i] >> 4 | b[i] << 4) & 0xff
67
+ t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]
68
+ previous = b[i]
69
+ b[i] = t
70
+ return bytes(b)
71
+
72
+
73
def _salsa_decrypt(ciphertext, encrypt_key):
    """
    Decrypt *ciphertext* with Salsa20 (8 rounds, all-zero 8-byte IV) keyed by *encrypt_key*.
    """
    zero_iv = b"\x00" * 8
    cipher = Salsa20(key=encrypt_key, IV=zero_iv, rounds=8)
    plaintext = cipher.encryptBytes(ciphertext)
    return plaintext
79
+
80
+
81
def _decrypt_regcode_by_userid(reg_code, userid):
    """
    Recover the encryption key from a registration code.

    The RIPEMD-128 digest of *userid* is used as the Salsa20 key (8 rounds,
    all-zero 8-byte IV) with which *reg_code* is processed.
    """
    cipher = Salsa20(key=ripemd128(userid), IV=b"\x00" * 8, rounds=8)
    return cipher.encryptBytes(reg_code)
86
+
87
+
88
class MDict(object):
    """
    Base class which reads in header and key block.

    It is the code sharing base of the MDX and MDD readers: constructing an
    instance parses the file header, derives the decryption key (when a
    passcode or a version 3 UUID is available) and loads the key list.
    Content is then reachable through len(), iteration/keys() and items().
    """
    def __init__(self, fname, encoding='', passcode=None):
        """
        fname    -- path of the file to read
        encoding -- key/record text encoding; empty means "take it from the header"
        passcode -- optional (regcode, userid) pair used to derive the encryption key
        """
        self._fname = fname
        self._encoding = encoding.upper()
        self._encrypted_key = None

        self.header = self._read_header()

        # decrypt regcode to get the encrypted key
        if passcode is not None:
            regcode, userid = passcode
            if isinstance(userid, unicode):
                userid = userid.encode('utf8')
            self._encrypted_key = _decrypt_regcode_by_userid(regcode, userid)
        # MDict 3.0 encryption key derives from UUID if present
        elif self._version >= 3.0:
            uuid = self.header.get(b'UUID')
            if uuid:
                if xxhash is None:
                    raise RuntimeError('xxhash module is needed to read MDict 3.0 format')
                mid = (len(uuid) + 1) // 2
                self._encrypted_key = xxhash.xxh64_digest(uuid[:mid]) + xxhash.xxh64_digest(uuid[mid:])

        self._key_list = self._read_keys()

    def __len__(self):
        return self._num_entries

    def __iter__(self):
        return self.keys()

    @staticmethod
    def _check(condition, message):
        """
        Raise AssertionError(message) unless condition holds.

        Used instead of the assert statement so that integrity checks on file
        content are not stripped when Python runs with -O; the exception type
        is the same one an assert statement raises.
        """
        if not condition:
            raise AssertionError(message)

    def keys(self):
        """
        Return an iterator over dictionary keys.
        """
        return (key_value for key_id, key_value in self._key_list)

    def _read_number(self, f):
        """Read one big-endian number of the version dependent width from f."""
        return unpack(self._number_format, f.read(self._number_width))[0]

    def _read_int32(self, f):
        """Read one big-endian unsigned 32 bit integer from f."""
        return unpack('>I', f.read(4))[0]

    def _parse_header(self, header):
        """
        extract attributes from <Dict attr="value" ... >

        Returns a dict mapping attribute name to unescaped value (both bytes).
        """
        taglist = re.findall(rb'(\w+)="(.*?)"', header, re.DOTALL)
        return dict((key, _unescape_entities(value)) for key, value in taglist)

    def _decode_block(self, block, decompressed_size):
        """
        Decrypt, verify and decompress one data block.

        block layout: 4 bytes little-endian info word (compression method in
        bits 0-3, encryption method in bits 4-7, encrypted size in bits 8-15),
        4 bytes big-endian adler32 checksum, then the payload.
        decompressed_size is only needed for LZO compressed payloads.
        Raises AssertionError on checksum mismatch, Exception on an unknown
        compression/encryption method and RuntimeError when LZO is required
        but the lzo module is missing.
        """
        # block info: compression, encryption
        info = unpack('<L', block[:4])[0]
        compression_method = info & 0xf
        encryption_method = (info >> 4) & 0xf
        encryption_size = (info >> 8) & 0xff

        adler32 = unpack('>I', block[4:8])[0]

        # block data
        data = block[8:]

        # decrypt
        if encryption_method == 0:
            decrypted_block = data
        elif encryption_method in (1, 2):
            # the checksum bytes are hashed into the key if none was given;
            # only derived here so unencrypted blocks skip the hashing
            encrypted_key = self._encrypted_key
            if encrypted_key is None:
                encrypted_key = ripemd128(block[4:8])
            decrypt = _fast_decrypt if encryption_method == 1 else _salsa_decrypt
            # only the first encryption_size bytes are encrypted
            decrypted_block = decrypt(data[:encryption_size], encrypted_key) + data[encryption_size:]
        else:
            raise Exception('encryption method %d not supported' % encryption_method)

        # check adler checksum over decrypted data
        if self._version >= 3:
            self._check(adler32 == zlib.adler32(decrypted_block) & 0xffffffff,
                        'block checksum mismatch')

        # decompress
        if compression_method == 0:
            decompressed_block = decrypted_block
        elif compression_method == 1:
            if lzo is None:
                raise RuntimeError("LZO compression is not supported")
            header = b'\xf0' + pack('>I', decompressed_size)
            decompressed_block = lzo.decompress(header + decrypted_block)
        elif compression_method == 2:
            decompressed_block = zlib.decompress(decrypted_block)
        else:
            raise Exception('compression method %d not supported' % compression_method)

        # check adler checksum over decompressed data
        if self._version < 3:
            self._check(adler32 == zlib.adler32(decompressed_block) & 0xffffffff,
                        'block checksum mismatch')

        return decompressed_block

    def _decode_key_block_info(self, key_block_info_compressed):
        """
        Decode the key block info section.

        Returns a list of (compressed_size, decompressed_size) tuples, one per
        key block.
        """
        if self._version >= 2:
            # zlib compression
            self._check(key_block_info_compressed[:4] == b'\x02\x00\x00\x00',
                        'unexpected key block info type')
            # decrypt if needed
            if self._encrypt & 0x02:
                key = ripemd128(key_block_info_compressed[4:8] + pack('<L', 0x3695))
                key_block_info_compressed = key_block_info_compressed[:8] + _fast_decrypt(key_block_info_compressed[8:], key)
            # decompress
            key_block_info = zlib.decompress(key_block_info_compressed[8:])
            # adler checksum
            adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
            self._check(adler32 == zlib.adler32(key_block_info) & 0xffffffff,
                        'key block info checksum mismatch')
            byte_format = '>H'
            byte_width = 2
            text_term = 1
        else:
            # no compression
            key_block_info = key_block_info_compressed
            byte_format = '>B'
            byte_width = 1
            text_term = 0

        number_format = self._number_format
        number_width = self._number_width
        # text sizes are counted in characters; UTF-16 characters take two bytes
        char_width = 2 if self._encoding == 'UTF-16' else 1

        # decode
        key_block_info_list = []
        i = 0
        while i < len(key_block_info):
            # number of entries in current key block (not needed, skipped)
            i += number_width
            # text head size
            text_head_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
            i += byte_width
            # text head
            i += (text_head_size + text_term) * char_width
            # text tail size
            text_tail_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
            i += byte_width
            # text tail
            i += (text_tail_size + text_term) * char_width
            # key block compressed size
            key_block_compressed_size = unpack(number_format, key_block_info[i:i+number_width])[0]
            i += number_width
            # key block decompressed size
            key_block_decompressed_size = unpack(number_format, key_block_info[i:i+number_width])[0]
            i += number_width
            key_block_info_list.append((key_block_compressed_size, key_block_decompressed_size))

        return key_block_info_list

    def _decode_key_block(self, key_block_compressed, key_block_info_list):
        """Decode all consecutive key blocks into one list of (key_id, key_text)."""
        key_list = []
        i = 0
        for compressed_size, decompressed_size in key_block_info_list:
            key_block = self._decode_block(key_block_compressed[i:i+compressed_size], decompressed_size)
            # extract one single key block into a key list
            key_list.extend(self._split_key_block(key_block))
            i += compressed_size
        return key_list

    def _split_key_block(self, key_block):
        """
        Split one decoded key block into a list of (key_id, key_text).

        key_id is the corresponding record's offset in the record block and
        key_text is the key re-encoded as UTF-8 bytes with surrounding
        whitespace stripped.
        """
        # key text ends with '\x00'
        if self._encoding == 'UTF-16':
            delimiter = b'\x00\x00'
            width = 2
        else:
            delimiter = b'\x00'
            width = 1

        key_list = []
        block_size = len(key_block)
        key_start_index = 0
        while key_start_index < block_size:
            text_start = key_start_index + self._number_width
            # the corresponding record's offset in record block
            key_id = unpack(self._number_format, key_block[key_start_index:text_start])[0]
            # a key without terminator runs to the end of the block; without
            # this default a stale end index would be reused (or be unbound)
            key_end_index = block_size
            i = text_start
            while i < block_size:
                if key_block[i:i+width] == delimiter:
                    key_end_index = i
                    break
                i += width
            key_text = key_block[text_start:key_end_index]\
                .decode(self._encoding, errors='ignore').encode('utf-8').strip()
            key_start_index = key_end_index + width
            key_list.append((key_id, key_text))
        return key_list

    def _read_header(self):
        """
        Read and parse the file header.

        Sets _key_block_offset, _encoding (unless given by the caller),
        _encrypt, _stylesheet, _version, _number_width and _number_format,
        and returns the header attributes as a dict of bytes to bytes.
        """
        with open(self._fname, 'rb') as f:
            # number of bytes of header text
            header_bytes_size = unpack('>I', f.read(4))[0]
            header_bytes = f.read(header_bytes_size)
            # 4 bytes: adler32 checksum of header, in little endian
            adler32 = unpack('<I', f.read(4))[0]
            self._check(adler32 == zlib.adler32(header_bytes) & 0xffffffff,
                        'header checksum mismatch')
            # mark down key block offset
            self._key_block_offset = f.tell()

        # header text in utf-16 encoding ending with '\x00\x00'
        if header_bytes[-2:] == b'\x00\x00':
            header_text = header_bytes[:-2].decode('utf-16').encode('utf-8')
        else:
            header_text = header_bytes[:-1]
        header_tag = self._parse_header(header_text)

        if not self._encoding:
            encoding = header_tag.get(b'Encoding', b'utf-8')
            if sys.hexversion >= 0x03000000:
                encoding = encoding.decode('utf-8')
            # normalise case like the constructor argument, so the
            # comparisons against 'UTF-16'/'GBK' below and elsewhere match
            encoding = encoding.upper()
            # GB18030 > GBK > GB2312
            if encoding in ('GBK', 'GB2312'):
                encoding = 'GB18030'
            self._encoding = encoding

        # encryption flag
        #   0x00 - no encryption, "Allow export to text" is checked in MdxBuilder 3.
        #   0x01 - encrypt record block, "Encryption Key" is given in MdxBuilder 3.
        #   0x02 - encrypt key info block, "Allow export to text" is unchecked in MdxBuilder 3.
        if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No':
            self._encrypt = 0
        elif header_tag[b'Encrypted'] == b'Yes':
            self._encrypt = 1
        else:
            self._encrypt = int(header_tag[b'Encrypted'])

        # stylesheet attribute if present takes form of:
        #   style_number # 1-255
        #   style_begin  # or ''
        #   style_end    # or ''
        # store stylesheet in dict in the form of
        # {'number' : ('style_begin', 'style_end')}
        self._stylesheet = {}
        if header_tag.get(b'StyleSheet'):
            lines = header_tag[b'StyleSheet'].splitlines()
            # stop before an incomplete trailing group instead of failing on it
            for i in range(0, len(lines) - 2, 3):
                self._stylesheet[lines[i]] = (lines[i+1], lines[i+2])

        # before version 2.0, number is 4 bytes integer
        # version 2.0 and above uses 8 bytes
        self._version = float(header_tag[b'GeneratedByEngineVersion'])
        if self._version < 2.0:
            self._number_width = 4
            self._number_format = '>I'
        else:
            self._number_width = 8
            self._number_format = '>Q'
            # version 3.0 uses UTF-8 only
            if self._version >= 3:
                self._encoding = 'UTF-8'

        return header_tag

    def _read_keys(self):
        """Load the key list with the reader matching version and encryption."""
        if self._version >= 3:
            return self._read_keys_v3()
        # if no regcode is given, try brutal force (only for engine <= 2)
        if (self._encrypt & 0x01) and self._encrypted_key is None:
            print("Try Brutal Force on Encrypted Key Blocks")
            return self._read_keys_brutal()
        return self._read_keys_v1v2()

    def _read_keys_v3(self):
        """
        Read the key list of a version 3 file.

        Records the offsets of the four top level blocks, sets _num_entries
        and returns the list of (key_id, key_text).
        """
        # block type -> attribute that stores the offset of the block content
        offset_attributes = {
            0x01000000: '_record_block_offset',  # record data
            0x02000000: '_record_index_offset',  # record index
            0x03000000: '_key_data_offset',      # key data
            0x04000000: '_key_index_offset',     # key index
        }
        key_list = []
        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)

            # find all blocks offset
            while True:
                block_type = self._read_int32(f)
                block_size = self._read_number(f)
                if block_type not in offset_attributes:
                    raise RuntimeError("Unknown block type %d" % block_type)
                setattr(self, offset_attributes[block_type], f.tell())
                f.seek(block_size, 1)
                # test the end of file
                if f.read(4):
                    f.seek(-4, 1)
                else:
                    break

            # read key data
            f.seek(self._key_data_offset)
            number = self._read_int32(f)
            self._read_number(f)  # total size, not needed
            for _ in range(number):
                decompressed_size = self._read_int32(f)
                compressed_size = self._read_int32(f)
                block_data = f.read(compressed_size)
                decompressed_block_data = self._decode_block(block_data, decompressed_size)
                key_list.extend(self._split_key_block(decompressed_block_data))

        self._num_entries = len(key_list)
        return key_list

    def _read_keys_v1v2(self):
        """
        Read the key list of a version 1.x/2.x file.

        Sets _num_entries and _record_block_offset and returns the list of
        (key_id, key_text).
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)

            # the following numbers could be encrypted
            if self._version >= 2.0:
                num_bytes = 8 * 5
            else:
                num_bytes = 4 * 4
            block = f.read(num_bytes)

            if self._encrypt & 1:
                block = _salsa_decrypt(block, self._encrypted_key)

            # decode this block
            sf = BytesIO(block)
            # number of key blocks
            num_key_blocks = self._read_number(sf)
            # number of entries
            self._num_entries = self._read_number(sf)
            # number of bytes of key block info after decompression (not needed)
            if self._version >= 2.0:
                self._read_number(sf)
            # number of bytes of key block info
            key_block_info_size = self._read_number(sf)
            # number of bytes of key block
            key_block_size = self._read_number(sf)

            # 4 bytes: adler checksum of previous 5 numbers
            if self._version >= 2.0:
                adler32 = unpack('>I', f.read(4))[0]
                self._check(adler32 == (zlib.adler32(block) & 0xffffffff),
                            'key block header checksum mismatch')

            # read key block info, which indicates key block's compressed and decompressed size
            key_block_info = f.read(key_block_info_size)
            key_block_info_list = self._decode_key_block_info(key_block_info)
            self._check(num_key_blocks == len(key_block_info_list),
                        'unexpected number of key blocks')

            # read key block
            key_block_compressed = f.read(key_block_size)
            # extract key block
            key_list = self._decode_key_block(key_block_compressed, key_block_info_list)

            self._record_block_offset = f.tell()

        return key_list

    def _read_keys_brutal(self):
        """
        Read the key list without knowing the (encrypted) key block header.

        The key block info is delimited by scanning for the type marker that
        starts the first key block. Sets _num_entries and
        _record_block_offset and returns the list of (key_id, key_text).
        Raises RuntimeError when the marker is not found before end of file.
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)

            # the following numbers could be encrypted, disregard them!
            if self._version >= 2.0:
                num_bytes = 8 * 5 + 4
                key_block_type = b'\x02\x00\x00\x00'
            else:
                num_bytes = 4 * 4
                key_block_type = b'\x01\x00\x00\x00'
            f.read(num_bytes)

            # key block info
            # 4 bytes '\x02\x00\x00\x00'
            # 4 bytes adler32 checksum
            # unknown number of bytes follows until '\x02\x00\x00\x00' which marks the beginning of key block
            key_block_info = f.read(8)
            if self._version >= 2.0:
                self._check(key_block_info[:4] == b'\x02\x00\x00\x00',
                            'unexpected key block info type')

            scan_start = f.tell()
            scanned = bytearray()
            while True:
                chunk = f.read(1024)
                if not chunk:
                    # without this the scan would never terminate at end of file
                    raise RuntimeError('key block marker not found')
                # restart just before the old end so a marker split across
                # two chunks is still detected
                search_from = max(0, len(scanned) - (len(key_block_type) - 1))
                scanned += chunk
                index = scanned.find(key_block_type, search_from)
                if index != -1:
                    key_block_info += bytes(scanned[:index])
                    f.seek(scan_start + index)
                    break

            key_block_info_list = self._decode_key_block_info(key_block_info)
            key_block_size = sum(compressed_size for compressed_size, _ in key_block_info_list)

            # read key block
            key_block_compressed = f.read(key_block_size)
            # extract key block
            key_list = self._decode_key_block(key_block_compressed, key_block_info_list)

            self._record_block_offset = f.tell()

        self._num_entries = len(key_list)
        return key_list

    def items(self):
        """Return a generator which in turn produce tuples in the form of (filename, content)
        """
        return self._read_records()

    def _read_records(self):
        """Yield (key_text, record) using the reader matching the file version."""
        if self._version >= 3:
            yield from self._read_records_v3()
        else:
            yield from self._read_records_v1v2()

    def _iter_block_records(self, record_block, offset, i):
        """
        Yield (key_text, record) for the keys whose record starts in record_block.

        offset is the position of record_block within the concatenation of all
        decoded record blocks and i the index of the first key not yet
        handled. The generator's return value (received through ``yield
        from``) is the index of the first key belonging to a later block.
        """
        key_list = self._key_list
        while i < len(key_list):
            record_start, key_text = key_list[i]
            # reach the end of current record block
            if record_start - offset >= len(record_block):
                break
            # record end index
            if i < len(key_list) - 1:
                record_end = key_list[i+1][0]
            else:
                record_end = len(record_block) + offset
            i += 1
            data = record_block[record_start-offset:record_end-offset]
            yield key_text, self._treat_record_data(data)
        return i

    def _read_records_v3(self):
        """Yield (key_text, record) for every entry of a version 3 file."""
        # record index has redudant information about block compressed/decompresed size
        record_index = self._read_record_index()

        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)

            offset = 0
            i = 0

            num_record_blocks = self._read_int32(f)
            self._read_number(f)  # number of bytes, not needed
            for j in range(num_record_blocks):
                decompressed_size = self._read_int32(f)
                compressed_size = self._read_int32(f)

                # check against the record index information
                if (compressed_size + 8, decompressed_size) != record_index[j]:
                    # skip to the next block, trusting the size from the index
                    # NOTE(review): offset is not advanced for a skipped block,
                    # so later keys are sliced relative to the old offset - confirm
                    print('Skip (potentially) damaged record block')
                    f.read(record_index[j][0] - 8)
                    continue

                record_block = self._decode_block(f.read(compressed_size), decompressed_size)

                # split record block according to the offset info from key block
                i = yield from self._iter_block_records(record_block, offset, i)
                offset += len(record_block)

    def _read_records_v1v2(self):
        """Yield (key_text, record) for every entry of a version 1.x/2.x file."""
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)

            num_record_blocks = self._read_number(f)
            num_entries = self._read_number(f)
            self._check(num_entries == self._num_entries,
                        'record section entry count mismatch')
            record_block_info_size = self._read_number(f)
            record_block_size = self._read_number(f)

            # record block info section
            record_block_info_list = []
            size_counter = 0
            for _ in range(num_record_blocks):
                compressed_size = self._read_number(f)
                decompressed_size = self._read_number(f)
                record_block_info_list.append((compressed_size, decompressed_size))
                size_counter += self._number_width * 2
            self._check(size_counter == record_block_info_size,
                        'record block info size mismatch')

            # actual record block
            offset = 0
            i = 0
            size_counter = 0
            for compressed_size, decompressed_size in record_block_info_list:
                record_block = self._decode_block(f.read(compressed_size), decompressed_size)

                # split record block according to the offset info from key block
                i = yield from self._iter_block_records(record_block, offset, i)
                offset += len(record_block)
                size_counter += compressed_size
            self._check(size_counter == record_block_size,
                        'record block size mismatch')

    def _read_record_index(self):
        """
        Read the record index of a version 3 file.

        Returns a list of (block_size, decompressed_size) tuples taken from
        the decoded index blocks. Raises Exception if a decoded index block
        is not a multiple of 16 bytes long.
        """
        record_index = []
        with open(self._fname, 'rb') as f:
            f.seek(self._record_index_offset)
            num_record_blocks = self._read_int32(f)
            self._read_number(f)  # number of bytes, not needed

            for _ in range(num_record_blocks):
                decompressed_size = self._read_int32(f)
                compressed_size = self._read_int32(f)
                record_block = self._decode_block(f.read(compressed_size), decompressed_size)
                if len(record_block) % 16 != 0:
                    raise Exception('record index block has invalid size %d' % len(record_block))

                # each entry is a pair of big-endian 64 bit numbers
                for j in range(0, len(record_block), 16):
                    record_index.append(unpack('>QQ', record_block[j:j+16]))
        return record_index

    def _treat_record_data(self, data):
        """Hook for subclasses to post-process a record; the base returns it unchanged."""
        return data
635
+
636
+
637
class MDD(MDict):
    """
    MDict resource file format (*.MDD) reader.

    Keys are always read as UTF-16.

    >>> mdd = MDD('example.mdd')
    >>> len(mdd)
    208
    >>> for filename, content in mdd.items():
    ...     print(filename, content[:10])
    """
    def __init__(self, fname, passcode=None):
        # resource files always use UTF-16 keys, so the encoding is fixed here
        MDict.__init__(self, fname, 'UTF-16', passcode)
648
+
649
+
650
class MDX(MDict):
    """
    MDict dictionary file format (*.MDX) reader.

    Records are returned as UTF-8 encoded bytes; with substyle=True the
    `N` style markers are replaced using the stylesheet from the header.

    >>> mdx = MDX('example.mdx')
    >>> len(mdx)
    42481
    >>> for key, value in mdx.items():
    ...     print(key, value[:10])
    """
    def __init__(self, fname, encoding='', substyle=False, passcode=None):
        MDict.__init__(self, fname, encoding, passcode)
        self._substyle = substyle

    def _substitute_stylesheet(self, txt):
        """
        Replace every `N` marker in txt (bytes) by the style it refers to.

        The text following a marker is wrapped in the (begin, end) pair stored
        for that style number. Raises KeyError when a marker refers to a style
        number missing from the stylesheet.
        """
        # substitute stylesheet definition
        txt_list = re.split(rb'`\d+`', txt)
        txt_tag = re.findall(rb'`\d+`', txt)
        pieces = [txt_list[0]]
        for tag, part in zip(txt_tag, txt_list[1:]):
            style_begin, style_end = self._stylesheet[tag[1:-1]]
            # txt is bytes, so the newline test and the appended line break
            # must be bytes as well (a str comparison never matches on Python 3)
            if part.endswith(b'\n'):
                pieces.extend((style_begin, part.rstrip(), style_end, b'\r\n'))
            else:
                pieces.extend((style_begin, part, style_end))
        return b''.join(pieces)

    def _treat_record_data(self, data):
        """Convert a raw record to UTF-8 bytes and optionally apply the stylesheet."""
        # convert to utf-8
        data = data.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
        # substitute styles
        if self._substyle and self._stylesheet:
            data = self._substitute_stylesheet(data)
        return data
683
+
684
+
685
if __name__ == '__main__':
    # Command line front end: print a summary of an mdx file and its companion
    # mdd files and optionally extract their content.
    import sys
    import os
    import os.path
    import argparse
    import codecs

    def passcode(s):
        """argparse type: turn 'regcode,userid' into (regcode bytes, userid)."""
        try:
            regcode, userid = s.split(',')
        except ValueError:
            raise argparse.ArgumentTypeError("Passcode must be regcode,userid")
        try:
            regcode = codecs.decode(regcode, 'hex')
        except (ValueError, TypeError):
            raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string")
        return regcode, userid

    def as_text(value):
        """Decode header bytes for display so they are not printed as b'...'."""
        if isinstance(value, bytes):
            return value.decode('utf-8', 'replace')
        return value

    def print_summary(fname, mdict):
        """Print file name, number of entries and all header attributes."""
        print('======== %s ========' % fname)
        print(' Number of Entries : %d' % len(mdict))
        for key, value in mdict.header.items():
            print(' %s : %s' % (as_text(key), as_text(value)))

    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--extract', action="store_true",
                        help='extract mdx to source format and extract files from mdd')
    parser.add_argument('-s', '--substyle', action="store_true",
                        help='substitute style definition if present')
    parser.add_argument('-d', '--datafolder', default="data",
                        help='folder to extract data files from mdd')
    parser.add_argument('-e', '--encoding', default="",
                        help='override the encoding specified in the mdx file')
    parser.add_argument('-p', '--passcode', default=None, type=passcode,
                        help='register_code,email_or_deviceid')
    parser.add_argument("filename", nargs='?', help="mdx file name")
    args = parser.parse_args()

    # use GUI to select file, default to extract
    if not args.filename:
        if sys.hexversion < 0x03000000:
            import Tkinter
            import tkFileDialog
        else:
            import tkinter as Tkinter
            import tkinter.filedialog as tkFileDialog
        root = Tkinter.Tk()
        root.withdraw()
        args.filename = tkFileDialog.askopenfilename(parent=root)
        args.extract = True

    if not os.path.exists(args.filename):
        print("Please specify a valid MDX/MDD file")
        # nothing sensible can be done without an input file
        sys.exit(1)

    base, ext = os.path.splitext(args.filename)

    # read mdx file
    if ext.lower() == os.path.extsep + 'mdx':
        mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
        print_summary(args.filename, mdx)
    else:
        mdx = None

    # find companion mdd file(s): base.mdd, base.1.mdd, base.2.mdd, ...
    i = 0
    mdds = []
    while True:
        extra = '' if i == 0 else '.%d' % i
        mdd_filename = ''.join([base, extra, os.path.extsep, 'mdd'])
        if not os.path.exists(mdd_filename):
            break
        mdd = MDD(mdd_filename, args.passcode)
        print_summary(mdd_filename, mdd)
        mdds.append(mdd)
        i += 1

    if args.extract:
        # write out glos
        # (explicit None test: an MDX with zero entries is falsy through __len__)
        if mdx is not None:
            output_fname = ''.join([base, os.path.extsep, 'txt'])
            with open(output_fname, 'wb') as tf:
                for key, value in mdx.items():
                    tf.write(key)
                    tf.write(b'\r\n')
                    tf.write(value)
                    if not value.endswith(b'\n'):
                        tf.write(b'\r\n')
                    tf.write(b'</>\r\n')
            # write out style
            if mdx.header.get(b'StyleSheet'):
                style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
                with open(style_fname, 'wb') as sf:
                    sf.write(b'\r\n'.join(mdx.header[b'StyleSheet'].splitlines()))
        # write out optional data files
        if mdds:
            datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder)
            if not os.path.exists(datafolder):
                os.makedirs(datafolder)
            # absolute folder path with a trailing separator, used to verify
            # that every extracted file stays inside the data folder
            data_root = os.path.join(os.path.abspath(datafolder), '')
            for mdd in mdds:
                for key, value in mdd.items():
                    fname = key.decode('utf-8').replace('\\', os.path.sep)
                    dfname = os.path.abspath(os.path.join(datafolder, fname.lstrip('/\\')))
                    # resource names come from the file being read; refuse
                    # names (e.g. containing '..') that leave the data folder
                    if not dfname.startswith(data_root):
                        print('Skip resource with unsafe path: %s' % fname)
                        continue
                    if not os.path.exists(os.path.dirname(dfname)):
                        os.makedirs(os.path.dirname(dfname))
                    with open(dfname, 'wb') as df:
                        df.write(value)