wenlin_db_scanner 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,27 @@
1
+ versionOneFunnyMask
2
+ [08 40 80 01 20 02 04 10 04 10 01 80 08 02 40 20 40 80 08 20 04 10 02 01 02 04 10 01 80 40 20 08 80 04 02 20 01 08 10 40 01 40 04 20 10 80 08 02 10 04 08 40 20 80 01 02 20 40 08 10 01 04 02 80]
3
+
4
+ versionTwoFunnyMask
5
+ [08 40 80 01 20 02 04 10 04 10 01 80 08 02 40 20 40 80 08 20 04 10 02 01 02 04 10 01 80 40 20 08 80 04 02 20 01 08 10 40 01 40 04 20 10 80 08 02 10 04 08 40 20 80 01 02 20 40 08 10 01 04 02 80]
6
+
7
+ versionTwoCrypMask
8
+ [E2 68 BB 3C 2E 16 89 BE 8C 95 CD E9 EF 49 75 78 84 A9 EF 92 56 72 2C 1E 15 16 8D B9 C6 64 EF B4 C9 E3 75 38 EC 17 13 52 2C A2 27 B1 13 F1 C9 C2 BD D4 58 F3 AB 52 2E 61 A6 A1 CB 8F 71 29 CE 84]
9
+
10
+ versionOneCrypMask
11
+ [C9 E3 72 38 EC 16 13 58 C2 2C A2 26 B1 13 F1 C9 BD D4 58 F2 AB 52 2E 61 A7 A1 CB 8F 71 29 CE 84 E2 78 68 BB 3C 2E 16 89 BE 8C 93 CD E9 EF 49 75 84 A9 EF 92 56 78 3C 1E 17 13 8D B9 C7 64 EF B4]
12
+
13
+ codeMatrix
14
+ [47 FC 6D 84 28 FD 4C B8 7F 7B AC 44 72 46 DC 0D 3C 5B FE 0C D9 25 97 E9 76 76 D5 5F 9B 44 A4 4F 16 24 6F A1 A7 86 B6 DE 6D B6 54 8E 13 8E 8E 53 BA FC DB C2 A5 37 75 04 A6 C0 A4 31 4C 1B C5 68 C9 4A 1D AE A5 0E 60 8C 25 DD FF 67 79 A2 35 9D A8]
15
+
16
+ inverseMatrix
17
+ [19 FA CB ED E4 B6 D9 AF 7A 8E A8 8F 20 2F A1 27 17 5A A5 24 F1 0B 44 B9 32 B7 AA FE 99 78 B9 3A A7 2F 56 5D 68 2D 00 DC 5E EB B2 73 5B 02 B9 EF E9 15 82 66 E2 05 E2 E6 8C B2 35 C7 8E CB 3B CA 16 A1 77 26 A7 D9 15 E0 F1 63 89 D3 59 A5 57 1E F1]
18
+
19
+ leftNode
20
+ [01 01 03 01 05 01 07 01 0B 01 0D 01 0F 01 11 01 13 01 15 01 17 01 19 01 1B 01 1D 01 1F 01 40 01 5E 01 60 01 80 01 82 01 84 01 86 01 88 01 8A 01 8C 01 8E 01 90 01 92 01 94 01 96 01 98 01 9A 01 9C 01 9E 01 A0 01 A2 01 A4 01 A6 01 A8 01 AA 01 AC 01 AE 01 B0 01 B2 01 B4 01 B6 01 B8 01 BA 01 BC 01 BE 01 C0 01 C6 01 C9 01 CB 01 CE 01 D0 01 D2 01 D4 01 D6 01 D8 01 DA 01 DC 01 DE 01 E0 01 EA 01 EC 01 F0 01 F2 01 F4 01 F6 01 F8 01 FA 01 FC 01 FE 01 7E 01 01 00 03 00 05 00 07 00 09 00 0B 00 0D 00 0F 00 11 00 13 00 15 00 17 00 19 00 1B 00 1D 00 1F 00 21 00 23 00 25 00 27 00 29 00 2B 00 2D 00 2F 00 31 00 33 00 35 00 37 00 39 00 3B 00 3D 00 3F 00 41 00 43 00 45 00 47 00 49 00 CC 01 4B 00 4D 00 4F 00 51 00 53 00 55 00 57 00 59 00 5B 00 5D 00 5F 00 61 00 63 00 65 00 67 00 69 00 6B 00 6D 00 6F 00 71 00 73 00 75 00 77 00 79 00 7B 00 7D 00 7F 00 81 00 83 00 C2 01 85 00 87 00 89 00 8B 00 8D 00 8F 00 91 00 93 00 3C 01 2A 01 95 00 58 01 97 00 2B 01 98 00 4A 01 52 01 9B 00 21 01 9D 00 7D 01 26 01 9F 00 A1 00 47 01 45 01 A2 00 4D 01 44 01 4E 01 48 01 3D 01 4C 01 A6 00 A8 00 AA 00 AC 00 41 01 AF 00 B1 00 43 01 B3 00 B5 00 39 01 36 01 B6 00 B7 00 B8 00 4B 01 71 01 33 01 C5 01 BA 00 09 01 30 01 BD 00 BE 00 BF 00 C0 00 6B 01 C3 00 C4 00 C5 00 78 01 C7 00 C7 01 C8 00 C9 00 62 01 2C 01 79 01 E9 01 66 01 5B 01 CE 00 CF 00 70 01 D1 00 29 01 D2 00 6D 01 67 01 D5 00 2E 01 D7 00 6C 01 C3 01 E5 01 68 01 DB 00 73 01 DE 00 74 01 E1 00 69 01 61 01 E3 00 E5 00 E6 00 65 01 6E 01 EA 00 EC 00 EE 00 F0 00 F2 00 F4 00 20 01 F7 00 F9 00 FB 00 FD 00 00 01]
21
+
22
+ rightNode
23

24
+
25
+ upNode
26

27
+
@@ -0,0 +1,235 @@
1
+ --- each file
2
+ 2 bytes header length
3
+ 2 bytes version (1)
4
+ n bytes header
5
+ array of records
6
+
7
+ --- each record
8
+ 2 bytes record size
9
+ - if negative, record is empty space
10
+ n bytes record
11
+
12
+ --- each n-byte record
13
+ 1 byte flag
14
+ - bit 1: set for alternate "encryption" offsets, used to protect CDL
15
+ - bit 2: set for text data, clear for binary data
16
+ L=N-1 bytes of real data
17
+ - the pseudocode below covers the algorithm used to decode a record
18
+
19
+ _DBRecordReadIntoTextBuf(buffer, fp, offset)
20
+ recordLength = fread(internal buffer, 1, 2, fp) interpreted as MSB
21
+ recordTag = fgetc(fp)
22
+ recordLength -= 1
23
+ if (recordTag >> 1) & 1 is not 0
24
+ return _DecompressDbRec(buffer, fp, recordLength, recordTag)
25
+ while recordLength > 0
26
+ rawRecordByte = fgetc(fp)
27
+ if rawRecordByte is -1
28
+ return failure
29
+ stat = _TBPutC(buffer, rawRecordByte)
30
+ if stat is not 0
31
+ return stat
32
+ recordLength -= 1
33
+
34
+ _DecompressDbRec(buffer, fp, recordLength, recordTag)
35
+ - pDcStruct is a stack-allocated DecompressStruct
36
+ - decompressedChar is a local holding up to 5 bytes (a UTF8 char)
37
+ memset(pDcStruct, 0, sizeof(*pDcStruct)); // clears the whole 40-byte DecompressStruct
38
+ pDcStruct->fp = fp
39
+ pDcStruct->totalBits = pDcStruct->totalBits2 = recordLength * 8
40
+ pDcStruct->funnyMaskPtr = _FunnyMask(recordLength, recordTagCopy)
41
+ - returns pointer into funnyMask array
42
+ - if is v1
43
+ - return versionOneFunnyMasks + 8 * (recordLength & 7) bytes
44
+ - if archiveDifferent is set
45
+ - return versionTwoFunnyMasks + 8 * ((recordLength + (recordTag & 1)) & 7) bytes
46
+ - else
47
+ - versionTwoFunnyMasks + 8 * (recordLength & 7) bytes
48
+ pDcStruct->cryptOffset = _CrypOfs(recordLength, recordTagCopy)
49
+ - returns number between 0..63, probably offset into crypmask
50
+ - normally returns recordLength & 63
51
+ - if archiveDifferent is set and it's not v1
52
+ - instead return (recordLength + ((recordTag & 1) * 8)) & 63
53
+ pDcStruct->cryptMask = _CrypMask(cryptOffset)
54
+ - if v1, versionOneCrypMask; else versionTwoCrypMask
55
+ if recordLength <= 8 or v1
56
+ - pDcStruct->lineOffset = 9 // otherwise, it's initialized at 0
57
+
58
+ loop
59
+ decompressedByte = _DecompressByte(pDcStruct)
60
+ if decompressedByte < 0
61
+ if decompressedByte is 0xFFFF
62
+ return 0 // great success?
63
+ if ((decompressedByte >> 7) & 1) == 1 // original code more complicated
64
+ decompressedChar[0] = decompressedByte
65
+ mbCharLen = _MBCLenFromFirstByte(decompressedByte)
66
+ charOffset = 1 // the byte offset in decompressedChar
67
+ while charOffset < mbCharLen
68
+ decompressedByte = _DecompressSixBits(pDcStruct)
69
+ if decompressedByte <= 0
70
+ return failure code 0xFFFFDFFF
71
+ decompressedChar[charOffset] = decompressedByte
72
+ decompressedChar[mbCharLen] = 0
73
+ unicodeChar = _ZiNumberLen(decompressedChar, ziBuffer)
74
+ if unicodeChar is 0xFFFE or *ziBuffer is not mbCharLen
75
+ return failure code 0xFFFFDFFF
76
+ charOffset = 0
77
+ while charOffset < mbCharLen
78
+ stat = _TBPutC(buffer, decompressedChar[charOffset])
79
+ if stat is not 0
80
+ return stat // out of memory?
81
+ charOffset += 1
82
+ else
83
+ easyStat = _TBPutC(buffer, decompressedByte)
84
+ if easyStat is not 0
85
+ return easyStat
86
+
87
+ DecompressStruct, size 40 bytes -- 0x28
88
+ fp - [0x00] - file pointer
89
+ bitsRead - [0x04] - counts number of bits read, starts at 0
90
+ totalBits2 - [0x08] - record length in bits (unused)
91
+ totalBits - [0x0C] - record length in bits
92
+ currentLine - [0x10] - 9-character buffer of chars; populated from file, after matrix multiplication
93
+ lineOffset - [0x19] - 1-byte count, increasing from 0 to 9, pointing into currentLine
94
+ currentChar - [0x1A] - one character that was "decrypted", before bit permutation; populated from currentLine, after xor-decryption
95
+ funnyMaskPtr - [0x1C] - result of _FunnyMask call, points into funnyMask
96
+ cryptOffset - [0x20] - 1-byte count, starts at cryptOffset (0…63), decreasing
97
+ cryptMask - [0x24] - points to a *CrypMask
98
+
99
+ _DecompressByte(pDcStruct)
100
+ - appears to do some huffman decoding
101
+ node = 0xFE
102
+ loop
103
+ if (pStruct->bitsRead & 7) == 0
104
+ bit = _GetBitX(pStruct)
105
+ else
106
+ bit = (pStruct->currentChar & pStruct->funnyMaskPtr[pStruct->bitsRead & 7]) ? 1 : 0
107
+ pStruct->bitsRead += 1
108
+ if bit is 0
109
+ node = leftNode[node] // array of shorts
110
+ else
111
+ if bit < 0 // most likely for the -1 and -2 error codes coming out of _GetBitX
112
+ return bit
113
+ node = rightNode[node]
114
+ if node >= 256
115
+ return node - 256
116
+
117
+ _DecompressSixBits(pDcStruct)
118
+ - no huffman encoding, just read the bits
119
+ decompressedByte = 0
120
+ if (pStruct->bitsRead & 7) == 0
121
+ firstBit = _GetBitX(pStruct)
122
+ else
123
+ firstBit = (pStruct->currentChar & pStruct->funnyMaskPtr[pStruct->bitsRead & 7]) ? 1 : 0
124
+ pStruct->bitsRead += 1
125
+ if firstBit is not 0
126
+ if firstBit < 0
127
+ return firstBit // failure code
128
+ decompressedByte |= 0x20
129
+ the structure above is repeated 5 more times, and or's decompressedByte with 0x10 0x08 0x04 0x02 and 0x01
130
+
131
+ _ZiNumberLen(decompressedChar, lenBuffer)
132
+ - UTF8 to unicode
133
+ if decompressedChar[0] < 0x80
134
+ *lenBuffer = 1
135
+ return decompressedChar[0]
136
+ if decompressedChar[0] <= 0xDF
137
+ if decompressedChar[0] <= 0xC1
138
+ *lenBuffer = 1
139
+ return 0xFFFE // fail
140
+ if (decompressedChar[1] & 0xC0) != 0x80
141
+ *lenBuffer = 1
142
+ return 0xFFFE // fail
143
+ *lenBuffer = 2
144
+ return (decompressedChar[0] & 0x1F) << 6 | (decompressedChar[1] & 0x3F)
145
+ if decompressedChar[0] <= 0xEF
146
+ if decompressedChar[0] == 0xE0
147
+ if decompressedChar[1] <= 0x9F
148
+ *lenBuffer = 1
149
+ return 0xFFFE // fail
150
+ if decompressedChar[0] == 0xED
151
+ if decompressedChar[1] > 0x9F
152
+ *lenBuffer = 1
153
+ return 0xFFFE // fail
154
+ if (decompressedChar[1] & 0xC0) != 0x80 or (decompressedChar[2] & 0xC0) != 0x80
155
+ *lenBuffer = 1
156
+ return 0xFFFE // fail
157
+ *lenBuffer = 3
158
+ return (decompressedChar[0] & 0x0F) << 12 | (decompressedChar[1] & 0x3F) << 6 | (decompressedChar[2] & 0x3F)
159
+ if decompressedChar[0] > 0xF4
160
+ *lenBuffer = 1
161
+ return 0xFFFE // fail
162
+ if decompressedChar[0] == 0xF0
163
+ if decompressedChar[1] <= 0x8F
164
+ *lenBuffer = 1
165
+ return 0xFFFE // fail
166
+ if decompressedChar[0] == 0xF4
167
+ if decompressedChar[1] > 0x8F
168
+ *lenBuffer = 1
169
+ return 0xFFFE // fail
170
+ if (decompressedChar[1] & 0xC0) != 0x80 or (decompressedChar[2] & 0xC0) != 0x80 or (decompressedChar[3] & 0xC0) != 0x80
171
+ *lenBuffer = 1
172
+ return 0xFFFE // fail
173
+ *lenBuffer = 4
174
+ return (decompressedChar[0] & 0x07) << 18 | (decompressedChar[1] & 0x3F) << 12 | (decompressedChar[2] & 0x3F) << 6 | (decompressedChar[3] & 0x3F)
175
+
176
+ _GetBitX(pDcStruct)
177
+ - reads one byte from the currentChar / currentLine buffer
178
+ if pDcStruct->bitsRead >= pDcStruct->totalBits
179
+ return -2 // read too much
180
+ fChar = _MatrixFGetC(pDcStruct)
181
+ if fChar is -1, return -1 // I/O error
182
+
183
+ pStruct->currentChar = fChar ^ pStruct->cryptMask[pStruct->cryptOffset & 63]
184
+ pStruct->cryptOffset -= 1
185
+ returnValue = (pStruct->funnyMaskPtr[pStruct->bitsRead & 7] & pStruct->currentChar) ? 1 : 0
186
+ pStruct->bitsRead += 1
187
+
188
+ _MatrixFGetC(pStruct)
189
+ - buffer is a local (stack-allocated) buffer
190
+ if pStruct->lineOffset is not 9
191
+ if pStruct->lineOffset is 0
192
+ fread(buffer, 1, 9, fp)
193
+ if fread fails, return -1
194
+ _MatrixMultiply(pStruct->currentLine, inverseMatrix, buffer)
195
+ returnValue = pStruct->currentLine[pStruct->lineOffset] // lineOffset lives at pStruct[0x19]
196
+ pStruct->lineOffset += 1
197
+ else // pStruct->lineOffset is 9
198
+ returnValue = getc(*pStruct) // *pStruct is fp
199
+
200
+ _MatrixMultiply(result, matrix, vector)
201
+ for (i = 8; i != -1; i--)
202
+ *result = matrix[8] * vector[8] + matrix[7] * vector[7] + matrix[6] * vector[6] + matrix[4] * vector[4] + matrix[5] * vector[5] + matrix[2] * vector[2] + matrix[3] * vector[3] + matrix[0] * vector[0] + matrix[1] * vector[1]
203
+ result += 1
204
+ matrix += 9
205
+
206
+ _OpenDatabaseFile(dbIndex, fopenMode) --> FILE*
207
+ - dbIndex is an index into an array of database names
208
+ - fopenMode is the mode arg for open (e.g., "rb")
209
+ _OpenDictionaryFileSetLoc(dbIndex, fopenMode, dbStructure = NULL)
210
+ _OpenDictionaryFileSetLocMayWarn(dbIndex, fopenMode, dbStructure, 1)
211
+ dbLocation = UUUDBLoc(dbStructure)
212
+ - if dbStructure is 0, return NULL
213
+ - otherwise return dbStructure[0x218] -- seems like it's a large struct
214
+ if dbLocation is NULL
215
+ dbLocation = GetDictionaryFileLocationFromName(dbIndex)
216
+ if dbLocation is NULL
217
+ fp = _OpenWenlinFile(1, dbIndex, fopenMode)
218
+ else
219
+ fp = _OpenFileFromDBLoc(dbLocation, dbIndex, fopenMode)
220
+ if fp is NULL and mayWarn, complain
221
+ return fp
222
+
223
+ _MBCLenFromFirstByte(firstByte)
224
+ - returns the length of a (UTF8 probably?) multi-byte character, based on its first byte
225
+ if firstByte & 0x80 == 0
226
+ return 1
227
+ if firstByte > 0xC1 && firstByte < 0xDF
228
+ return 2
229
+ if firstByte <= 0xEF
230
+ return 3
231
+ if firstByte <= 0xF7
232
+ return 4
233
+ else
234
+ return 1
235
+
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wenlin_db_scanner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Victor Costan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-30 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: yard
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 0.8.2.1
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 0.8.2.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: rdoc
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '3.12'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '3.12'
46
+ - !ruby/object:Gem::Dependency
47
+ name: bundler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: 1.2.0
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.2.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: jeweler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 1.8.4
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 1.8.4
78
+ description: ! 'The Wenlin dictionary contains two great databases, the ABC English<->Chinese
79
+
80
+ dictionary, and the Character Description Language (CDL). Unfortunately, this
81
+
82
+ data is wrapped by a less-than-great UI. This gem lets you extract the data so
83
+
84
+ you can build your own UI for it.
85
+
86
+ '
87
+ email: victor@costan.us
88
+ executables:
89
+ - wenlin_dbdump
90
+ - wenlin_dict
91
+ - wenlin_hanzi
92
+ - wenlin_parts
93
+ extensions: []
94
+ extra_rdoc_files:
95
+ - LICENSE.txt
96
+ - README.md
97
+ files:
98
+ - .document
99
+ - Gemfile
100
+ - Gemfile.lock
101
+ - LICENSE.txt
102
+ - README.md
103
+ - Rakefile
104
+ - VERSION
105
+ - bin/wenlin_dbdump
106
+ - bin/wenlin_dict
107
+ - bin/wenlin_hanzi
108
+ - bin/wenlin_parts
109
+ - lib/wenlin_db_scanner.rb
110
+ - lib/wenlin_db_scanner/chars.rb
111
+ - lib/wenlin_db_scanner/db.rb
112
+ - lib/wenlin_db_scanner/db_record.rb
113
+ - lib/wenlin_db_scanner/dict.rb
114
+ - lib/wenlin_db_scanner/speech_parts.rb
115
+ - reversed/README.md
116
+ - reversed/code.asm
117
+ - reversed/magic.txt
118
+ - reversed/notes.txt
119
+ homepage: http://github.com/pwnall/wenlin_db_scanner
120
+ licenses:
121
+ - CC0
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ none: false
128
+ requirements:
129
+ - - ! '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ segments:
133
+ - 0
134
+ hash: 2072562403419786808
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ none: false
137
+ requirements:
138
+ - - ! '>='
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project:
143
+ rubygems_version: 1.8.24
144
+ signing_key:
145
+ specification_version: 3
146
+ summary: Extracts the data from the Wenlin dictionary
147
+ test_files: []