hwpkit-dev 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/ .npmignore +4 -1
  2. package/README.md +39 -2
  3. package/dist/index.d.mts +74 -16
  4. package/dist/index.d.ts +70 -16
  5. package/dist/index.js +4985 -698
  6. package/dist/index.js.map +1 -1
  7. package/dist/index.mjs +4981 -698
  8. package/dist/index.mjs.map +1 -1
  9. package/package.json +4 -1
  10. package/playground/index.html +346 -0
  11. package/playground/main.ts +302 -0
  12. package/playground/vite.config.ts +16 -0
  13. package/src/contract/decoder.ts +1 -0
  14. package/src/contract/encoder.ts +6 -1
  15. package/src/core/BaseDecoder.ts +118 -0
  16. package/src/core/BaseEncoder.ts +146 -0
  17. package/src/decoders/docx/DocxDecoder.ts +867 -150
  18. package/src/decoders/html/HtmlDecoder.ts +366 -0
  19. package/src/decoders/hwp/HwpScanner.ts +477 -88
  20. package/src/decoders/hwpx/HwpxDecoder.ts +789 -293
  21. package/src/decoders/md/MdDecoder.ts +4 -4
  22. package/src/encoders/docx/DocxEncoder.ts +600 -295
  23. package/src/encoders/html/HtmlEncoder.ts +203 -0
  24. package/src/encoders/hwp/HwpEncoder.ts +1647 -398
  25. package/src/encoders/hwpx/HwpxEncoder.ts +1512 -444
  26. package/src/encoders/hwpx/constants.ts +148 -0
  27. package/src/encoders/hwpx/utils.ts +198 -0
  28. package/src/encoders/md/MdEncoder.ts +117 -30
  29. package/src/index.ts +1 -0
  30. package/src/model/builders.ts +8 -6
  31. package/src/model/doc-props.ts +19 -5
  32. package/src/model/doc-tree.ts +13 -5
  33. package/src/pipeline/Pipeline.ts +21 -4
  34. package/src/pipeline/registry.ts +13 -2
  35. package/src/safety/StyleBridge.ts +52 -7
  36. package/src/toolkit/ArchiveKit.ts +56 -0
  37. package/src/toolkit/StyleMapper.ts +221 -0
  38. package/src/toolkit/UnitConverter.ts +138 -0
  39. package/src/toolkit/XmlKit.ts +0 -5
  40. package/test-styling.ts +210 -0
@@ -0,0 +1,366 @@
1
+ /**
2
+ * HtmlDecoder — HTML → DocRoot
3
+ * 간단한 HTML 파서 (표, 이미지, 스타일 지원)
4
+ */
5
+
6
+ import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode, GridNode } from '../../model/doc-tree';
7
+ import type { Outcome } from '../../contract/result';
8
+ import type { TextProps, ParaProps } from '../../model/doc-props';
9
+ import { A4 } from '../../model/doc-props';
10
+ import { succeed, fail } from '../../contract/result';
11
+ import { buildRoot, buildSheet, buildPara, buildSpan, buildImg, buildGrid, buildRow, buildCell } from '../../model/builders';
12
+ import { ShieldedParser } from '../../safety/ShieldedParser';
13
+ import { TextKit } from '../../toolkit/TextKit';
14
+ import { registry } from '../../pipeline/registry';
15
+ import { BaseDecoder } from '../../core/BaseDecoder';
16
+
17
/**
 * One lexical unit of the HTML input.
 *
 * Tag tokens carry `name`, `attrs`, `close` and `selfClose`; text tokens carry
 * `content`. The `'comment'` kind is part of the type, but no code visible in
 * this file produces it.
 */
interface Token {
  /** Discriminates markup from character data. */
  type: 'tag' | 'text' | 'comment';

  // --- text tokens ---
  /** Character data exactly as it appeared between two tags. */
  content?: string;

  // --- tag tokens ---
  /** Element name, lower-cased by the tokenizer. */
  name?: string;
  /** Attribute map; keys are lower-cased, valueless attributes map to `''`. */
  attrs?: Record<string, string>;
  /** True for a closing tag such as `</p>`. */
  close?: boolean;
  /** True when the tag text ends with `/` (for example `<br/>`). */
  selfClose?: boolean;
}
25
+
26
/**
 * Decoder that turns HTML bytes into a single-sheet `DocRoot`.
 *
 * The input is tokenized, the tokens are converted into content nodes, and
 * the nodes are wrapped in one A4 sheet. An input that yields no nodes
 * produces a sheet holding a single empty paragraph.
 */
export class HtmlDecoder extends BaseDecoder {
  /** Format identifier for this decoder. */
  protected getFormat(): string {
    return 'html';
  }

  /**
   * Decodes an HTML document.
   *
   * @param data Raw bytes of the HTML document.
   * @returns A successful outcome carrying the document and any collected
   *   warnings, or a failed outcome with an `HTML decode error: …` message.
   */
  async decode(data: Uint8Array): Promise<Outcome<DocRoot>> {
    const shield = new ShieldedParser();
    const warns: string[] = [];

    try {
      const html = this.bytesToString(data);
      // NOTE(review): `guard` presumably returns the `[]` fallback and records
      // a warning when the callback throws; confirm against ShieldedParser.
      const tokens = shield.guard(() => tokenize(html), [], 'html:tokenize');
      const kids = shield.guard(() => parseTokens(tokens), [], 'html:parse');

      warns.push(...shield.flush());
      const sheet = buildSheet(kids.length > 0 ? kids : [buildPara([buildSpan('')])], A4);
      return succeed(buildRoot({}, [sheet]), warns);
    } catch (e: unknown) {
      warns.push(...shield.flush());
      // Narrow from `unknown` instead of reading `.message` off an `any`.
      // Falls back to String(e) for throwables without a message.
      const message =
        typeof e === 'object' && e !== null && 'message' in e
          ? (e as { message?: unknown }).message
          : undefined;
      return fail(`HTML decode error: ${message ?? String(e)}`, warns);
    }
  }
}
47
+
48
/**
 * Splits an HTML string into a flat list of tag and text tokens.
 *
 * - Comments (`<!-- … -->`) and declarations (`<!DOCTYPE …>`) are dropped.
 *   An unterminated one discards the rest of the input.
 * - Tag and attribute names are lower-cased. A `/` directly after the tag
 *   name (as in `<br/>`) is not treated as part of the name.
 * - The content of `<script>` and `<style>` is raw text, not markup. It is
 *   skipped up to the matching closing tag, so characters such as `<` inside
 *   a script can never be mistaken for tags.
 * - Whitespace-only text runs are omitted. Other text is kept untrimmed.
 *
 * @param html Markup to tokenize.
 * @returns Tokens in document order.
 */
function tokenize(html: string): Token[] {
  const tokens: Token[] = [];
  // Built once per call; `lastIndex` is reset before every use.
  const attrPattern = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  const nameBreak = /[\s/]/;
  const rawTextClosers = new Map<string, RegExp>([
    ['script', /<\/script\b/gi],
    ['style', /<\/style\b/gi],
  ]);
  let i = 0;

  while (i < html.length) {
    // Character data up to the next '<'.
    if (html[i] !== '<') {
      const next = html.indexOf('<', i);
      const text = next === -1 ? html.slice(i) : html.slice(i, next);
      if (text.trim()) {
        tokens.push({ type: 'text', content: text });
      }
      i = next === -1 ? html.length : next;
      continue;
    }

    // Comments end at '-->', not at the first '>'.
    if (html.startsWith('<!--', i)) {
      const commentEnd = html.indexOf('-->', i + 4);
      if (commentEnd === -1) break;
      i = commentEnd + 3;
      continue;
    }

    // Other declarations such as <!DOCTYPE …>.
    if (html[i + 1] === '!') {
      const declEnd = html.indexOf('>', i);
      // Without this guard `i` would be reset to 0 and the loop would never end.
      if (declEnd === -1) break;
      i = declEnd + 1;
      continue;
    }

    const isClose = html[i + 1] === '/';
    const end = html.indexOf('>', i);
    if (end === -1) break;

    const tagContent = html.slice(isClose ? i + 2 : i + 1, end).trim();
    const nameEnd = tagContent.search(nameBreak);
    const name = (nameEnd === -1 ? tagContent : tagContent.slice(0, nameEnd)).toLowerCase();
    const attrsStr = nameEnd === -1 ? '' : tagContent.slice(nameEnd).trim();
    const selfClose = html[end - 1] === '/';

    const attrs: Record<string, string> = {};
    if (attrsStr) {
      attrPattern.lastIndex = 0;
      let m: RegExpExecArray | null;
      while ((m = attrPattern.exec(attrsStr)) !== null) {
        attrs[m[1].toLowerCase()] = m[2] ?? m[3] ?? m[4] ?? '';
      }
    }

    tokens.push({ type: 'tag', name, attrs, selfClose, close: isClose });

    // Raw-text elements: jump straight to the closing tag (or to the end of
    // the input when there is none) and drop what lies in between.
    const closer = isClose || selfClose ? undefined : rawTextClosers.get(name);
    if (closer) {
      closer.lastIndex = end + 1;
      const hit = closer.exec(html);
      i = hit ? hit.index : html.length;
    } else {
      i = end + 1;
    }
  }

  return tokens;
}
100
+
101
/** Elements treated as transparent containers when matching closing tags. */
const CONTAINER_TAGS: readonly string[] = ['html', 'head', 'body', 'div', 'section', 'article', 'main'];

/**
 * Finds the end of a block whose opening tag has already been consumed.
 *
 * Opening tags named in `names` increase the nesting depth and closing tags
 * decrease it. `contentEnd` is the index of the closing tag that brings the
 * depth back to zero, and `next` is the index right after it. When the block
 * is never closed, both equal `tokens.length`, so no trailing token is lost.
 */
function findBlockEnd(tokens: Token[], from: number, names: readonly string[]): { contentEnd: number; next: number } {
  let depth = 1;
  for (let k = from; k < tokens.length; k++) {
    const tok = tokens[k];
    if (tok.type !== 'tag' || !names.includes(tok.name ?? '')) continue;
    depth += tok.close ? -1 : 1;
    if (depth === 0) return { contentEnd: k, next: k + 1 };
  }
  return { contentEnd: tokens.length, next: tokens.length };
}

/** Reads `text-align` from an inline style, tolerating any spacing around the colon. */
function readTextAlign(style: string | undefined): 'left' | 'right' | 'center' | undefined {
  const value = /(?:^|;)\s*text-align\s*:\s*(left|right|center)\b/i.exec(style ?? '')?.[1]?.toLowerCase();
  return value === 'left' || value === 'right' || value === 'center' ? value : undefined;
}

/**
 * Reads the cells of one table row, starting right after its `<tr>`.
 *
 * The row ends at `</tr>` (consumed). It also ends at `</table>` or at the
 * next `<tr>`, which are left in place for the table loop, so a row with an
 * omitted `</tr>` cannot run into following content.
 */
function parseRowCells(tokens: Token[], start: number): { cells: ReturnType<typeof buildCell>[]; nextI: number } {
  const cells: ReturnType<typeof buildCell>[] = [];
  let i = start;

  while (i < tokens.length) {
    const tok = tokens[i];

    if (tok.type === 'tag') {
      if (tok.close && tok.name === 'tr') {
        i++;
        break;
      }
      if ((tok.close && tok.name === 'table') || (!tok.close && tok.name === 'tr')) break;

      if (!tok.close && (tok.name === 'td' || tok.name === 'th')) {
        // Decide header-ness from the cell's own tag, before the index moves.
        const isHeader = tok.name === 'th';
        const cell = collectInline(tokens, i + 1, ['td', 'th', 'tr']);
        i = cell.nextI;
        // Header cells are bold. Data cells keep the formatting they already have.
        const content = isHeader
          ? cell.nodes.map(n => n.tag === 'span' ? { ...n, props: { ...n.props, b: true } } : n)
          : cell.nodes;
        cells.push(buildCell([buildPara(content, {})]));
        continue;
      }

      i++;
      continue;
    }

    // Text directly inside a row becomes a cell of its own.
    const stray = tok.type === 'text' ? tok.content?.trim() : undefined;
    if (stray) cells.push(buildCell([buildPara([buildSpan(stray)])]));
    i++;
  }

  return { cells, nextI: i };
}

/** Reads the rows of a table, starting right after `<table>`; consumes `</table>`. */
function parseTableRows(tokens: Token[], start: number): { rows: ReturnType<typeof buildRow>[]; nextI: number } {
  const rows: ReturnType<typeof buildRow>[] = [];
  let i = start;

  while (i < tokens.length) {
    const tok = tokens[i];
    if (tok.type === 'tag' && tok.close && tok.name === 'table') {
      i++;
      break;
    }
    if (tok.type === 'tag' && !tok.close && tok.name === 'tr') {
      const row = parseRowCells(tokens, i + 1);
      i = row.nextI;
      if (row.cells.length > 0) rows.push(buildRow(row.cells));
    } else {
      i++;
    }
  }

  return { rows, nextI: i };
}

/**
 * Converts a token list into block-level content nodes.
 *
 * Handled elements:
 * - `html` — the `<body>` content, or everything inside `<html>` when there
 *   is no `<body>`;
 * - `head`, `style`, `script` — skipped;
 * - `body`, `div`, `section`, `article`, `main` — parsed recursively;
 * - `p`, `br`, `img` (base64 data URLs only), `table`, `ul`, `ol`.
 *
 * Any other opening tag is ignored, and loose text becomes a paragraph.
 *
 * @param tokens Tokens in document order.
 * @returns Content nodes in document order.
 */
function parseTokens(tokens: Token[]): ContentNode[] {
  const kids: ContentNode[] = [];
  let i = 0;

  while (i < tokens.length) {
    const t = tokens[i];

    if (t.type === 'text') {
      const text = t.content?.trim();
      if (text) kids.push(buildPara([buildSpan(text)], {}));
      i++;
      continue;
    }

    // Closing tags and non-tag tokens carry no content at this level.
    if (t.type !== 'tag' || t.close) {
      i++;
      continue;
    }

    switch (t.name) {
      case 'html': {
        const htmlBlock = findBlockEnd(tokens, i + 1, ['html']);
        const inner = tokens.slice(i + 1, htmlBlock.contentEnd);
        const bodyAt = inner.findIndex(tok => tok.type === 'tag' && !tok.close && tok.name === 'body');
        if (bodyAt === -1) {
          // No <body>: parse everything inside <html>. <head> is skipped by its own case.
          kids.push(...parseTokens(inner));
        } else {
          const bodyBlock = findBlockEnd(inner, bodyAt + 1, ['body']);
          kids.push(...parseTokens(inner.slice(bodyAt + 1, bodyBlock.contentEnd)));
        }
        i = htmlBlock.next;
        continue;
      }

      case 'head':
      case 'style':
      case 'script':
        i = skipBlock(tokens, i, t.name);
        continue;

      case 'body':
      case 'div':
      case 'section':
      case 'article':
      case 'main': {
        const block = findBlockEnd(tokens, i + 1, CONTAINER_TAGS);
        kids.push(...parseTokens(tokens.slice(i + 1, block.contentEnd)));
        i = block.next;
        continue;
      }

      case 'p': {
        const inline = collectInline(tokens, i + 1, ['p', 'div', 'br']);
        i = inline.nextI;
        kids.push(buildPara(inline.nodes, { align: readTextAlign(t.attrs?.style) }));
        continue;
      }

      case 'br':
        kids.push(buildPara([buildSpan('')], {}));
        i++;
        continue;

      case 'img': {
        i++;
        const src = t.attrs?.src;
        const alt = t.attrs?.alt || '';
        // Only embedded base64 data URLs are supported; other sources are dropped.
        const match = src?.startsWith('data:') ? src.match(/^data:([^;]+);base64,(.+)$/) : null;
        if (match) {
          // The MIME string from the URL is passed through unchecked, as before.
          kids.push(buildPara([buildImg(match[2], match[1] as any, 100, 100, alt)], {}));
        }
        continue;
      }

      case 'table': {
        const table = parseTableRows(tokens, i + 1);
        i = table.nextI;
        if (table.rows.length > 0) kids.push(buildGrid(table.rows));
        continue;
      }

      case 'ul':
      case 'ol': {
        const listTag = t.name;
        const isOrdered = listTag === 'ol';
        i++;
        while (i < tokens.length) {
          const tok = tokens[i];
          if (tok.type === 'tag' && tok.close && tok.name === listTag) {
            i++;
            break;
          }
          if (tok.type === 'tag' && !tok.close && tok.name === 'li') {
            const item = collectInline(tokens, i + 1, ['li', 'ul', 'ol']);
            i = item.nextI;
            kids.push(buildPara(item.nodes, { listOrd: isOrdered }));
          } else {
            i++;
          }
        }
        continue;
      }

      default:
        i++;
    }
  }

  return kids;
}
272
+
273
/**
 * Collects inline content (text spans and embedded images) from `start`.
 *
 * Collection ends at the first tag, opening OR closing, whose name is in
 * `stopTags`. That tag is not consumed: `nextI` points at it so the caller
 * decides what to do with it. Formatting elements (`b`/`strong`, `i`/`em`,
 * `u`, `s`/`strike`, `span`) are collected recursively and consume their own
 * closing tag, so their formatting applies only to what they enclose. Other
 * tags are ignored.
 *
 * @param tokens Token list being parsed.
 * @param start Index of the first token to inspect.
 * @param stopTags Lower-cased element names that end the inline run.
 * @returns The collected nodes (one empty span when nothing was found) and
 *   the index at which the caller should resume.
 */
function collectInline(tokens: Token[], start: number, stopTags: string[]): { nodes: (SpanNode | ImgNode)[]; nextI: number } {
  const nodes: (SpanNode | ImgNode)[] = [];
  // Anchored so that `background-color` is not mistaken for `color`.
  const colorPattern = /(?:^|;)\s*color\s*:\s*([^;]+)/i;
  let i = start;

  // Collects the content of the formatting element at `i` and steps past its
  // closing tag. If the run was ended by an outer stop tag instead, that tag
  // is left in place for the enclosing level.
  const collectNested = (ownTags: string[]): (SpanNode | ImgNode)[] => {
    const inner = collectInline(tokens, i + 1, [...ownTags, ...stopTags]);
    i = inner.nextI;
    const closer = tokens[i];
    if (closer?.type === 'tag' && closer.close && ownTags.includes(closer.name ?? '')) {
      i++;
    }
    return inner.nodes;
  };

  while (i < tokens.length) {
    const t = tokens[i];

    if (t.type === 'text') {
      const text = t.content?.trim();
      if (text) nodes.push(buildSpan(text));
      i++;
      continue;
    }

    if (t.type !== 'tag') {
      i++;
      continue;
    }

    // Opening and closing stop tags both end the run.
    if (t.name && stopTags.includes(t.name)) {
      break;
    }

    // Stray closing tag of an element that is not tracked here.
    if (t.close) {
      i++;
      continue;
    }

    switch (t.name) {
      case 'b':
      case 'strong': {
        const inner = collectNested(['b', 'strong']);
        nodes.push(...inner.map(n => n.tag === 'span' ? { ...n, props: { ...n.props, b: true } } : n));
        break;
      }

      case 'i':
      case 'em': {
        const inner = collectNested(['i', 'em']);
        nodes.push(...inner.map(n => n.tag === 'span' ? { ...n, props: { ...n.props, i: true } } : n));
        break;
      }

      case 'u': {
        const inner = collectNested(['u']);
        nodes.push(...inner.map(n => n.tag === 'span' ? { ...n, props: { ...n.props, u: true } } : n));
        break;
      }

      case 's':
      case 'strike': {
        const inner = collectNested(['s', 'strike']);
        nodes.push(...inner.map(n => n.tag === 'span' ? { ...n, props: { ...n.props, s: true } } : n));
        break;
      }

      case 'span': {
        const color = colorPattern.exec(t.attrs?.style ?? '')?.[1]?.trim();
        const inner = collectNested(['span']);
        // A color already set by a nested element is kept when this span has none.
        nodes.push(...inner.map(n => n.tag === 'span' ? { ...n, props: { ...n.props, color: color || n.props.color } } : n));
        break;
      }

      case 'img': {
        const src = t.attrs?.src;
        const alt = t.attrs?.alt || '';
        // Only embedded base64 data URLs are supported; other sources are dropped.
        const match = src?.startsWith('data:') ? src.match(/^data:([^;]+);base64,(.+)$/) : null;
        if (match) {
          nodes.push(buildImg(match[2], match[1] as any, 100, 100, alt));
        }
        i++;
        break;
      }

      default:
        i++;
    }
  }

  return { nodes: nodes.length > 0 ? nodes : [buildSpan('')], nextI: i };
}
352
+
353
/**
 * Skips an element together with everything it contains.
 *
 * For most elements, nested opening tags of the same name are counted so the
 * matching closing tag is found. `script` and `style` hold raw text and
 * cannot nest, so the first closing tag ends them. A same-named opening
 * token seen before that (for example from a string literal inside a script)
 * is not counted, otherwise the real closing tag would be missed and the
 * rest of the document skipped.
 *
 * @param tokens Token list being parsed.
 * @param start Index of the element's opening tag.
 * @param name Lower-cased name of the element to skip.
 * @returns Index of the first token after the closing tag, or
 *   `tokens.length` when the element is never closed.
 */
function skipBlock(tokens: Token[], start: number, name: string): number {
  const canNest = name !== 'script' && name !== 'style';
  let depth = 1;
  let i = start + 1;

  for (; i < tokens.length && depth > 0; i++) {
    const tok = tokens[i];
    if (tok.type !== 'tag' || tok.name !== name) continue;
    if (tok.close) {
      depth--;
    } else if (canNest) {
      depth++;
    }
  }

  return i;
}
365
+
366
// Importing this module registers one HtmlDecoder instance with the shared registry.
const htmlDecoder = new HtmlDecoder();
registry.registerDecoder(htmlDecoder);