@adeu/core 1.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/mapper.ts ADDED
@@ -0,0 +1,835 @@
1
+ import { DocumentObject } from './docx/bridge.js';
2
+ import { Paragraph, Table, Run, DocxEvent } from './docx/primitives.js';
3
+ import { findAllDescendants, findChild } from './docx/dom.js';
4
+ import { extract_comments_data } from './comments.js';
5
+ import {
6
+ _get_style_cache,
7
+ get_paragraph_prefix,
8
+ get_run_style_markers,
9
+ get_run_text,
10
+ is_heading_paragraph,
11
+ is_native_heading,
12
+ iter_block_items,
13
+ iter_document_parts,
14
+ iter_paragraph_content,
15
+ } from './utils/docx.js';
16
+
17
/**
 * One contiguous slice of the mapper's flattened document text.
 *
 * Spans whose `run` is `null` are "virtual": separators, prefixes and markup
 * tokens that the mapper synthesises and that have no backing run.
 */
export interface TextSpan {
  /** Offset of the first character of `text` within the flattened text. */
  start: number;
  /** Offset one past the last character (`start + text.length` for emitted text). */
  end: number;
  /** The characters this span contributes to the flattened text. */
  text: string;
  /** Run that backs the text, or `null` for virtual text. */
  run: Run | null;
  /** Paragraph the span is associated with, or `null` when it has none. */
  paragraph: Paragraph | null;
  /** Id of the tracked insertion that was active when the span was recorded, if any. */
  ins_id?: string | null;
  /** Id of the tracked deletion that was active when the span was recorded, if any. */
  del_id?: string | null;
  /** Id of the hyperlink the span was emitted for, if any. */
  hyperlink_id?: string | null;
}
27
+
28
/**
 * Rewrites tracked-change and comment ids to small sequential numbers, in place.
 *
 * - Every `w:ins` / `w:del` below `doc.element` gets a new `w:id`, numbered from
 *   "1" in document order; elements sharing an old id share the new one.
 * - Every `w:comment` in the comments part (when the package has one) is
 *   renumbered the same way, with its own counter.
 * - `w:commentReference`, `w:commentRangeStart` and `w:commentRangeEnd` elements
 *   below `doc.element`, and the `w15:p` attribute of comments, are rewritten to
 *   the new comment ids. Ids that were never remapped are left untouched.
 *
 * @param doc Document whose DOM is modified in place.
 * @returns `[change_remap, comment_remap]`, each mapping old id -> new id.
 */
export function renumber_snapshot_ids(doc: DocumentObject): [Record<string, string>, Record<string, string>] {
  // Maps are used for bookkeeping so that ids which happen to collide with
  // Object.prototype members (e.g. "constructor") are treated as ordinary keys.
  const assign_sequential_ids = (elements: Iterable<Element>): Map<string, string> => {
    const remap = new Map<string, string>();
    for (const el of elements) {
      const old_id = el.getAttribute('w:id');
      if (!old_id) continue;
      let new_id = remap.get(old_id);
      if (new_id === undefined) {
        new_id = String(remap.size + 1);
        remap.set(old_id, new_id);
      }
      el.setAttribute('w:id', new_id);
    }
    return remap;
  };

  // Snapshot of every descendant, reused for the change pass and the comment-anchor pass.
  const all_elements = findAllDescendants(doc.element, '*');

  const chg_remap = assign_sequential_ids(
    all_elements.filter(el => el.tagName === 'w:ins' || el.tagName === 'w:del')
  );

  const comments_part = doc.pkg.parts.find(
    p => p.contentType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml'
  );
  const com_remap = comments_part
    ? assign_sequential_ids(findAllDescendants(comments_part._element, 'w:comment'))
    : new Map<string, string>();

  // Re-point the in-document comment anchors at the renumbered comments.
  const anchor_tags = ['w:commentReference', 'w:commentRangeStart', 'w:commentRangeEnd'];
  for (const elem of all_elements) {
    if (!anchor_tags.includes(elem.tagName)) continue;
    const old_id = elem.getAttribute('w:id');
    const new_id = old_id ? com_remap.get(old_id) : undefined;
    if (new_id !== undefined) elem.setAttribute('w:id', new_id);
  }

  // Comments may point at another comment through `w15:p`; keep that link valid.
  if (comments_part) {
    for (const c of findAllDescendants(comments_part._element, 'w:comment')) {
      const parent_id = c.getAttribute('w15:p');
      const new_parent = parent_id ? com_remap.get(parent_id) : undefined;
      if (new_parent !== undefined) c.setAttribute('w15:p', new_parent);
    }
  }

  return [Object.fromEntries(chg_remap), Object.fromEntries(com_remap)];
}
94
+
95
/**
 * Change-id -> event lookup. The mapper itself uses `Map` (insertion ordered);
 * plain records are still accepted by the helpers that consume these lookups.
 */
type ChangeLookup = Map<string, DocxEvent> | Record<string, DocxEvent>;

/** Snapshot of the active insertions, deletions, comment ids and format changes. */
type MetaState = [ChangeLookup, ChangeLookup, Iterable<string>, ChangeLookup];

/**
 * Flattens a document into one annotated text string (`full_text`) plus a list
 * of `TextSpan`s that tie each slice of that text back to the run / paragraph
 * it came from, and resolves text offsets back to runs (splitting runs in the
 * DOM where a requested range starts or ends inside one).
 *
 * With `clean_view` set, tracked deletions are left out and no change/comment
 * markup is emitted.
 */
export class DocumentMapper {
  public doc: DocumentObject;
  public clean_view: boolean;
  public comments_map: Record<string, any>;
  public full_text: string = '';
  public spans: TextSpan[] = [];
  public appendix_start_index: number = -1;
  private _text_chunks: string[] = [];

  /**
   * @param doc Document to map.
   * @param clean_view When true, deleted content and change markup are omitted.
   */
  constructor(doc: DocumentObject, clean_view: boolean = false) {
    this.doc = doc;
    this.clean_view = clean_view;
    this.comments_map = extract_comments_data(doc.pkg);
    this._build_map();
  }

  /** Escapes every regex metacharacter so `str` matches literally. */
  private static _escape_regex(str: string): string {
    return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /** Uniform `[id, event]` view over either lookup representation. */
  private static _entries_of(lookup: ChangeLookup): [string, DocxEvent][] {
    return lookup instanceof Map ? Array.from(lookup.entries()) : Object.entries(lookup);
  }

  /** Converts regex matches to `[index, length]` pairs, dropping zero-length matches. */
  private static _to_ranges(matches: RegExpMatchArray[]): [number, number][] {
    return matches
      .filter(m => m[0].length > 0)
      .map((m): [number, number] => [m.index ?? 0, m[0].length]);
  }

  /** Rebuilds `spans` and `full_text` from scratch for every document part. */
  private _build_map() {
    let current_offset = 0;
    this.spans = [];
    this._text_chunks = [];
    this.full_text = '';

    for (const part of iter_document_parts(this.doc)) {
      current_offset = this._map_blocks(part, current_offset);

      // Separate parts with a blank line unless one is already at the tail.
      if (this.spans.length > 0 && this.spans[this.spans.length - 1].text !== '\n\n') {
        this._add_virtual_text('\n\n', current_offset, null);
        current_offset += 2;
      }
    }

    // Trim trailing separators so the text does not end in blank lines.
    while (this.spans.length > 0 && this.spans[this.spans.length - 1].text === '\n\n') {
      this.spans.pop();
      this._text_chunks.pop();
    }

    this.full_text = this._text_chunks.join('');
    this.appendix_start_index = -1;
  }

  /**
   * Maps every block item of `container` (paragraphs, tables, nested footnote
   * items), separating consecutive blocks with a blank line.
   *
   * @param container Part, table cell or footnote item to walk.
   * @param offset Text offset at which the container's content starts.
   * @returns The offset just past the emitted content.
   */
  private _map_blocks(container: any, offset: number): number {
    let current = offset;
    const c_type = container.constructor.name;
    const part = container.part || container;
    const [style_cache, default_pstyle] = _get_style_cache(part);

    if (c_type === 'NotesPart') {
      const header = container.note_type === 'fn' ? '## Footnotes' : '## Endnotes';
      const sep = `---\n${header}`;
      this._add_virtual_text(sep, current, null);
      current += sep.length;
      this._add_virtual_text('\n\n', current, null);
      current += 2;
    }

    let is_first_para = true;
    let previous_item: any = null;

    // Emits the blank line between two blocks, attributed to the previous paragraph (if any).
    const emit_block_separator = () => {
      if (is_first_para) return;
      const prev_para = previous_item instanceof Paragraph ? previous_item : null;
      this._add_virtual_text('\n\n', current, prev_para);
      current += 2;
    };

    for (const item of iter_block_items(container)) {
      const i_type = item.constructor.name;

      if (i_type === 'FootnoteItem') {
        current = this._map_blocks(item, current);
      } else if (item instanceof Paragraph) {
        emit_block_separator();

        let prefix = get_paragraph_prefix(item, style_cache, default_pstyle);
        if (is_first_para && c_type === 'FootnoteItem') {
          // First paragraph of a note carries the note's own label.
          prefix = `[^${container.note_type}-${container.id}]: ` + prefix;
        }
        if (prefix) {
          this._add_virtual_text(prefix, current, item);
          current += prefix.length;
        }

        current = this._map_paragraph_content(item, current, style_cache, default_pstyle);
        is_first_para = false;
        previous_item = item;
      } else if (item instanceof Table) {
        emit_block_separator();
        current = this._map_table(item, current);
        is_first_para = false;
        previous_item = item;
      }
    }

    return current;
  }

  /**
   * Maps a table row by row: rows are joined with `\n`, cells with ` | `.
   * Rows carrying a tracked insertion/deletion in `w:trPr` are wrapped in
   * `{++ ... |Chg:id++}` / `{-- ... |Chg:id--}`; in clean view deleted rows are
   * skipped and no wrappers are written.
   *
   * @returns The offset just past the table text.
   */
  private _map_table(table: Table, offset: number): number {
    let current = offset;
    let rows_processed = 0;

    for (const row of table.rows) {
      const tr = row._element;
      const trPr = findChild(tr, 'w:trPr');
      const ins = trPr ? findChild(trPr, 'w:ins') : null;
      const del_node = trPr ? findChild(trPr, 'w:del') : null;

      if (this.clean_view && del_node) continue;

      if (rows_processed > 0) {
        this._add_virtual_text('\n', current, null);
        current += 1;
      }

      if (ins && !this.clean_view) {
        this._add_virtual_text('{++ ', current, null);
        current += 4;
      } else if (del_node && !this.clean_view) {
        this._add_virtual_text('{-- ', current, null);
        current += 4;
      }

      // The same cell object can show up more than once in `row.cells`; map it once.
      const seen_cells = new Set<unknown>();
      let cells_processed = 0;

      for (const cell of row.cells) {
        if (seen_cells.has(cell)) continue;
        seen_cells.add(cell);

        if (cells_processed > 0) {
          this._add_virtual_text(' | ', current, null);
          current += 3;
        }

        current = this._map_blocks(cell, current);
        cells_processed += 1;
      }

      if (ins && !this.clean_view) {
        const suffix = ` |Chg:${ins.getAttribute('w:id')}++}`;
        this._add_virtual_text(suffix, current, null);
        current += suffix.length;
      } else if (del_node && !this.clean_view) {
        const suffix = ` |Chg:${del_node.getAttribute('w:id')}--}`;
        this._add_virtual_text(suffix, current, null);
        current += suffix.length;
      }

      rows_processed += 1;
    }

    return current;
  }

  /**
   * Removes leading `#` heading markers and `**`, `__`, `_`, `*` emphasis
   * wrapped around words. "Word" characters are Unicode letters, digits and
   * `_`, so emphasis around non-ASCII words is stripped too. Single `_` / `*`
   * markers are only removed when not glued to a word character on the
   * outside, which leaves identifiers such as `foo_bar_baz` alone.
   */
  private _strip_markdown_formatting(text: string): string {
    return text
      .replace(/^#+\s*/gm, '')
      .replace(/\*\*([\p{L}\p{N}_][\p{L}\p{N}_\s]*[\p{L}\p{N}_]|[\p{L}\p{N}_]{2,})\*\*/gu, '$1')
      .replace(/__([\p{L}\p{N}_][\p{L}\p{N}_\s]*[\p{L}\p{N}_]|[\p{L}\p{N}_]{2,})__/gu, '$1')
      .replace(/(?<![\p{L}\p{N}_])_([\p{L}\p{N}_][\p{L}\p{N}_\s]*[\p{L}\p{N}_]|[\p{L}\p{N}_]{2,})_(?![\p{L}\p{N}_])/gu, '$1')
      .replace(/(?<![\p{L}\p{N}_])\*([\p{L}\p{N}_][\p{L}\p{N}_\s]*[\p{L}\p{N}_]|[\p{L}\p{N}_]{2,})\*(?![\p{L}\p{N}_])/gu, '$1');
  }

  /**
   * Maps the inline content of one paragraph: runs become real spans, change /
   * comment / hyperlink / note events become virtual markup.
   *
   * Adjacent runs that share the same change wrapper are buffered and written
   * inside a single `{++..++}` / `{--..--}` / `{==..==}` pair; a `{>>..<<}` meta
   * block describing the active changes and comments follows once the next
   * run with text is no longer inside a tracked change.
   *
   * @param paragraph Paragraph being mapped.
   * @param start_offset Offset at which the paragraph content starts.
   * @param style_cache Style lookup forwarded to the heading helpers.
   * @param default_pstyle Default paragraph style forwarded to the heading helpers.
   * @returns The offset just past the paragraph content.
   */
  private _map_paragraph_content(
    paragraph: Paragraph,
    start_offset: number,
    style_cache?: any,
    default_pstyle?: string | null
  ): number {
    let current = start_offset;

    // Zero-length span so the paragraph is represented even when it has no text.
    this.spans.push({ start: current, end: current, text: '', run: null, paragraph });

    // Maps keep insertion order for every key; plain objects would sort
    // integer-like ids numerically and lose track of "most recently opened".
    const active_ids = new Set<string>();
    const active_ins = new Map<string, DocxEvent>();
    const active_del = new Map<string, DocxEvent>();
    const active_fmt = new Map<string, DocxEvent>();
    const last_key = (lookup: Map<string, DocxEvent>): string | null =>
      Array.from(lookup.keys()).pop() || null;

    let deferred_meta_states: MetaState[] = [];
    let current_wrappers: [string, string] = ['', ''];
    let current_style: [string, string] = ['', ''];
    let active_hyperlink_id: string | null = null;
    let pending_runs: [string, string, Run | null, string | null, string | null][] = [];

    // Writes the buffered run pieces, surrounded by the current change wrapper.
    const flush_pending_runs = () => {
      if (pending_runs.length === 0) return;
      const [s_tok, e_tok] = current_wrappers;
      if (s_tok) {
        this._add_virtual_text(s_tok, current, paragraph);
        current += s_tok.length;
      }
      for (const [kind, txt, r_obj, i_id, d_id] of pending_runs) {
        if (kind === 'virtual') {
          this._add_virtual_text(txt, current, paragraph, active_hyperlink_id);
        } else {
          this.spans.push({
            start: current,
            end: current + txt.length,
            text: txt,
            run: r_obj,
            paragraph,
            ins_id: i_id || undefined,
            del_id: d_id || undefined,
            hyperlink_id: active_hyperlink_id || undefined,
          });
          this._text_chunks.push(txt);
        }
        current += txt.length;
      }
      if (e_tok) {
        this._add_virtual_text(e_tok, current, paragraph);
        current += e_tok.length;
      }
      pending_runs = [];
    };

    // Writes a standalone piece of markup attributed to this paragraph.
    const emit_markup = (txt: string, hyperlink_id: string | null = null) => {
      this._add_virtual_text(txt, current, paragraph, hyperlink_id);
      current += txt.length;
    };

    const items = Array.from(iter_paragraph_content(paragraph));
    const is_heading = is_heading_paragraph(paragraph, style_cache, default_pstyle);
    const native_heading = is_native_heading(paragraph, style_cache, default_pstyle);
    // In headings, whitespace-only runs are skipped until the first run with content.
    let leading_strip_active = is_heading;

    for (let i = 0; i < items.length; i++) {
      const item = items[i];

      if (item instanceof Run) {
        const [prefix, suffix] = get_run_style_markers(item, native_heading);
        const run_parts: [string, string, Run | null][] = [];
        const text = get_run_text(item);

        if (leading_strip_active) {
          if (/^\s*$/.test(text)) continue;
          leading_strip_active = false;
        }

        if (text.includes('\n') && (prefix || suffix)) {
          // Style markers must not straddle a line break: wrap each line separately.
          const lines = text.split('\n');
          lines.forEach((line, idx) => {
            if (idx > 0) run_parts.push(['real', '\n', item]);
            if (line) {
              if (prefix) run_parts.push(['virtual', prefix, null]);
              run_parts.push(['real', line, item]);
              if (suffix) run_parts.push(['virtual', suffix, null]);
            }
          });
        } else {
          if (prefix) run_parts.push(['virtual', prefix, null]);
          if (text) run_parts.push(['real', text, item]);
          if (suffix) run_parts.push(['virtual', suffix, null]);
        }

        const full_seg_text = run_parts.map(x => x[1]).join('');
        const curr_ins_id = last_key(active_ins);
        const curr_del_id = last_key(active_del);

        // Deleted text is dropped entirely in clean view.
        if (full_seg_text && !(this.clean_view && curr_del_id)) {
          const new_wrappers: [string, string] = this.clean_view
            ? ['', '']
            : this._get_wrappers(curr_ins_id, curr_del_id, active_ids, active_fmt);
          const new_style: [string, string] = [prefix, suffix];

          const same_wrappers =
            new_wrappers[0] === current_wrappers[0] && new_wrappers[1] === current_wrappers[1];

          if (pending_runs.length > 0 && same_wrappers) {
            // Same style as the buffered tail: drop the closing marker and this
            // run's opening marker so the two runs share one marker pair.
            let skip_leading_prefix = false;
            const tail = pending_runs[pending_runs.length - 1];
            if (
              new_style[0] === current_style[0] &&
              new_style[1] === current_style[1] &&
              current_style[0] !== '' &&
              tail[0] === 'virtual' &&
              tail[1] === current_style[1]
            ) {
              pending_runs.pop();
              skip_leading_prefix = true;
            }

            for (const [kind, txt, r_obj] of run_parts) {
              if (skip_leading_prefix && kind === 'virtual' && txt === new_style[0]) {
                skip_leading_prefix = false;
                continue;
              }
              pending_runs.push([kind, txt, r_obj, curr_ins_id, curr_del_id]);
            }
            current_style = new_style;
          } else {
            flush_pending_runs();
            current_wrappers = new_wrappers;
            current_style = new_style;
            for (const [kind, txt, r_obj] of run_parts) {
              pending_runs.push([kind, txt, r_obj, curr_ins_id, curr_del_id]);
            }
          }
        }

        if (!this.clean_view) {
          const has_meta =
            active_ins.size > 0 || active_del.size > 0 || active_ids.size > 0 || active_fmt.size > 0;
          if (has_meta) {
            deferred_meta_states.push([
              new Map(active_ins),
              new Map(active_del),
              new Set(active_ids),
              new Map(active_fmt),
            ]);
          }

          let should_defer = false;
          const is_redline = curr_ins_id !== null || curr_del_id !== null || active_fmt.size > 0;

          if (is_redline) {
            // Look ahead: if the next run with text is still inside a tracked
            // change, hold the meta block back so adjacent changes share one.
            let temp_ins_count = active_ins.size;
            let temp_del_count = active_del.size;
            let temp_fmt_count = active_fmt.size;

            for (let j = i + 1; j < items.length; j++) {
              const next_item = items[j];
              if (next_item instanceof Run) {
                if (!get_run_text(next_item)) continue;
                should_defer = temp_ins_count > 0 || temp_del_count > 0 || temp_fmt_count > 0;
                break;
              }
              const next_ev = next_item as DocxEvent;
              if (next_ev.type === 'ins_start') temp_ins_count++;
              else if (next_ev.type === 'ins_end') temp_ins_count = Math.max(0, temp_ins_count - 1);
              else if (next_ev.type === 'del_start') temp_del_count++;
              else if (next_ev.type === 'del_end') temp_del_count = Math.max(0, temp_del_count - 1);
              else if (next_ev.type === 'fmt_start') temp_fmt_count++;
              else if (next_ev.type === 'fmt_end') temp_fmt_count = Math.max(0, temp_fmt_count - 1);
            }
          }

          if (!should_defer && deferred_meta_states.length > 0) {
            const meta_block = this._build_merged_meta_block(deferred_meta_states);
            if (meta_block) {
              flush_pending_runs();
              current_wrappers = ['', ''];
              current_style = ['', ''];
              emit_markup(`{>>${meta_block}<<}`);
            }
            deferred_meta_states = [];
          }
        }
      } else {
        const ev = item as DocxEvent;
        // Any event ends the current wrapper group.
        leading_strip_active = false;
        flush_pending_runs();
        current_wrappers = ['', ''];
        current_style = ['', ''];

        switch (ev.type) {
          case 'start':
            active_ids.add(ev.id);
            break;
          case 'end':
            active_ids.delete(ev.id);
            break;
          case 'ins_start':
            active_ins.set(ev.id, ev);
            break;
          case 'ins_end':
            active_ins.delete(ev.id);
            break;
          case 'del_start':
            active_del.set(ev.id, ev);
            break;
          case 'del_end':
            active_del.delete(ev.id);
            break;
          case 'fmt_start':
            active_fmt.set(ev.id, ev);
            break;
          case 'fmt_end':
            active_fmt.delete(ev.id);
            break;
          case 'footnote':
          case 'endnote': {
            const prefix_str = ev.type === 'footnote' ? 'fn' : 'en';
            emit_markup(`[^${prefix_str}-${ev.id}]`);
            break;
          }
          case 'hyperlink_start':
            emit_markup('[', ev.id);
            active_hyperlink_id = ev.id;
            break;
          case 'hyperlink_end':
            // NOTE(review): the link target is read from `ev.date`; presumably the
            // event reuses that field for the URL - confirm against the producer.
            emit_markup(`](${ev.date})`, ev.id);
            active_hyperlink_id = null;
            break;
          case 'xref_start':
            emit_markup('[~');
            break;
          case 'xref_end':
            emit_markup(`~](#${ev.id})`);
            break;
          case 'bookmark':
            emit_markup(`{#${ev.id}}`);
            break;
          default:
            break;
        }
      }
    }

    flush_pending_runs();

    // A change that runs to the end of the paragraph still needs its meta block.
    if (deferred_meta_states.length > 0) {
      const meta_block = this._build_merged_meta_block(deferred_meta_states);
      if (meta_block) emit_markup(`{>>${meta_block}<<}`);
    }

    return current;
  }

  /**
   * Picks the opening/closing wrapper tokens for a run: deletion wins over
   * insertion, which wins over "highlight" (active comment or format change).
   */
  private _get_wrappers(
    ins_id: string | null,
    del_id: string | null,
    active_ids: Set<string>,
    active_fmt: ChangeLookup
  ): [string, string] {
    if (del_id) return ['{--', '--}'];
    if (ins_id) return ['{++', '++}'];
    const fmt_count = active_fmt instanceof Map ? active_fmt.size : Object.keys(active_fmt).length;
    if (active_ids.size > 0 || fmt_count > 0) return ['{==', '==}'];
    return ['', ''];
  }

  /**
   * Builds the body of a `{>>..<<}` block from one or more state snapshots:
   * one line per distinct change (insert / delete / format) followed by one
   * line per distinct comment known to `comments_map`. Entries already listed
   * by an earlier snapshot are not repeated.
   *
   * @param states_list Snapshots of `[insertions, deletions, comment ids, format changes]`.
   * @returns The lines joined with `\n`, or `''` when there is nothing to report.
   */
  private _build_merged_meta_block(states_list: MetaState[]): string {
    const change_lines: string[] = [];
    const comment_lines: string[] = [];
    const seen_sigs = new Set<string>();

    const add_changes = (lookup: ChangeLookup, label: string) => {
      for (const [uid, meta] of DocumentMapper._entries_of(lookup)) {
        const sig = `Chg:${uid}`;
        if (seen_sigs.has(sig)) continue;
        const auth = meta.author || 'Unknown';
        change_lines.push(`[${sig} ${label}] ${auth}`);
        seen_sigs.add(sig);
      }
    };

    for (const [ins_map, del_map, comments_set, fmt_map] of states_list) {
      add_changes(ins_map, 'insert');
      add_changes(del_map, 'delete');
      add_changes(fmt_map, 'format');

      const sorted_ids = Array.from(comments_set).sort();
      for (const c_id of sorted_ids) {
        const data = this.comments_map[c_id];
        if (!data) continue;
        const sig = `Com:${c_id}`;
        if (seen_sigs.has(sig)) continue;
        let header = `[${sig}] ${data.author}`;
        if (data.date) header += ` @ ${data.date}`;
        if (data.resolved) header += `(RESOLVED)`;
        comment_lines.push(`${header}: ${data.text}`);
        seen_sigs.add(sig);
      }
    }

    return [...change_lines, ...comment_lines].join('\n');
  }

  /** Appends a span with no backing run and its text to the flattened output. */
  private _add_virtual_text(
    text: string,
    offset: number,
    context_paragraph: Paragraph | null,
    hyperlink_id: string | null = null
  ) {
    this.spans.push({
      start: offset,
      end: offset + text.length,
      text,
      run: null,
      paragraph: context_paragraph,
      hyperlink_id: hyperlink_id || undefined,
    });
    this._text_chunks.push(text);
  }

  /** Replaces curly double/single quotes with their straight equivalents (1:1 per character). */
  private _replace_smart_quotes(text: string): string {
    return text.replace(/[\u201c\u201d]/g, '"').replace(/[\u2018\u2019]/g, "'");
  }

  /**
   * Builds a regex source that tolerates differences between `target_text` and
   * the mapped text: any run of whitespace, straight vs. curly quotes, an
   * optional emphasis marker next to whitespace or punctuation, and any
   * number of underscores inside a `[___]` placeholder.
   */
  private _make_fuzzy_regex(target_text: string): string {
    target_text = this._strip_markdown_formatting(target_text);
    target_text = this._replace_smart_quotes(target_text);

    const optional_marker = '(?:\\*\\*|__|\\*|_)?';
    const parts: string[] = [];
    const token_pattern = /(\[_+\])|(\s+)|(['"])|([.,;:\/])/g;

    let last_idx = 0;
    let match: RegExpExecArray | null;

    while ((match = token_pattern.exec(target_text)) !== null) {
      const literal = target_text.substring(last_idx, match.index);
      if (literal) parts.push(DocumentMapper._escape_regex(literal));

      const [, g_placeholder, g_space, g_quote, g_punct] = match;

      if (g_placeholder) {
        parts.push('\\[_+\\]');
      } else if (g_space) {
        parts.push(optional_marker, '\\s+', optional_marker);
      } else if (g_quote) {
        parts.push(g_quote === "'" ? '[\u2018\u2019\']' : '["\u201c\u201d]');
      } else if (g_punct) {
        parts.push(optional_marker, DocumentMapper._escape_regex(g_punct), optional_marker);
      }

      last_idx = token_pattern.lastIndex;
    }

    const remaining = target_text.substring(last_idx);
    if (remaining) parts.push(DocumentMapper._escape_regex(remaining));

    return parts.join('');
  }

  /**
   * Finds the first occurrence of `target_text` in `full_text`, trying in
   * order: exact match, match ignoring smart-quote differences, match after
   * stripping markdown from the target, and finally the fuzzy regex.
   *
   * A target that strips down to nothing (e.g. only heading markers) is not
   * treated as a match by the stripped/fuzzy tiers.
   *
   * @returns `[start, length]`, or `[-1, 0]` when nothing matches.
   */
  public find_match_index(target_text: string): [number, number] {
    let start_idx = this.full_text.indexOf(target_text);
    if (start_idx !== -1) return [start_idx, target_text.length];

    // Quote replacement is 1:1 per character, so offsets in the normalised text stay valid.
    const norm_full = this._replace_smart_quotes(this.full_text);
    const norm_target = this._replace_smart_quotes(target_text);
    start_idx = norm_full.indexOf(norm_target);
    if (start_idx !== -1) return [start_idx, target_text.length];

    const stripped_target = this._strip_markdown_formatting(target_text);
    if (stripped_target) {
      start_idx = this.full_text.indexOf(stripped_target);
      if (start_idx !== -1) return [start_idx, stripped_target.length];
    }

    try {
      const pattern = new RegExp(this._make_fuzzy_regex(target_text));
      const match = pattern.exec(this.full_text);
      if (match && match[0].length > 0) return [match.index, match[0].length];
    } catch (_err) {
      // Best effort: a pattern that cannot be compiled just means "no fuzzy match".
    }

    return [-1, 0];
  }

  /**
   * Like `find_match_index`, but returns every occurrence found by the first
   * tier that matches at all. Zero-length matches are never reported.
   *
   * @returns `[start, length]` pairs, or `[]` when `target_text` is empty or unmatched.
   */
  public find_all_match_indices(target_text: string): [number, number][] {
    if (!target_text) return [];

    const scan = (haystack: string, pattern: RegExp): [number, number][] =>
      DocumentMapper._to_ranges([...haystack.matchAll(pattern)]);
    const literal = (needle: string) => new RegExp(DocumentMapper._escape_regex(needle), 'g');

    let ranges = scan(this.full_text, literal(target_text));
    if (ranges.length > 0) return ranges;

    ranges = scan(this._replace_smart_quotes(this.full_text), literal(this._replace_smart_quotes(target_text)));
    if (ranges.length > 0) return ranges;

    const stripped_target = this._strip_markdown_formatting(target_text);
    if (stripped_target) {
      ranges = scan(this.full_text, literal(stripped_target));
      if (ranges.length > 0) return ranges;
    }

    try {
      ranges = scan(this.full_text, new RegExp(this._make_fuzzy_regex(target_text), 'g'));
      if (ranges.length > 0) return ranges;
    } catch (_err) {
      // Best effort: a pattern that cannot be compiled just means "no fuzzy match".
    }

    return [];
  }

  /** Returns the runs covering the first match of `target_text` (may split runs in the DOM). */
  public find_target_runs(target_text: string): Run[] {
    const [start_idx, length] = this.find_match_index(target_text);
    if (start_idx === -1) return [];
    return this._resolve_runs_at_range(start_idx, start_idx + length);
  }

  /** Returns the runs covering `[start_index, start_index + length)` (may split runs in the DOM). */
  public find_target_runs_by_index(start_index: number, length: number, rebuild_map = true): Run[] {
    return this._resolve_runs_at_range(start_index, start_index + length, rebuild_map);
  }

  /** Returns the virtual `\n\n` separator spans that lie entirely inside the range. */
  public get_virtual_spans_in_range(start_index: number, length: number): TextSpan[] {
    const end_index = start_index + length;
    return this.spans.filter(
      s => s.run === null && s.text === '\n\n' && s.start >= start_index && s.end <= end_index
    );
  }

  /**
   * Collects the runs overlapping `[start_idx, end_idx)`. When the range
   * starts or ends inside a run, that run is split in the DOM so the returned
   * runs cover the range exactly at both edges.
   *
   * @param rebuild_map Rebuild `spans` / `full_text` after a split (default true).
   */
  private _resolve_runs_at_range(start_idx: number, end_idx: number, rebuild_map = true): Run[] {
    const real_spans = this.spans.filter(
      (s): s is TextSpan & { run: Run } => s.run !== null && s.end > start_idx && s.start < end_idx
    );
    if (real_spans.length === 0) return [];

    const working_runs = real_spans.map(s => s.run);
    const first_real_span = real_spans[0];
    const last_real_span = real_spans[real_spans.length - 1];
    let dom_modified = false;

    // Leading edge: keep only the right-hand piece of the first run.
    let start_split_adjustment = 0;
    const local_start = start_idx - first_real_span.start;
    if (local_start > 0) {
      const [, right_run] = this._split_run_at_index(working_runs[0], local_start);
      working_runs[0] = right_run;
      dom_modified = true;
      start_split_adjustment = local_start;
    }

    // Trailing edge: keep only the left-hand piece of the last run.
    const last_pos = working_runs.length - 1;
    const run_to_split = working_runs[last_pos];
    let local_end = Math.min(last_real_span.end, end_idx) - last_real_span.start;
    if (first_real_span === last_real_span && start_split_adjustment > 0) {
      // The run was already shortened at the front; shift the cut accordingly.
      local_end -= start_split_adjustment;
    }
    if (local_end > 0 && local_end < get_run_text(run_to_split).length) {
      const [left_run] = this._split_run_at_index(run_to_split, local_end);
      working_runs[last_pos] = left_run;
      dom_modified = true;
    }

    if (dom_modified && rebuild_map) {
      this._build_map();
    }

    return working_runs;
  }

  /** Last span (searching backwards) with a run, else last with a paragraph, else `null`. */
  private _nearest_anchor(candidates: TextSpan[]): [Run | null, Paragraph | null] | null {
    for (let i = candidates.length - 1; i >= 0; i--) {
      if (candidates[i].run) return [candidates[i].run, candidates[i].paragraph];
    }
    for (let i = candidates.length - 1; i >= 0; i--) {
      if (candidates[i].paragraph) return [null, candidates[i].paragraph];
    }
    return null;
  }

  /**
   * Finds the run and/or paragraph after which new content at `index` should
   * be inserted. Preference order: a span ending exactly at `index`; the span
   * containing `index` (a real run is split there); the first span when
   * `index` is 0; otherwise the closest span ending before `index`.
   *
   * @param rebuild_map Rebuild `spans` / `full_text` after a split (default true).
   * @returns `[run, paragraph]`; either may be `null` when not available.
   */
  public get_insertion_anchor(index: number, rebuild_map = true): [Run | null, Paragraph | null] {
    const ending_here = this._nearest_anchor(this.spans.filter(s => s.end === index));
    if (ending_here) return ending_here;

    const span = this.spans.find(s => s.start < index && index < s.end);
    if (span) {
      if (span.run === null) {
        // Virtual text without a paragraph: retry from the end of that text.
        if (span.paragraph === null) {
          return this.get_insertion_anchor(span.end, rebuild_map);
        }
        return [null, span.paragraph];
      }
      const [left] = this._split_run_at_index(span.run, index - span.start);
      if (rebuild_map) this._build_map();
      return [left, span.paragraph];
    }

    if (index === 0 && this.spans.length > 0) {
      for (const s of this.spans) if (s.run) return [s.run, s.paragraph];
      for (const s of this.spans) if (s.paragraph) return [null, s.paragraph];
      return [null, null];
    }

    return this._nearest_anchor(this.spans.filter(s => s.end < index)) ?? [null, null];
  }

  /**
   * Splits `run` in two at `split_index` of its text. The original run keeps
   * the left part; a clone holding the right part is inserted right after it.
   *
   * @returns `[left_run, right_run]`.
   */
  private _split_run_at_index(run: Run, split_index: number): [Run, Run] {
    const text = get_run_text(run);
    const left_text = text.substring(0, split_index);
    const right_text = text.substring(split_index);

    this._set_run_text_elements(run._element, left_text);

    // Clone after trimming so the clone inherits the run properties.
    const new_r_element = run._element.cloneNode(true) as Element;
    this._set_run_text_elements(new_r_element, right_text);

    if (run._element.parentNode) {
      run._element.parentNode.insertBefore(new_r_element, run._element.nextSibling);
    }

    return [run, new Run(new_r_element, run._parent)];
  }

  /**
   * Replaces all text-bearing children of a run element (`w:t`, `w:delText`,
   * `w:br`, `w:cr`, `w:tab`) with a single text element holding `new_text`.
   * A run that only held `w:delText` gets a `w:delText` back, so a deleted run
   * stays a deleted run after being split; otherwise a `w:t` is created.
   */
  private _set_run_text_elements(r_element: Element, new_text: string) {
    const text_tags = ['w:t', 'w:delText', 'w:br', 'w:cr', 'w:tab'];
    const to_remove = (Array.from(r_element.childNodes) as Element[]).filter(
      child => child.nodeType === 1 && text_tags.includes(child.tagName)
    );
    const had_del_text = to_remove.some(child => child.tagName === 'w:delText');
    const had_plain_text = to_remove.some(child => child.tagName === 'w:t');
    for (const child of to_remove) {
      r_element.removeChild(child);
    }

    const doc = r_element.ownerDocument;
    if (doc) {
      const new_t = doc.createElement(had_del_text && !had_plain_text ? 'w:delText' : 'w:t');
      new_t.textContent = new_text;
      // Leading/trailing whitespace is only kept when explicitly preserved.
      if (new_text.trim() !== new_text) {
        new_t.setAttribute('xml:space', 'preserve');
      }
      r_element.appendChild(new_t);
    }
  }

  /** Returns the first run-backed span overlapping `[start_idx, end_idx)`, or `null`. */
  public get_context_at_range(start_idx: number, end_idx: number): TextSpan | null {
    return this.spans.find(s => s.run && s.end > start_idx && s.start < end_idx) ?? null;
  }
}