rewritable 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/import.mjs CHANGED
@@ -263,147 +263,231 @@ async function convertPdf(bytes) {
263
263
  e.exitCode = 2;
264
264
  throw e;
265
265
  }
266
- const escape = s => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
267
- const paragraphs = [];
266
+ const pages = [];
267
+ let totalText = 0;
268
268
  for (let p = 1; p <= doc.numPages; p++) {
269
269
  const page = await doc.getPage(p);
270
- const tc = await page.getTextContent();
271
- extractParagraphs(tc.items).forEach(line => paragraphs.push(line));
272
- paragraphs.push(null); // page break: forces flush of next paragraph
270
+ const rendered = await renderPdfPage(page, pdfjs.Util, pdfjs.OPS);
271
+ pages.push(rendered.html);
272
+ totalText += rendered.textCount;
273
273
  }
274
274
  await doc.destroy().catch(() => {});
275
275
 
276
- const blocks = [];
277
- let buf = [];
278
- const flush = () => {
279
- const joined = buf.join(' ').replace(/\s+/g, ' ').trim();
280
- if (joined) blocks.push(`<p>${escape(joined)}</p>`);
281
- buf = [];
282
- };
283
- for (const line of paragraphs) {
284
- if (line === null || line === '') { flush(); continue; }
285
- buf.push(line);
286
- }
287
- flush();
288
-
289
- if (blocks.length === 0) {
276
+ if (totalText === 0) {
290
277
  const e = new Error('pdf: no extractable text — this looks like a scanned/image PDF; OCR is not supported');
291
278
  e.exitCode = 2;
292
279
  throw e;
293
280
  }
294
281
  return {
295
- html: `<article>\n${blocks.join('\n')}\n</article>`,
296
- warnings: ['pdf: layout reconstructed by heuristicsreview headings/lists manually'],
282
+ html: `<article class="rwa-pdf">\n${PDF_PAGE_STYLE}\n<div class="rwa-pdf-doc">\n${pages.join('\n')}\n</div>\n</article>`,
283
+ warnings: ['pdf: imported as a geometry-faithful reconstruction (positioned text + rules) text stays editable but is absolutely positioned'],
297
284
  };
298
285
  }
299
286
 
300
- // Group pdf.js text items into paragraph-shaped lines.
287
+ // ─────────────────────────────────────────────────────────────────────────
288
+ // PDF geometry-faithful reconstruction
301
289
  //
302
- // Two non-obvious problems this handles:
303
- // (1) Adjacent items inside a word: pdf.js returns text per font run, so a
304
- // word like "Aufwände" comes out as ["Aufw", "ä", "nde"] when the umlaut
305
- // glyph lives in a different font table from the ASCII letters. Joining
306
- // with ' ' produces "Aufw ä nde" — wrong. We concat directly and only
307
- // synthesize a space when there's a real positional x-gap.
308
- // (2) Stacked short lines: an address block (Name / Street / City) has small
309
- // y-gaps that fit inside the within-paragraph threshold, so naive logic
310
- // would join them into one paragraph "Name Street City". We additionally
311
- // break paragraphs when the *previous* line ended significantly short of
312
- // the page's typical right margin (a heuristic for "hard line break").
290
+ // Instead of flattening pdf.js text items into prose paragraphs (which throws
291
+ // away every column, table, and alignment), reproduce the page: each text run
292
+ // becomes an absolutely-positioned <span> at its real device coordinates, and
293
+ // the page's vector rules/boxes become positioned <div>s. The result looks
294
+ // like the source PDF while keeping the text as real, editable, selectable DOM
295
+ // so the rwa edit loop can still rewrite it (find/replace on the span text).
313
296
  //
314
- // Returns an array of strings; '' marks a paragraph break.
315
- function extractParagraphs(items) {
316
- if (!items || items.length === 0) return [];
317
- const rows = items.map(it => ({
318
- str: it.str,
319
- y: it.transform ? it.transform[5] : 0,
320
- x: it.transform ? it.transform[4] : 0,
321
- w: it.width || 0,
322
- h: it.height || (it.transform ? Math.abs(it.transform[3]) : 0) || 12,
323
- }));
324
- // Sort top-to-bottom (y desc in PDF coords), then left-to-right within a
325
- // row. pdfjs's content-stream order is reading order for well-tagged
326
- // single-column PDFs, but for multi-column or absolutely-positioned layouts
327
- // it interleaves visually-separate lines; sorting first makes the same-y
328
- // grouping below tolerant of that.
329
- rows.sort((a, b) => b.y - a.y || a.x - b.x);
330
- // Group into visual lines by y (within half a line height).
331
- const lines = [];
332
- let cur = null;
333
- for (const r of rows) {
334
- if (cur && Math.abs(r.y - cur.y) <= cur.h * 0.5) {
335
- cur.parts.push(r);
336
- cur.y = (cur.y + r.y) / 2;
337
- } else {
338
- if (cur) lines.push(cur);
339
- cur = { y: r.y, h: r.h, parts: [r] };
340
- }
341
- }
342
- if (cur) lines.push(cur);
343
-
344
- // For each line: concat parts directly, inserting a synthetic space only
345
- // when there's a real positional gap (previous part's right edge to next
346
- // part's x). pdf.js often emits explicit space items (str=" ") with tiny
347
- // width — those carry the space character themselves, so the position-gap
348
- // check below typically sees ~0 distance when they're present and we don't
349
- // double-space.
350
- const rendered = lines.map(line => {
351
- line.parts.sort((a, b) => a.x - b.x);
352
- let text = '';
353
- let prev = null;
354
- for (const p of line.parts) {
355
- if (prev) {
356
- const gap = p.x - (prev.x + prev.w);
357
- const lastChar = text.slice(-1);
358
- const firstChar = p.str.charAt(0);
359
- // Threshold of 2 user-space units catches inter-word gaps on body
360
- // text without false-positives inside words. Skip if the boundary
361
- // already has whitespace from either side.
362
- if (gap > 2 && !/\s/.test(lastChar) && !/\s/.test(firstChar)) {
363
- text += ' ';
364
- }
365
- }
366
- text += p.str;
367
- prev = p;
368
- }
369
- const left = line.parts.length ? Math.min(...line.parts.map(p => p.x)) : 0;
370
- const right = line.parts.length
371
- ? Math.max(...line.parts.map(p => p.x + p.w))
372
- : 0;
373
- return { text: text.replace(/\s+/g, ' ').trim(), y: line.y, h: line.h, left, right };
374
- });
297
+ // Coordinate math mirrors pdf.js's own text-layer builder: multiply the page
298
+ // viewport transform by each item's text matrix, read font height from the
299
+ // resulting matrix, and place the box top at baseline − ascent. Graphics are
300
+ // recovered by walking the operator list with a CTM stack (save/restore/
301
+ // transform) and emitting the device-space bounding box of every painted
302
+ // fill/stroke path. PDFs of this family draw rules as thin filled rectangles,
303
+ // so bbox-only rendering is exact; curves degrade to their bounding box.
304
+ // ─────────────────────────────────────────────────────────────────────────
305
+
306
// Inline <style> element shipped with every imported PDF article; convertPdf
// interpolates it into the returned HTML ahead of the page container.
// It styles the classes emitted by the page renderer:
//   .rwa-pdf / .rwa-pdf-doc  - grey backdrop, pages stacked in a centred column
//   .rwa-pdf-page            - white sheet; positioning context for its children
//   .rwa-pdf-t               - absolutely positioned text run (white-space:pre)
//   .rwa-pdf-g               - absolutely positioned rule/box
// The print media rule removes the backdrop, the gaps and the page shadow.
+ const PDF_PAGE_STYLE = `<style>
307
+ .rwa-pdf{max-width:none;margin:0;padding:0;background:#e9ecef;}
308
+ .rwa-pdf-doc{display:flex;flex-direction:column;align-items:center;gap:20px;padding:20px;overflow-x:auto;}
309
+ .rwa-pdf-page{position:relative;flex:none;background:#fff;box-shadow:0 1px 5px rgba(0,0,0,.18);overflow:hidden;}
310
+ .rwa-pdf-t{position:absolute;white-space:pre;line-height:1;color:#000;transform-origin:0 0;}
311
+ .rwa-pdf-g{position:absolute;}
312
+ @media print{.rwa-pdf{background:none}.rwa-pdf-doc{gap:0;padding:0;overflow:visible}.rwa-pdf-page{box-shadow:none}}
313
+ </style>`;
314
+
315
/**
 * Escape the characters that would otherwise be parsed as markup when the
 * string is written as HTML text content.
 *
 * Ampersands are handled first so the entities produced for `<` and `>` are
 * not escaped a second time.
 *
 * @param {string} s - Raw text.
 * @returns {string} The text with `&`, `<` and `>` replaced by entities.
 */
function escapePdfText(s) {
  const ampersandsDone = s.replace(/&/g, '&amp;');
  const openersDone = ampersandsDone.replace(/</g, '&lt;');
  return openersDone.replace(/>/g, '&gt;');
}
318
+
319
/**
 * Format a number as a compact string rounded to two decimal places.
 * No unit is appended; callers add `px` themselves.
 *
 * @param {number} n - Value to format.
 * @returns {string} The rounded value without trailing zeros.
 */
function pdfNum(n) {
  const hundredths = Math.round(n * 100);
  return String(hundredths / 100);
}
375
323
 
376
- // The page's "typical right margin" — use the 90th-percentile right edge
377
- // (more robust than max, which a stray header/page-number could inflate).
378
- // Lines ending well short of this are likely hard line-breaks, not soft
379
- // wraps to the right margin.
380
- const sortedRights = rendered.filter(l => l.text).map(l => l.right).sort((a, b) => a - b);
381
- const margin = sortedRights.length
382
- ? sortedRights[Math.floor(sortedRights.length * 0.9)]
383
- : 0;
324
/**
 * Normalise a pdf.js colour operand to a validated CSS colour string.
 *
 * pdf.js 5.x passes path colours as a single CSS string in args[0] (e.g.
 * ["#0000ff"]); older shapes pass an [r, g, b] triple in the 0–255 range.
 * The result is written into an inline `style` attribute, so anything that
 * is not a well-formed colour is rejected and replaced by black.
 *
 * Accepted inputs:
 *   - a string, or an array whose first element is a string, holding a hex
 *     colour with exactly 3, 4, 6 or 8 digits (the only lengths CSS allows)
 *     or a compact `rgb(r,g,b)` value;
 *   - an array or typed array of at least three numbers. Typed arrays are
 *     accepted because some pdf.js releases appear to deliver the triple as
 *     a Uint8ClampedArray (NOTE(review): confirm against the pinned pdf.js
 *     version). Each channel is truncated to an integer and clamped to 0–255.
 *
 * @param {string|Array|ArrayBufferView|null|undefined} a - Colour operand.
 * @returns {string} Lower-cased hex colour, `rgb(r,g,b)`, or `'#000000'`
 *   when the operand is missing or not recognised.
 */
function pdfColorToCss(a) {
  const FALLBACK = '#000000';
  const HEX = /^#(?:[0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})$/i;
  const RGB = /^rgb\(\d{1,3},\d{1,3},\d{1,3}\)$/;
  const channel = (v) => Math.min(255, Math.max(0, v | 0));

  let c = null;
  if (typeof a === 'string') {
    c = a;
  } else if (Array.isArray(a) || ArrayBuffer.isView(a)) {
    if (typeof a[0] === 'string') {
      c = a[0];
    } else if (a.length >= 3) {
      c = `rgb(${channel(a[0])},${channel(a[1])},${channel(a[2])})`;
    }
  }

  if (!c) return FALLBACK;
  if (HEX.test(c)) return c.toLowerCase();
  if (RGB.test(c)) return c;
  return FALLBACK;
}
384
337
 
338
/**
 * Report whether a CSS colour string is one of the recognised spellings of
 * pure white. The comparison ignores letter case and whitespace.
 *
 * @param {*} css - Colour value; it is coerced to a string first.
 * @returns {boolean} True for `#fff`, `#ffffff`, `white` or `rgb(255,255,255)`.
 */
function pdfIsWhitish(css) {
  const normalized = String(css).toLowerCase().replace(/\s+/g, '');
  switch (normalized) {
    case '#fff':
    case '#ffffff':
    case 'white':
    case 'rgb(255,255,255)':
      return true;
    default:
      return false;
  }
}
342
+
343
/**
 * Derive weight, slant and a CSS font stack for a pdf.js font.
 *
 * The sanitized fontName (e.g. "g_d0_f2") carries no weight information, so
 * the font object is looked up in `page.commonObjs` and its `name` (e.g.
 * "Cambria-Bold") is matched against weight and slant keywords. A lookup that
 * throws, or a font without a name, counts as a regular upright face.
 *
 * @param {object} page - pdf.js page exposing `commonObjs.get(fontName)`.
 * @param {string} fontName - Font key used by the text item.
 * @param {object} [style] - Text-content style entry; only `fontFamily` is read.
 * @returns {{bold: boolean, italic: boolean, family: string}} Style flags and
 *   the substitute font stack (serif unless the style says sans-serif or monospace).
 */
function pdfFontMeta(page, fontName, style) {
  // Best-effort lookup: any failure simply means "no name available".
  const lookUpName = () => {
    try {
      const font = page.commonObjs.get(fontName);
      return (font && font.name) || '';
    } catch {
      return '';
    }
  };
  const psName = lookUpName();

  let family;
  switch (style && style.fontFamily) {
    case 'sans-serif':
      family = "Helvetica, Arial, sans-serif";
      break;
    case 'monospace':
      family = "'Courier New', monospace";
      break;
    default:
      family = "Georgia, 'Times New Roman', serif";
  }

  return {
    bold: /bold|black|heavy|semibold|demibold|extrabold/i.test(psName),
    italic: /italic|oblique/i.test(psName),
    family,
  };
}
357
+
358
/**
 * Walk a pdf.js operator list and return a device-space rectangle for every
 * visible filled or stroked path.
 *
 * A graphics-state stack tracks save/restore/transform. Each save snapshots
 * the whole tracked state (CTM, fill colour, stroke colour and line width),
 * and restore brings all of it back, so a colour or line width set inside a
 * save/restore pair does not leak into paths painted after the restore.
 *
 * For each painted path, the local bounding box (args[2], read as
 * [minX, minY, maxX, maxY]) is mapped through the CTM via its four corners,
 * and the axis-aligned box around those corners is emitted. Paths painted in
 * white are dropped because they are invisible on the white page.
 *
 * @param {{fnArray: number[], argsArray: Array}} opList - pdf.js operator list.
 * @param {number[]} baseTransform - Six-element viewport transform used as the initial CTM.
 * @param {{transform: Function}} Util - pdf.js Util; only `transform(m1, m2)` is used.
 * @param {Object<string, number>} OPS - pdf.js operator-code table.
 * @returns {Array<{x: number, y: number, w: number, h: number, color: string}>}
 *   Rectangles in device coordinates, in paint order.
 */
function collectPdfGraphics(opList, baseTransform, Util, OPS) {
  const out = [];
  const stack = [];
  let state = {
    ctm: baseTransform.slice(),
    fill: '#000000',
    stroke: '#000000',
    lineWidth: 1,
  };
  const apply = (m, x, y) => [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
  const FILLY = new Set([OPS.fill, OPS.eoFill]);
  const STROKEY = new Set([OPS.stroke]);
  const BOTH = new Set([OPS.fillStroke, OPS.eoFillStroke]);

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];
    const a = opList.argsArray[i];

    if (fn === OPS.save) {
      // Snapshot everything we track, not just the matrix.
      stack.push({ ...state, ctm: state.ctm.slice() });
    } else if (fn === OPS.restore) {
      // An unbalanced restore is ignored rather than emptying the state.
      if (stack.length) state = stack.pop();
    } else if (fn === OPS.transform) {
      state.ctm = Util.transform(state.ctm, a);
    } else if (fn === OPS.setFillRGBColor) {
      state.fill = pdfColorToCss(a);
    } else if (fn === OPS.setStrokeRGBColor) {
      state.stroke = pdfColorToCss(a);
    } else if (fn === OPS.setLineWidth) {
      state.lineWidth = (typeof a === 'number' ? a : Array.isArray(a) ? a[0] : 1) || 1;
    } else if (fn === OPS.constructPath) {
      const paint = a[0];
      const isFill = FILLY.has(paint) || BOTH.has(paint);
      const isStroke = STROKEY.has(paint) || BOTH.has(paint);
      if (!isFill && !isStroke) continue; // endPath / clip → not painted
      const mm = a[2];
      if (!mm || mm.length < 4) continue;

      const { ctm } = state;
      const px = [];
      const py = [];
      for (const X of [mm[0], mm[2]]) {
        for (const Y of [mm[1], mm[3]]) {
          const [dx, dy] = apply(ctm, X, Y);
          px.push(dx);
          py.push(dy);
        }
      }
      const x0 = Math.min(...px);
      const x1 = Math.max(...px);
      const y0 = Math.min(...py);
      const y1 = Math.max(...py);

      const color = isFill ? state.fill : state.stroke;
      if (pdfIsWhitish(color)) continue; // invisible on the white page
      const w = x1 - x0;
      const h = y1 - y0;
      if (w < 0.01 && h < 0.01) continue;

      // Keep hairlines visible: strokes get their device line width, fills 0.5px.
      const sc = Math.hypot(ctm[0], ctm[1]) || 1;
      const minThick = isStroke && !isFill ? Math.max(state.lineWidth * sc, 0.5) : 0.5;
      out.push({ x: x0, y: y0, w: Math.max(w, minThick), h: Math.max(h, minThick), color });
    }
  }
  return out;
}
403
+
404
/**
 * Place one pdf.js text item in device space.
 *
 * The viewport transform is multiplied by the item's text matrix. The font
 * height is the length of the resulting matrix's second column and the
 * rotation comes from its first column. The box top sits one ascent above the
 * baseline; for rotated text the ascent offset is rotated as well.
 *
 * The ascent fraction comes from the style entry: `ascent` when set, else
 * `1 + descent` when `descent` is set, else 0.8.
 *
 * @param {object} it - Text item; `transform`, `fontName`, `str` and `width` are read.
 * @param {object} page - pdf.js page, forwarded to pdfFontMeta.
 * @param {number[]} viewportTransform - Six-element viewport matrix.
 * @param {Object<string, object>} styles - Text-content styles keyed by font name.
 * @param {{transform: Function}} Util - pdf.js Util; only `transform` is used.
 * @returns {object|null} `{ str, left, right, top, fh, angle, bold, italic, family }`,
 *   or null when the resulting font height is below 0.1.
 */
function placePdfItem(it, page, viewportTransform, styles, Util) {
  const device = Util.transform(viewportTransform, it.transform);
  const fontHeight = Math.hypot(device[2], device[3]);
  if (fontHeight < 0.1) return null;

  const fontStyle = styles[it.fontName] || {};
  const ascentFrac = fontStyle.ascent
    || (fontStyle.descent ? 1 + fontStyle.descent : 0)
    || 0.8;
  const ascent = fontHeight * ascentFrac;

  const rotation = Math.atan2(device[1], device[0]);
  const upright = Math.abs(rotation) < 1e-3;
  const left = upright ? device[4] : device[4] + ascent * Math.sin(rotation);
  const top = upright ? device[5] - ascent : device[5] - ascent * Math.cos(rotation);

  return {
    str: it.str,
    left,
    right: left + (it.width || 0),
    top,
    fh: fontHeight,
    angle: rotation,
    ...pdfFontMeta(page, it.fontName, fontStyle),
  };
}
421
+
422
/**
 * Reconstruct one PDF page as a positioned HTML layer.
 *
 * Vector graphics become absolutely positioned `<div class="rwa-pdf-g">`
 * boxes. Text items are grouped into "runs" — adjacent, same-style items on
 * one baseline — and each run is emitted as a single positioned
 * `<span class="rwa-pdf-t">` whose words flow with the substitute font's own
 * metrics while the run stays pinned at its true start x. A run is split at a
 * column gap, a style change, a new line, or rotated text.
 *
 * An item is appended to the current run only when it starts at or after the
 * run's right edge, allowing a small overlap. Items are sorted by `top`
 * first, so a tiny vertical offset can place an item that sits far to the
 * left after the run it visually precedes. Without the lower bound such an
 * item would be glued onto the end of the run, producing scrambled text at
 * the wrong position. With it, the item starts its own run at its own
 * coordinates.
 *
 * @param {object} page - pdf.js page; `getViewport`, `getTextContent` and
 *   `getOperatorList` are called on it.
 * @param {object} Util - pdf.js Util, forwarded to the geometry helpers.
 * @param {object} OPS - pdf.js operator table, forwarded to collectPdfGraphics.
 * @returns {Promise<{html: string, textCount: number}>} Page markup and the
 *   number of non-empty text runs emitted.
 */
async function renderPdfPage(page, Util, OPS) {
  const vp = page.getViewport({ scale: 1 });
  const tc = await page.getTextContent();
  const styles = tc.styles || {};
  // getOperatorList yields the graphics and populates commonObjs (fonts).
  const opList = await page.getOperatorList();
  const graphics = collectPdfGraphics(opList, vp.transform, Util, OPS);

  const parts = [];
  for (const g of graphics) {
    parts.push(`<div class="rwa-pdf-g" style="left:${pdfNum(g.x)}px;top:${pdfNum(g.y)}px;width:${pdfNum(g.w)}px;height:${pdfNum(g.h)}px;background:${g.color}"></div>`);
  }

  const placed = [];
  for (const it of tc.items) {
    if (!it.transform || !it.str) continue;
    const p = placePdfItem(it, page, vp.transform, styles, Util);
    if (p) placed.push(p);
  }
  // Reading order: top-to-bottom, then left-to-right.
  placed.sort((a, b) => a.top - b.top || a.left - b.left);

  const WORD_GAP = 2; // device px — below this, no inter-item space
  const isUpright = (r) => Math.abs(r.angle) < 1e-3;
  const sameStyle = (r, p) => r.bold === p.bold && r.italic === p.italic
    && r.family === p.family && Math.abs(r.fh - p.fh) < 0.5;

  const runs = [];
  let cur = null;
  for (const p of placed) {
    let mergeable = false;
    if (cur && isUpright(p) && isUpright(cur) && sameStyle(cur, p)) {
      const lineTol = Math.max(cur.fh, p.fh) * 0.5;
      const colGap = Math.max(p.fh * 1.2, 12); // wider than a space, narrower than a column
      const gap = p.left - cur.right;
      mergeable = Math.abs(p.top - cur.top) <= lineTol
        && gap <= colGap
        && gap >= -lineTol; // must continue the run, not start left of its end
    }
    if (mergeable) {
      const gap = p.left - cur.right;
      const lastChar = cur.text.slice(-1);
      const firstChar = p.str.charAt(0);
      if (gap > WORD_GAP && !/\s/.test(lastChar) && !/\s/.test(firstChar)) cur.text += ' ';
      cur.text += p.str;
      cur.right = p.right;
    } else {
      if (cur) runs.push(cur);
      cur = { text: p.str, left: p.left, top: p.top, right: p.right, fh: p.fh, bold: p.bold, italic: p.italic, family: p.family, angle: p.angle };
    }
  }
  if (cur) runs.push(cur);

  let textCount = 0;
  for (const run of runs) {
    const text = run.text.replace(/\s+$/, '');
    if (text.trim() === '') continue; // whitespace-only runs carry nothing to show
    const css = [`left:${pdfNum(run.left)}px`, `top:${pdfNum(run.top)}px`, `font-size:${pdfNum(run.fh)}px`, `font-family:${run.family}`];
    if (run.bold) css.push('font-weight:700');
    if (run.italic) css.push('font-style:italic');
    if (!isUpright(run)) css.push(`transform:rotate(${(run.angle * 180 / Math.PI).toFixed(2)}deg)`);
    parts.push(`<span class="rwa-pdf-t" style="${css.join(';')}">${escapePdfText(text)}</span>`);
    textCount++;
  }

  const html = `<div class="rwa-pdf-page" style="width:${pdfNum(vp.width)}px;height:${pdfNum(vp.height)}px">\n${parts.join('\n')}\n</div>`;
  return { html, textCount };
}