rewritable 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/package.json +1 -1
- package/seeds/rewritable.html +607 -74
- package/src/import.mjs +206 -122
package/src/import.mjs
CHANGED
|
@@ -263,147 +263,231 @@ async function convertPdf(bytes) {
|
|
|
263
263
|
e.exitCode = 2;
|
|
264
264
|
throw e;
|
|
265
265
|
}
|
|
266
|
-
const
|
|
267
|
-
|
|
266
|
+
const pages = [];
|
|
267
|
+
let totalText = 0;
|
|
268
268
|
for (let p = 1; p <= doc.numPages; p++) {
|
|
269
269
|
const page = await doc.getPage(p);
|
|
270
|
-
const
|
|
271
|
-
|
|
272
|
-
|
|
270
|
+
const rendered = await renderPdfPage(page, pdfjs.Util, pdfjs.OPS);
|
|
271
|
+
pages.push(rendered.html);
|
|
272
|
+
totalText += rendered.textCount;
|
|
273
273
|
}
|
|
274
274
|
await doc.destroy().catch(() => {});
|
|
275
275
|
|
|
276
|
-
|
|
277
|
-
let buf = [];
|
|
278
|
-
const flush = () => {
|
|
279
|
-
const joined = buf.join(' ').replace(/\s+/g, ' ').trim();
|
|
280
|
-
if (joined) blocks.push(`<p>${escape(joined)}</p>`);
|
|
281
|
-
buf = [];
|
|
282
|
-
};
|
|
283
|
-
for (const line of paragraphs) {
|
|
284
|
-
if (line === null || line === '') { flush(); continue; }
|
|
285
|
-
buf.push(line);
|
|
286
|
-
}
|
|
287
|
-
flush();
|
|
288
|
-
|
|
289
|
-
if (blocks.length === 0) {
|
|
276
|
+
if (totalText === 0) {
|
|
290
277
|
const e = new Error('pdf: no extractable text — this looks like a scanned/image PDF; OCR is not supported');
|
|
291
278
|
e.exitCode = 2;
|
|
292
279
|
throw e;
|
|
293
280
|
}
|
|
294
281
|
return {
|
|
295
|
-
html: `<article>\n${
|
|
296
|
-
warnings: ['pdf:
|
|
282
|
+
html: `<article class="rwa-pdf">\n${PDF_PAGE_STYLE}\n<div class="rwa-pdf-doc">\n${pages.join('\n')}\n</div>\n</article>`,
|
|
283
|
+
warnings: ['pdf: imported as a geometry-faithful reconstruction (positioned text + rules) — text stays editable but is absolutely positioned'],
|
|
297
284
|
};
|
|
298
285
|
}
|
|
299
286
|
|
|
300
|
-
//
|
|
287
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
288
|
+
// PDF geometry-faithful reconstruction
|
|
301
289
|
//
|
|
302
|
-
//
|
|
303
|
-
//
|
|
304
|
-
//
|
|
305
|
-
//
|
|
306
|
-
//
|
|
307
|
-
//
|
|
308
|
-
// (2) Stacked short lines: an address block (Name / Street / City) has small
|
|
309
|
-
// y-gaps that fit inside the within-paragraph threshold, so naive logic
|
|
310
|
-
// would join them into one paragraph "Name Street City". We additionally
|
|
311
|
-
// break paragraphs when the *previous* line ended significantly short of
|
|
312
|
-
// the page's typical right margin (a heuristic for "hard line break").
|
|
290
|
+
// Instead of flattening pdf.js text items into prose paragraphs (which throws
|
|
291
|
+
// away every column, table, and alignment), reproduce the page: each text run
|
|
292
|
+
// becomes an absolutely-positioned <span> at its real device coordinates, and
|
|
293
|
+
// the page's vector rules/boxes become positioned <div>s. The result looks
|
|
294
|
+
// like the source PDF while keeping the text as real, editable, selectable DOM
|
|
295
|
+
// — so the rwa edit loop can still rewrite it (find/replace on the span text).
|
|
313
296
|
//
|
|
314
|
-
//
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
if (cur) lines.push(cur);
|
|
343
|
-
|
|
344
|
-
// For each line: concat parts directly, inserting a synthetic space only
|
|
345
|
-
// when there's a real positional gap (previous part's right edge to next
|
|
346
|
-
// part's x). pdf.js often emits explicit space items (str=" ") with tiny
|
|
347
|
-
// width — those carry the space character themselves, so the position-gap
|
|
348
|
-
// check below typically sees ~0 distance when they're present and we don't
|
|
349
|
-
// double-space.
|
|
350
|
-
const rendered = lines.map(line => {
|
|
351
|
-
line.parts.sort((a, b) => a.x - b.x);
|
|
352
|
-
let text = '';
|
|
353
|
-
let prev = null;
|
|
354
|
-
for (const p of line.parts) {
|
|
355
|
-
if (prev) {
|
|
356
|
-
const gap = p.x - (prev.x + prev.w);
|
|
357
|
-
const lastChar = text.slice(-1);
|
|
358
|
-
const firstChar = p.str.charAt(0);
|
|
359
|
-
// Threshold of 2 user-space units catches inter-word gaps on body
|
|
360
|
-
// text without false-positives inside words. Skip if the boundary
|
|
361
|
-
// already has whitespace from either side.
|
|
362
|
-
if (gap > 2 && !/\s/.test(lastChar) && !/\s/.test(firstChar)) {
|
|
363
|
-
text += ' ';
|
|
364
|
-
}
|
|
365
|
-
}
|
|
366
|
-
text += p.str;
|
|
367
|
-
prev = p;
|
|
368
|
-
}
|
|
369
|
-
const left = line.parts.length ? Math.min(...line.parts.map(p => p.x)) : 0;
|
|
370
|
-
const right = line.parts.length
|
|
371
|
-
? Math.max(...line.parts.map(p => p.x + p.w))
|
|
372
|
-
: 0;
|
|
373
|
-
return { text: text.replace(/\s+/g, ' ').trim(), y: line.y, h: line.h, left, right };
|
|
374
|
-
});
|
|
297
|
+
// Coordinate math mirrors pdf.js's own text-layer builder: multiply the page
|
|
298
|
+
// viewport transform by each item's text matrix, read font height from the
|
|
299
|
+
// resulting matrix, and place the box top at baseline − ascent. Graphics are
|
|
300
|
+
// recovered by walking the operator list with a CTM stack (save/restore/
|
|
301
|
+
// transform) and emitting the device-space bounding box of every painted
|
|
302
|
+
// fill/stroke path. PDFs of this family draw rules as thin filled rectangles,
|
|
303
|
+
// so bbox-only rendering is exact; curves degrade to their bounding box.
|
|
304
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
305
|
+
|
|
306
|
+
// Inline stylesheet embedded in the imported <article class="rwa-pdf">, ahead
// of the page layers. One rule per line:
//   .rwa-pdf       the article wrapper
//   .rwa-pdf-doc   the column that stacks the pages
//   .rwa-pdf-page  one page box (positioning context for its children)
//   .rwa-pdf-t     an absolutely-positioned text run
//   .rwa-pdf-g     an absolutely-positioned graphic rectangle
// The print rule removes the on-screen chrome (background, gaps, shadow).
const PDF_PAGE_STYLE = [
  '<style>',
  '.rwa-pdf{max-width:none;margin:0;padding:0;background:#e9ecef;}',
  '.rwa-pdf-doc{display:flex;flex-direction:column;align-items:center;gap:20px;padding:20px;overflow-x:auto;}',
  '.rwa-pdf-page{position:relative;flex:none;background:#fff;box-shadow:0 1px 5px rgba(0,0,0,.18);overflow:hidden;}',
  '.rwa-pdf-t{position:absolute;white-space:pre;line-height:1;color:#000;transform-origin:0 0;}',
  '.rwa-pdf-g{position:absolute;}',
  '@media print{.rwa-pdf{background:none}.rwa-pdf-doc{gap:0;padding:0;overflow:visible}.rwa-pdf-page{box-shadow:none}}',
  '</style>',
].join('\n');
|
|
314
|
+
|
|
315
|
+
// Escape a text run for use as HTML element content.
//
// The string comes straight out of the PDF, so it must be treated as untrusted:
// '&', '<' and '>' are replaced by their character entities so the text can
// never be parsed as markup. Quotes are left alone — the result is only safe
// as element content, not inside an attribute value.
//
//   s        string to escape
//   returns  the escaped string
function escapePdfText(s) {
  // '&' goes first so the entities produced for '<' and '>' are not re-escaped.
  return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}
|
|
318
|
+
|
|
319
|
+
// Format a number for an inline style: rounded to at most two decimals and
// rendered as a compact string with no unit (callers append "px" themselves).
function pdfNum(n) {
  const hundredths = Math.round(n * 100);
  return String(hundredths / 100);
}
|
|
375
323
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
324
|
+
// Normalise a pdf.js colour operand to a validated CSS colour string.
//
// pdf.js 5.x passes path colors as a single CSS string in args[0] (e.g.
// ["#0000ff"]); older shapes pass [r,g,b] 0–255. A bare string is accepted too.
// The result lands in an inline style attribute, so anything that is not a
// well-formed hex or rgb() colour is rejected and replaced by black.
//
//   a        operator args: [cssString], [r, g, b], or a CSS string
//   returns  lower-cased "#hex", "rgb(r,g,b)", or '#000000' as the fallback
function pdfColorToCss(a) {
  let c = null;
  if (Array.isArray(a)) {
    if (typeof a[0] === 'string') c = a[0];
    else if (a.length >= 3) c = `rgb(${a[0] | 0},${a[1] | 0},${a[2] | 0})`;
  } else if (typeof a === 'string') {
    c = a;
  }
  // CSS hex colours have exactly 3, 4, 6 or 8 digits. A 5- or 7-digit value is
  // not a colour: the browser would drop the declaration and the rectangle
  // would silently render transparent, so fall through to the default instead.
  if (c && /^#(?:[0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})$/i.test(c)) return c.toLowerCase();
  if (c && /^rgb\(\d{1,3},\d{1,3},\d{1,3}\)$/.test(c)) return c;
  return '#000000';
}
|
|
384
337
|
|
|
338
|
+
// True when `css` is one of the recognised spellings of pure white. The value
// is stringified, lower-cased and stripped of whitespace before comparison, so
// "RGB(255, 255, 255)" and " White " both match.
function pdfIsWhitish(css) {
  const normalized = String(css).toLowerCase().replace(/\s+/g, '');
  return ['#fff', '#ffffff', 'white', 'rgb(255,255,255)'].includes(normalized);
}
|
|
342
|
+
|
|
343
|
+
// Derive weight, slant and a CSS font stack for a pdf.js font.
//
// The sanitized fontName ("g_d0_f2") says nothing about weight, so the lookup
// goes through page.commonObjs for the embedded font's real PostScript name
// (e.g. "Cambria-Bold") and keyword-matches it. If the lookup throws or yields
// nothing, the name is treated as empty and the run is regular/upright.
//
//   page      object exposing commonObjs.get(fontName)
//   fontName  pdf.js font id of the text item
//   style     the item's entry from the text content styles (may be undefined);
//             only fontFamily is read
//   returns   { bold, italic, family }
function pdfFontMeta(page, fontName, style) {
  let psName;
  try {
    const fontObj = page.commonObjs.get(fontName);
    psName = (fontObj && fontObj.name) || '';
  } catch {
    // Font not resolved — fall back to an empty name rather than failing the import.
    psName = '';
  }

  const generic = style && style.fontFamily;
  const family = generic === 'sans-serif'
    ? "Helvetica, Arial, sans-serif"
    : generic === 'monospace'
      ? "'Courier New', monospace"
      : "Georgia, 'Times New Roman', serif";

  return {
    bold: /bold|black|heavy|semibold|demibold|extrabold/i.test(psName),
    italic: /italic|oblique/i.test(psName),
    family,
  };
}
|
|
357
|
+
|
|
358
|
+
// Walk the operator list and return device-space rectangles for every visible
// fill/stroke path.
//
// A small graphics-state record (CTM, fill colour, stroke colour, line width)
// is tracked across the list. save/restore push and pop the WHOLE record, not
// just the CTM: a colour or line width set inside a save…restore pair must not
// leak into paths painted after the restore.
//
// For each painted constructPath, the path's local minMax (args[2]) is mapped
// through the CTM via its four corners and the axis-aligned bounding box of the
// result is emitted. Paths whose paint colour is white are skipped (invisible
// on the white page), as are degenerate boxes.
//
//   opList         { fnArray, argsArray } operator list
//   baseTransform  6-element matrix used as the initial CTM
//   Util           object providing transform(m1, m2)
//   OPS            operator-id table
//   returns        array of { x, y, w, h, color }
function collectPdfGraphics(opList, baseTransform, Util, OPS) {
  const out = [];
  let state = { ctm: baseTransform.slice(), fill: '#000000', stroke: '#000000', lineWidth: 1 };
  const stack = [];
  const apply = (m, x, y) => [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];

  // Drop ids this OPS table does not define, so an undefined paint op can never
  // match. The close* variants paint exactly like their plain counterparts;
  // listing them is harmless if constructPath never carries them.
  const opSet = (ops) => new Set(ops.filter((op) => op !== undefined));
  const fillOps = opSet([
    OPS.fill, OPS.eoFill,
    OPS.fillStroke, OPS.eoFillStroke, OPS.closeFillStroke, OPS.closeEOFillStroke,
  ]);
  const strokeOps = opSet([
    OPS.stroke, OPS.closeStroke,
    OPS.fillStroke, OPS.eoFillStroke, OPS.closeFillStroke, OPS.closeEOFillStroke,
  ]);

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];
    const a = opList.argsArray[i];

    if (fn === OPS.save) {
      stack.push({ ...state, ctm: state.ctm.slice() });
    } else if (fn === OPS.restore) {
      // An unbalanced restore is ignored rather than emptying the state.
      if (stack.length) state = stack.pop();
    } else if (fn === OPS.transform) {
      state.ctm = Util.transform(state.ctm, a);
    } else if (fn === OPS.setFillRGBColor) {
      state.fill = pdfColorToCss(a);
    } else if (fn === OPS.setStrokeRGBColor) {
      state.stroke = pdfColorToCss(a);
    } else if (fn === OPS.setLineWidth) {
      state.lineWidth = (typeof a === 'number' ? a : Array.isArray(a) ? a[0] : 1) || 1;
    } else if (fn === OPS.constructPath) {
      const paint = a[0];
      const isFill = fillOps.has(paint);
      const isStroke = strokeOps.has(paint);
      if (!isFill && !isStroke) continue; // endPath / clip → not painted

      const mm = a[2];
      if (!mm || mm.length < 4) continue;

      // Map all four corners: under rotation/skew the extremes are not simply
      // the images of (minX, minY) and (maxX, maxY).
      const px = [];
      const py = [];
      for (const X of [mm[0], mm[2]]) {
        for (const Y of [mm[1], mm[3]]) {
          const [dx, dy] = apply(state.ctm, X, Y);
          px.push(dx);
          py.push(dy);
        }
      }
      const x0 = Math.min(...px);
      const x1 = Math.max(...px);
      const y0 = Math.min(...py);
      const y1 = Math.max(...py);

      // A fill+stroke path is drawn in its fill colour.
      const color = isFill ? state.fill : state.stroke;
      if (pdfIsWhitish(color)) continue; // invisible on the white page

      const w = x1 - x0;
      const h = y1 - y0;
      if (w < 0.01 && h < 0.01) continue;

      // Keep hairlines visible: strokes get their device line width, fills 0.5px.
      const sc = Math.hypot(state.ctm[0], state.ctm[1]) || 1;
      const minThick = isStroke && !isFill ? Math.max(state.lineWidth * sc, 0.5) : 0.5;
      out.push({ x: x0, y: y0, w: Math.max(w, minThick), h: Math.max(h, minThick), color });
    }
  }
  return out;
}
|
|
403
|
+
|
|
404
|
+
// Place one pdf.js text item in device space (angle-aware top/left).
//
// The item's text matrix is combined with the viewport transform; the font
// height is the length of the resulting matrix's second column and the rotation
// comes from its first column. The box's top-left is the baseline origin moved
// "up" by the ascent, along the rotated vertical axis when the text is rotated.
//
//   it                 text item; reads transform, fontName, str, width
//   page               passed through to pdfFontMeta
//   viewportTransform  6-element page → device matrix
//   styles             per-font style table keyed by fontName (ascent, descent, fontFamily)
//   Util               object providing transform(m1, m2)
//   returns            { str, left, right, top, fh, angle, bold, italic, family },
//                      or null when the font height is under 0.1
function placePdfItem(it, page, viewportTransform, styles, Util) {
  const m = Util.transform(viewportTransform, it.transform);
  const fontHeight = Math.hypot(m[2], m[3]);
  if (fontHeight < 0.1) return null;

  const rotation = Math.atan2(m[1], m[0]);
  const fontStyle = styles[it.fontName] || {};

  // Ascent as a fraction of the font height: the declared ascent, else
  // 1 + descent, else 0.8.
  const ascentFrac = fontStyle.ascent
    || (fontStyle.descent && 1 + fontStyle.descent)
    || 0.8;
  const ascent = fontHeight * ascentFrac;

  const upright = Math.abs(rotation) < 1e-3;
  const left = upright ? m[4] : m[4] + ascent * Math.sin(rotation);
  const top = upright ? m[5] - ascent : m[5] - ascent * Math.cos(rotation);

  return {
    str: it.str,
    left,
    right: left + (it.width || 0),
    top,
    fh: fontHeight,
    angle: rotation,
    ...pdfFontMeta(page, it.fontName, fontStyle),
  };
}
|
|
421
|
+
|
|
422
|
+
// Put placed text items into reading order: top-to-bottom by visual line, then
// left-to-right inside each line. Items join the current line while their top
// is within half a font height of the line's first (topmost) item — the same
// tolerance the run builder uses to decide two items share a baseline.
function orderPdfItemsByLine(placed) {
  const byTop = [...placed].sort((a, b) => a.top - b.top || a.left - b.left);
  const byLeft = (a, b) => a.left - b.left;
  const ordered = [];
  let line = [];
  for (const p of byTop) {
    const anchor = line[0];
    if (anchor && Math.abs(p.top - anchor.top) > Math.max(anchor.fh, p.fh) * 0.5) {
      ordered.push(...line.sort(byLeft));
      line = [];
    }
    line.push(p);
  }
  ordered.push(...line.sort(byLeft));
  return ordered;
}

// Reconstruct one page as a positioned layer. Returns { html, textCount }.
//
// Text items are grouped into "runs" — adjacent, same-style glyphs on one
// baseline — and each run is emitted as a single positioned <span> that flows
// naturally. We split a run only at a real column gap, a style change, or a new
// line. This is what fixes word spacing: positioning each item independently
// lets a wider substitute font (the embedded face isn't shipped) overflow its
// slot and collide with the next item, eating the space; a flowing run spaces
// words with the substitute font's own metrics while staying pinned at the
// run's true start x, so columns and table cells stay put.
//
// Items are ordered per visual line before grouping. Sorting by exact `top`
// alone is not enough: two items on the same baseline whose tops differ by a
// fraction of a pixel would sort right-before-left and then be concatenated in
// that wrong order, because the merge test tolerates half a line of vertical
// drift. For the same reason a run is never extended by an item that starts
// well to the LEFT of the run's current right edge.
//
//   page     pdf.js page; uses getViewport, getTextContent, getOperatorList
//   Util     pdf.js Util (matrix helpers), passed through to the helpers
//   OPS      pdf.js operator-id table, passed through to collectPdfGraphics
//   returns  { html, textCount } — textCount is the number of text runs emitted
async function renderPdfPage(page, Util, OPS) {
  const vp = page.getViewport({ scale: 1 });
  const tc = await page.getTextContent();
  const styles = tc.styles || {};
  // getOperatorList yields the graphics and populates commonObjs (fonts).
  const opList = await page.getOperatorList();
  const graphics = collectPdfGraphics(opList, vp.transform, Util, OPS);

  // Graphics go first so the text spans paint on top of rules and boxes.
  const parts = [];
  for (const g of graphics) {
    parts.push(`<div class="rwa-pdf-g" style="left:${pdfNum(g.x)}px;top:${pdfNum(g.y)}px;width:${pdfNum(g.w)}px;height:${pdfNum(g.h)}px;background:${g.color}"></div>`);
  }

  const placed = [];
  for (const it of tc.items) {
    if (!it.transform || !it.str) continue;
    const p = placePdfItem(it, page, vp.transform, styles, Util);
    if (p) placed.push(p);
  }
  const ordered = orderPdfItemsByLine(placed);

  const WORD_GAP = 2; // device px — below this, no inter-item space
  const runs = [];
  let cur = null;
  const sameStyle = (r, p) => r.bold === p.bold && r.italic === p.italic
    && r.family === p.family && Math.abs(r.fh - p.fh) < 0.5;
  for (const p of ordered) {
    const colGap = Math.max(p.fh * 1.2, 12); // wider than a space, narrower than a column
    const gap = cur ? p.left - cur.right : 0;
    const mergeable = cur
      && Math.abs(p.angle) < 1e-3 && Math.abs(cur.angle) < 1e-3
      && Math.abs(p.top - cur.top) <= Math.max(cur.fh, p.fh) * 0.5
      && gap <= colGap
      && gap >= -colGap // a large backward jump is not a continuation of this run
      && sameStyle(cur, p);
    if (mergeable) {
      const lastChar = cur.text.slice(-1);
      const firstChar = p.str.charAt(0);
      if (gap > WORD_GAP && !/\s/.test(lastChar) && !/\s/.test(firstChar)) cur.text += ' ';
      cur.text += p.str;
      // Never let an overlapping short item pull the run's right edge back.
      cur.right = Math.max(cur.right, p.right);
    } else {
      if (cur) runs.push(cur);
      cur = { text: p.str, left: p.left, top: p.top, right: p.right, fh: p.fh, bold: p.bold, italic: p.italic, family: p.family, angle: p.angle };
    }
  }
  if (cur) runs.push(cur);

  let textCount = 0;
  for (const run of runs) {
    const text = run.text.replace(/\s+$/, '');
    if (text.trim() === '') continue; // whitespace-only runs contribute nothing
    const css = [`left:${pdfNum(run.left)}px`, `top:${pdfNum(run.top)}px`, `font-size:${pdfNum(run.fh)}px`, `font-family:${run.family}`];
    if (run.bold) css.push('font-weight:700');
    if (run.italic) css.push('font-style:italic');
    if (Math.abs(run.angle) >= 1e-3) css.push(`transform:rotate(${(run.angle * 180 / Math.PI).toFixed(2)}deg)`);
    parts.push(`<span class="rwa-pdf-t" style="${css.join(';')}">${escapePdfText(text)}</span>`);
    textCount++;
  }

  const html = `<div class="rwa-pdf-page" style="width:${pdfNum(vp.width)}px;height:${pdfNum(vp.height)}px">\n${parts.join('\n')}\n</div>`;
  return { html, textCount };
}
|