rewritable 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/import.mjs CHANGED
@@ -1,24 +1,42 @@
1
1
  import { marked } from 'marked';
2
+ import Papa from 'papaparse';
3
+ import mammoth from 'mammoth';
4
+ import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
2
5
 
3
- export async function convert(ext, content) {
6
/**
 * Dispatch an input document to the converter for its file extension.
 *
 * `convert` takes raw bytes (Buffer / Uint8Array). Text formats decode utf8
 * internally; binary formats consume bytes directly. Switching to bytes was
 * driven by docx/pdf — keeping a single signature avoids a fork.
 *
 * @param {string} ext - File extension without the leading dot ('' is treated
 *   as plain text). Matched case-insensitively so `PDF` / `Docx` work too.
 * @param {Buffer|Uint8Array|string} bytes - Raw file content.
 * @returns {Promise<{html: string, warnings: string[]}>} Result of the
 *   selected converter.
 * @throws {Error} With `exitCode = 2` when the extension is not supported.
 */
export async function convert(ext, bytes) {
  // Extensions taken from real file names are frequently upper- or mixed-case
  // (`REPORT.PDF`); normalise for matching only — the error below still echoes
  // exactly what the caller passed.
  const kind = typeof ext === 'string' ? ext.toLowerCase() : ext;
  switch (kind) {
    case 'md':
    case 'markdown':
      return convertMd(toText(bytes));
    case 'html':
    case 'htm':
      return convertHtml(toText(bytes));
    case 'csv':
      return convertCsv(toText(bytes));
    case 'txt':
    case '':
      return convertTxt(toText(bytes));
    case 'docx':
      return convertDocx(bytes);
    case 'pdf':
      return convertPdf(bytes);
    default: {
      const e = new Error(`unsupported format: .${ext} (supported: .md, .markdown, .html, .htm, .csv, .txt, .docx, .pdf)`);
      e.exitCode = 2;
      throw e;
    }
  }
}
21
33
 
34
/**
 * Decode raw input to a JavaScript string.
 *
 * Strings pass through untouched. Buffers and other byte containers accepted
 * by `Buffer.from` are decoded as UTF-8, and a leading byte-order mark is
 * dropped: `Buffer#toString('utf8')` keeps the BOM as U+FEFF, which would
 * otherwise sit in front of the first markdown / CSV / text token.
 *
 * @param {Buffer|Uint8Array|string} bytes - Raw input.
 * @returns {string} Decoded text without a leading BOM.
 */
function toText(bytes) {
  if (typeof bytes === 'string') return bytes;
  const buf = Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes);
  const text = buf.toString('utf8');
  return text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
}
39
+
22
40
  function convertMd(md) {
23
41
  const html = marked.parse(md, { gfm: true, breaks: false });
24
42
  return { html: `<article>\n${html.trim()}\n</article>`, warnings: [] };
@@ -27,7 +45,12 @@ function convertMd(md) {
27
45
  function convertHtml(input) {
28
46
  const warnings = [];
29
47
 
30
- let body = input.replace(/<!DOCTYPE[^>]*>/gi, '').replace(/<\/?html[^>]*>/gi, '');
48
+ // Strip HTML comments first. Without this, a comment like <!-- </head> -->
49
+ // would terminate the non-greedy head match prematurely and let head content
50
+ // leak into the body. Comments are dropped — acceptable for an offline import
51
+ // CLI; full preservation would require a real parser.
52
+ let body = input.replace(/<!--[\s\S]*?-->/g, '');
53
+ body = body.replace(/<!DOCTYPE[^>]*>/gi, '').replace(/<\/?html[^>]*>/gi, '');
31
54
 
32
55
  const headMatch = body.match(/<head[^>]*>([\s\S]*?)<\/head>/i);
33
56
  let headStyles = '';
@@ -49,6 +72,42 @@ function convertHtml(input) {
49
72
  return { html: headStyles + body, warnings };
50
73
  }
51
74
 
75
/**
 * Cheap sniff test: parse at most two non-empty rows and decide whether the
 * input is plausibly tabular.
 *
 * @param {string} text - Candidate CSV text.
 * @returns {boolean} True when the probe parsed without errors, produced at
 *   least one row of two or more columns, and — if a second row was read —
 *   that row has the same column count as the first.
 */
function looksLikeCsv(text) {
  const { errors, data } = Papa.parse(text, { preview: 2, skipEmptyLines: true, header: false });
  if (errors.length > 0 || data.length === 0) return false;

  const [first, second] = data;
  if (first.length < 2) return false;

  const secondRowMismatch = data.length === 2 && second.length !== first.length;
  return !secondRowMismatch;
}
84
+
85
/**
 * Render CSV text as an HTML table wrapped in `<article>`.
 *
 * The first parsed row becomes `<thead>`; remaining rows become `<tbody>`.
 * Parser errors are reported as warnings rather than aborting the import.
 *
 * @param {string} text - CSV source text.
 * @returns {{html: string, warnings: string[]}} Table markup plus one warning
 *   per parser error.
 * @throws {Error} With `exitCode = 2` when the input fails the CSV probe.
 */
function convertCsv(text) {
  if (!looksLikeCsv(text)) {
    const probeError = new Error('csv probe failed: input does not look like CSV (need ≥2 columns with consistent column count)');
    probeError.exitCode = 2;
    throw probeError;
  }

  // skipEmptyLines drops trailing blank rows that csv exporters often emit.
  // header:false because the thead/tbody split is done here on raw rows.
  const parsed = Papa.parse(text, { skipEmptyLines: true, header: false });
  const warnings = parsed.errors.map(({ message, row }) => (
    row != null ? `csv parse: ${message} (row ${row + 1})` : `csv parse: ${message}`
  ));

  if (parsed.data.length === 0) {
    return { html: '<article>\n</article>', warnings };
  }

  const esc = (value) => String(value == null ? '' : value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const cells = (tag, row) => row.map((cell) => `<${tag}>${esc(cell)}</${tag}>`).join('');

  const [headRow, ...dataRows] = parsed.data;
  let table = `<thead>\n<tr>${cells('th', headRow)}</tr>\n</thead>`;
  if (dataRows.length > 0) {
    const trs = dataRows.map((row) => `<tr>${cells('td', row)}</tr>`).join('\n');
    table += `\n<tbody>\n${trs}\n</tbody>`;
  }
  return { html: `<article>\n<table>\n${table}\n</table>\n</article>`, warnings };
}
110
+
52
111
  function convertTxt(text) {
53
112
  const escape = s => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
54
113
  const blocks = text
@@ -58,3 +117,227 @@ function convertTxt(text) {
58
117
  .map(b => `<p>${escape(b)}</p>`);
59
118
  return { html: `<article>\n${blocks.join('\n')}\n</article>`, warnings: [] };
60
119
  }
120
+
121
/**
 * Convert a .docx file to HTML via mammoth and sanitize its URLs.
 *
 * @param {Buffer|Uint8Array} bytes - Raw .docx content.
 * @returns {Promise<{html: string, warnings: string[]}>} Article-wrapped HTML;
 *   warnings list stripped URLs first, then mammoth's own messages.
 * @throws {Error} With `exitCode = 2` when mammoth fails or yields no content.
 */
async function convertDocx(bytes) {
  // Every failure in this converter is reported with exit code 2.
  const docxError = (message) => {
    const e = new Error(message);
    e.exitCode = 2;
    return e;
  };

  const buffer = Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes);

  let result;
  try {
    result = await mammoth.convertToHtml({ buffer });
  } catch (err) {
    throw docxError(`docx: ${err && err.message ? err.message : String(err)}`);
  }

  const raw = (result.value || '').trim();
  if (!raw) {
    throw docxError('docx: produced empty document — input may be corrupt or empty');
  }

  const { html, skipped } = sanitizeMammothUrls(raw);
  const warnings = [];
  for (const note of skipped) warnings.push(`docx: ${note}`);
  for (const msg of result.messages || []) warnings.push(`docx: ${msg.message}`);

  return { html: `<article>\n${html}\n</article>`, warnings };
}
144
+
145
// Mammoth doesn't filter URL schemes — a docx with a `javascript:` hyperlink
// would land in the imported document and execute on click (stored XSS in the
// downloaded rwa container). Strip unsafe schemes from href/src and replace
// with `#`. Mammoth's HTML writer always uses double-quoted attributes and
// escapes &, ", <, > inside values, so a regex match against `attr="..."` is
// sufficient — no quote-escape ambiguity to worry about.
const _SAFE_HREF_SCHEMES = new Set(['http', 'https', 'mailto', 'tel']);

/**
 * Normalise a URL attribute value the way a URL parser does before it looks
 * for a scheme, then extract that scheme.
 *
 * URL parsers remove ASCII tab / LF / CR from anywhere in the input and strip
 * leading C0 control characters and spaces. Without mirroring that, a value
 * such as "java<TAB>script:alert(1)" shows no scheme to a naive regex, gets
 * classified as a relative URL, and still runs as `javascript:` in a browser.
 *
 * @param {string} val - Raw attribute value.
 * @returns {{normalized: string, scheme: (string|null)}} Normalised value and
 *   its lower-cased scheme, or `null` when the value has no scheme.
 */
function _urlScheme(val) {
  const normalized = String(val)
    .replace(/[\t\n\r]/g, '')
    // `\s` is kept in addition to the C0 range so nothing the previous
    // `^\s*` prefix tolerated is now treated more leniently.
    .replace(/^[\s\u0000-\u0020]+/, '');
  const m = normalized.match(/^([a-z][a-z0-9+.\-]*):/i);
  return { normalized, scheme: m ? m[1].toLowerCase() : null };
}

/**
 * Decide whether a URL attribute value may be kept as-is.
 *
 * @param {string} attr - Attribute name (`'href'` or `'src'`).
 * @param {string} val - Attribute value as it appears between the quotes.
 * @returns {boolean} True for scheme-less values, allow-listed schemes, and
 *   `data:image/...` on `src` only.
 */
function _attrIsSafe(attr, val) {
  const { normalized, scheme } = _urlScheme(val);
  if (scheme === null) return true; // no scheme: relative URL, fragment, or path
  if (_SAFE_HREF_SCHEMES.has(scheme)) return true;
  // Mammoth embeds images as data:image/...;base64,... — allow that one
  // narrow shape on src only; any other data: payload is rejected.
  if (attr === 'src' && scheme === 'data' && /^data:image\//i.test(normalized)) return true;
  return false;
}

/**
 * Replace unsafe `href` / `src` values in mammoth's HTML with `#`.
 *
 * @param {string} html - HTML produced by mammoth.
 * @returns {{html: string, skipped: string[]}} Sanitised HTML and one
 *   human-readable note per replaced attribute.
 */
function sanitizeMammothUrls(html) {
  const skipped = [];
  const stripAttr = (attr) => (full, val) => {
    if (_attrIsSafe(attr, val)) return full;
    const { scheme } = _urlScheme(val);
    skipped.push(`stripped unsafe ${attr} (scheme: ${scheme === null ? 'unknown' : scheme}:)`);
    return `${attr}="#"`;
  };
  return {
    html: html
      .replace(/href="([^"]*)"/g, stripAttr('href'))
      .replace(/src="([^"]*)"/g, stripAttr('src')),
    skipped,
  };
}
180
+
181
/**
 * Extract text from a PDF and render it as `<p>` blocks inside `<article>`.
 *
 * @param {Buffer|Uint8Array} bytes - Raw PDF content.
 * @returns {Promise<{html: string, warnings: string[]}>} Paragraph markup and
 *   a fixed warning that the layout was reconstructed heuristically.
 * @throws {Error} With `exitCode = 2` when the document cannot be opened
 *   (including password-protected files) or contains no extractable text.
 *   Errors raised while reading individual pages propagate unchanged.
 */
async function convertPdf(bytes) {
  // pdfjs explicitly rejects Node's Buffer (despite Buffer extending Uint8Array)
  // and wants a plain Uint8Array view.
  // NOTE(review): this is a view over the caller's memory, not a copy — confirm
  // whether pdfjs may transfer/detach the underlying ArrayBuffer for callers
  // that reuse `bytes` afterwards.
  const src = bytes instanceof Uint8Array ? bytes : Buffer.from(bytes);
  const data = new Uint8Array(src.buffer, src.byteOffset, src.byteLength);
  let doc;
  try {
    doc = await pdfjs.getDocument({ data, isEvalSupported: false }).promise;
  } catch (err) {
    const name = err && err.name;
    if (name === 'PasswordException') {
      const e = new Error('pdf: file is password-protected');
      e.exitCode = 2;
      throw e;
    }
    const e = new Error(`pdf: ${err && err.message ? err.message : String(err)}`);
    e.exitCode = 2;
    throw e;
  }

  const escape = s => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
  const paragraphs = [];
  try {
    for (let p = 1; p <= doc.numPages; p++) {
      const page = await doc.getPage(p);
      const tc = await page.getTextContent();
      for (const line of extractParagraphs(tc.items)) paragraphs.push(line);
      paragraphs.push(null); // page break: forces flush of next paragraph
    }
  } finally {
    // Release the document even when a page fails to load or parse; cleanup
    // stays best-effort so it never masks the original error.
    await doc.destroy().catch(() => {});
  }

  // Merge consecutive lines into one paragraph; '' (paragraph break) and null
  // (page break) both close the current paragraph.
  const blocks = [];
  let buf = [];
  const flush = () => {
    const joined = buf.join(' ').replace(/\s+/g, ' ').trim();
    if (joined) blocks.push(`<p>${escape(joined)}</p>`);
    buf = [];
  };
  for (const line of paragraphs) {
    if (line === null || line === '') { flush(); continue; }
    buf.push(line);
  }
  flush();

  if (blocks.length === 0) {
    const e = new Error('pdf: no extractable text — this looks like a scanned/image PDF; OCR is not supported');
    e.exitCode = 2;
    throw e;
  }
  return {
    html: `<article>\n${blocks.join('\n')}\n</article>`,
    warnings: ['pdf: layout reconstructed by heuristics — review headings/lists manually'],
  };
}
233
+
234
// Group pdf.js text items into paragraph-shaped lines.
//
// Two non-obvious problems this handles:
// (1) Adjacent items inside a word: pdf.js returns text per font run, so a
//     word like "Aufwände" comes out as ["Aufw", "ä", "nde"] when the umlaut
//     glyph lives in a different font table from the ASCII letters. Joining
//     with ' ' produces "Aufw ä nde" — wrong. We concat directly and only
//     synthesize a space when there's a real positional x-gap.
// (2) Stacked short lines: an address block (Name / Street / City) has small
//     y-gaps that fit inside the within-paragraph threshold, so naive logic
//     would join them into one paragraph "Name Street City". We additionally
//     break paragraphs when the *previous* line ended significantly short of
//     the page's typical right margin (a heuristic for "hard line break").
//
// Items are read defensively: missing `transform`, `width`, `height` fall back
// to defaults, and an item without a string `str` contributes no text instead
// of throwing or injecting the literal "undefined".
//
// Returns an array of strings; '' marks a paragraph break.
function extractParagraphs(items) {
  if (!items || items.length === 0) return [];

  const rows = items.map((it) => ({
    str: typeof it.str === 'string' ? it.str : '',
    y: it.transform ? it.transform[5] : 0,
    x: it.transform ? it.transform[4] : 0,
    w: it.width || 0,
    h: it.height || (it.transform ? Math.abs(it.transform[3]) : 0) || 12,
  }));

  // Sort top-to-bottom (y desc in PDF coords), then left-to-right within a
  // row, so the same-y grouping below does not depend on content-stream order.
  rows.sort((a, b) => b.y - a.y || a.x - b.x);

  // Group into visual lines by y (within half a line height).
  const lines = [];
  let cur = null;
  for (const r of rows) {
    if (cur && Math.abs(r.y - cur.y) <= cur.h * 0.5) {
      cur.parts.push(r);
      cur.y = (cur.y + r.y) / 2;
    } else {
      if (cur) lines.push(cur);
      cur = { y: r.y, h: r.h, parts: [r] };
    }
  }
  if (cur) lines.push(cur);

  // For each line: concat parts directly, inserting a synthetic space only
  // when there's a real positional gap (previous part's right edge to next
  // part's x) and neither side of the boundary already carries whitespace.
  const rendered = lines.map((line) => {
    line.parts.sort((a, b) => a.x - b.x);
    let text = '';
    let prev = null;
    // Every line holds at least one part, so these bounds are always
    // overwritten; tracking them in the loop avoids spreading large arrays
    // into Math.min / Math.max.
    let left = Infinity;
    let right = -Infinity;
    for (const p of line.parts) {
      if (prev) {
        const gap = p.x - (prev.x + prev.w);
        const lastChar = text.slice(-1);
        const firstChar = p.str.charAt(0);
        // Threshold of 2 user-space units catches inter-word gaps on body
        // text without false-positives inside words.
        if (gap > 2 && !/\s/.test(lastChar) && !/\s/.test(firstChar)) {
          text += ' ';
        }
      }
      text += p.str;
      prev = p;
      left = Math.min(left, p.x);
      right = Math.max(right, p.x + p.w);
    }
    return { text: text.replace(/\s+/g, ' ').trim(), y: line.y, h: line.h, left, right };
  });

  // The page's "typical right margin": the right edge at index
  // floor(0.9 * n) of the ascending-sorted non-empty lines. Lines ending well
  // short of this are likely hard line-breaks, not soft wraps.
  const sortedRights = rendered.filter((l) => l.text).map((l) => l.right).sort((a, b) => a - b);
  const margin = sortedRights.length
    ? sortedRights[Math.floor(sortedRights.length * 0.9)]
    : 0;

  const out = [];
  let prevLine = null;
  for (const line of rendered) {
    if (!line.text) continue; // whitespace-only / empty lines carry no content
    if (prevLine != null) {
      const yGap = Math.abs(prevLine.y - line.y);
      const yJump = yGap > prevLine.h * 1.5;
      // Previous line ended more than 1.5× its height short of the margin —
      // the signature of a hard line-break (address line, table cell, bullet).
      const prevShortOfMargin = margin > 0 && (margin - prevLine.right) > prevLine.h * 1.5;
      // A left-edge shift of more than a line-height is a structural change,
      // not text-flow continuation (e.g. right-aligned blocks).
      const leftJump = Math.abs(prevLine.left - line.left) > line.h;
      if (yJump || prevShortOfMargin || leftJump) out.push('');
    }
    out.push(line.text);
    prevLine = line;
  }
  return out;
}
package/src/seed.mjs CHANGED
@@ -16,9 +16,18 @@ const TITLE_RE = /<title>[^<]*<\/title>/;
16
16
  const FILE_RE = /(FILE\s*:\s*)'[^']*'/;
17
17
 
18
18
  export function applySeedSubs(seed, { uuid, title, fileMeta }) {
19
- const uuidMatches = seed.match(new RegExp(UUID_RE.source, 'g')) || [];
20
- if (uuidMatches.length !== 1) {
21
- throw new Error(`seed must contain exactly one DOC_UUID line, found ${uuidMatches.length}`);
19
+ // All three substitution sites must appear exactly once. A regression in the
20
+ // seed (title removed, FILE renamed, etc.) would otherwise silently no-op
21
+ // and ship a CLI emitting partially-substituted containers.
22
+ for (const { re, label } of [
23
+ { re: UUID_RE, label: 'DOC_UUID' },
24
+ { re: TITLE_RE, label: '<title>' },
25
+ { re: FILE_RE, label: 'FILE:' },
26
+ ]) {
27
+ const matches = seed.match(new RegExp(re.source, 'g')) || [];
28
+ if (matches.length !== 1) {
29
+ throw new Error(`seed must contain exactly one ${label} line, found ${matches.length}`);
30
+ }
22
31
  }
23
32
  let out = seed.replace(UUID_RE, `const DOC_UUID = '${uuid}';`);
24
33
  if (title != null) out = out.replace(TITLE_RE, `<title>${escapeHtml(title)}</title>`);
@@ -27,7 +36,9 @@ export function applySeedSubs(seed, { uuid, title, fileMeta }) {
27
36
  }
28
37
 
29
38
  // Mirrors the bootstrap's escapeTL — keep in sync with seeds/rewritable.html.
30
- const escapeTL = s => s
39
+ // LF-canonicalizes first; rwa-edit/1 invariant is that on-disk docs are LF-only.
40
// Converts every CRLF pair and every lone CR to LF; null/undefined become ''.
const canonLF = (s) => (s == null ? '' : String(s).replace(/\r\n?/g, '\n'));
41
+ const escapeTL = s => canonLF(s)
31
42
  .replace(/\\/g, '\\\\')
32
43
  .replace(/`/g, '\\`')
33
44
  .replace(/\$\{/g, '\\${')