rewritable 0.1.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +263 -5
- package/bin/rwa.mjs +1033 -6
- package/package.json +7 -4
- package/seeds/rewritable.html +6989 -156
- package/src/agent-loop.mjs +155 -0
- package/src/apply-edits.mjs +664 -0
- package/src/atomic-write.mjs +38 -0
- package/src/backend.mjs +43 -0
- package/src/clone-extract.mjs +249 -0
- package/src/clone.mjs +161 -0
- package/src/commands.mjs +207 -11
- package/src/create.mjs +256 -0
- package/src/doc.mjs +69 -0
- package/src/dsl-compiler.mjs +357 -0
- package/src/edit.mjs +300 -0
- package/src/fetch-page.mjs +346 -0
- package/src/host.mjs +126 -0
- package/src/identity.mjs +257 -0
- package/src/import-claude.mjs +360 -0
- package/src/import-vision.mjs +156 -0
- package/src/import.mjs +357 -8
- package/src/ls.mjs +105 -0
- package/src/publish-site.mjs +85 -0
- package/src/publish.mjs +98 -0
- package/src/seed-extract.mjs +40 -0
- package/src/seed.mjs +1399 -6
- package/src/self-contained.mjs +115 -0
- package/src/skill-manifest.mjs +227 -0
- package/src/skin.mjs +350 -0
- package/src/skins.mjs +274 -0
- package/src/template.mjs +109 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
|
|
4
|
+
// PDF → HTML via OpenRouter chat completions.
|
|
5
|
+
//
|
|
6
|
+
// Why this exists: pdfjs's text extraction produces flat-paragraph output
|
|
7
|
+
// that loses tables, multi-column layouts, and any text whose font has a
|
|
8
|
+
// broken toUnicode CMap (e.g. "Ü" decoded as "UY"). Sending the raw PDF to
|
|
9
|
+
// a vision-capable model bypasses both — the model reads the rendered
|
|
10
|
+
// content and reconstructs semantic HTML.
|
|
11
|
+
//
|
|
12
|
+
// Trade-off: ~$0.01-$0.05 per page in API costs, network round-trip
|
|
13
|
+
// latency. Opt-in via `rwa import file.pdf --vision`.
|
|
14
|
+
//
|
|
15
|
+
// Wire format: OpenRouter's PDF input docs say content type is "file" with
|
|
16
|
+
// `file_data: "data:application/pdf;base64,..."`. For Anthropic models OR
|
|
17
|
+
// passes this through as a native PDF document block; for others (Gemini,
|
|
18
|
+
// GPT-4o), it's routed through OR's file-parser plugin (engine "native"
|
|
19
|
+
// uses the model's own multimodal capability).
|
|
20
|
+
|
|
21
|
+
const OPENROUTER_URL = 'https://openrouter.ai/api/v1/chat/completions';
|
|
22
|
+
|
|
23
|
+
const SYSTEM_PROMPT = `You are converting a PDF document into clean, semantic HTML for embedding in a single-file rewritable document container.
|
|
24
|
+
|
|
25
|
+
Output requirements:
|
|
26
|
+
- A single <article> element containing all document content.
|
|
27
|
+
- Use semantic HTML: <h1>-<h6> for headings, <p> for paragraphs, <ul>/<ol>/<li> for lists, <table><thead><tbody><tr><td>/<th> for tables, <strong>/<em> for emphasis, <a href="..."> for links.
|
|
28
|
+
- Do NOT output <html>, <head>, <body>, <!doctype>, any preamble, or any explanation before or after the HTML.
|
|
29
|
+
- Do NOT wrap output in markdown code fences (no \`\`\`html).
|
|
30
|
+
- Preserve text content exactly — do not summarize, paraphrase, translate, or reword.
|
|
31
|
+
- Reconstruct multi-column layouts and tables faithfully. Table headers go in <thead>, body rows in <tbody>.
|
|
32
|
+
- Omit <img> entirely; this container is text-focused. If an image carries information, describe it briefly in a <p>.
|
|
33
|
+
- No <script>, <style>, class, or id attributes. Plain semantic HTML only.
|
|
34
|
+
|
|
35
|
+
Output ONLY the <article>...</article> element.`;
|
|
36
|
+
|
|
37
|
+
const USER_PROMPT = 'Convert this PDF document to a single <article> element of clean semantic HTML, following the rules in the system prompt.';
|
|
38
|
+
|
|
39
|
+
/**
 * Convert a PDF to a single semantic `<article>` element by sending the raw
 * file to a model through OpenRouter's chat-completions endpoint.
 *
 * @param {Buffer|Uint8Array} bytes   PDF content
 * @param {object} [opts]
 * @param {string} [opts.apiKey]      OpenRouter API key. If omitted, read from
 *                                    process.env.OPENROUTER_API_KEY, then ./.env
 * @param {string} [opts.model]       OpenRouter model id; falls back to the
 *                                    hard-coded default in the request body
 * @param {AbortSignal} [opts.signal] Forwarded to fetch for cancellation
 * @returns {Promise<{ html: string, warnings: string[] }>}
 * @throws {Error} with `exitCode = 2` when the key is missing, the HTTP call
 *   fails, the response carries an error object, the body is not JSON, the
 *   content is empty, or no <article> element can be extracted.
 */
export async function convertPdfViaVision(bytes, { apiKey, model, signal } = {}) {
  // Every failure in this function is a usage/runtime error for the CLI.
  const fail = (message, cause) => {
    const e = cause === undefined ? new Error(message) : new Error(message, { cause });
    e.exitCode = 2;
    return e;
  };

  apiKey = apiKey || process.env.OPENROUTER_API_KEY || await readDotEnvKey('OPENROUTER_API_KEY');
  if (!apiKey) {
    throw fail('vision: OPENROUTER_API_KEY is required (set in env or ./.env)');
  }
  const buf = Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes);
  const dataUri = `data:application/pdf;base64,${buf.toString('base64')}`;

  const body = {
    model: model || 'google/gemini-3.5-flash',
    messages: [
      { role: 'system', content: SYSTEM_PROMPT },
      {
        role: 'user',
        content: [
          { type: 'text', text: USER_PROMPT },
          { type: 'file', file: { filename: 'document.pdf', file_data: dataUri } },
        ],
      },
    ],
    // Generous output budget — long PDFs can produce a lot of HTML.
    // OpenRouter will clamp to model's actual max if smaller.
    max_tokens: 16384,
    // Deterministic output — we want the same HTML for the same input.
    temperature: 0,
  };

  const res = await fetch(OPENROUTER_URL, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
      // Recommended by OpenRouter for tracking, helps with rate-limit accounting.
      'HTTP-Referer': 'https://github.com/martintreiber/rewritable',
      'X-Title': 'rwa CLI',
    },
    body: JSON.stringify(body),
    signal,
  });

  if (!res.ok) {
    const text = await res.text().catch(() => '');
    throw fail(`vision: openrouter ${res.status}${text ? ': ' + text.slice(0, 500) : ''}`);
  }

  let json;
  try {
    json = await res.json();
  } catch (err) {
    // A caller-initiated abort must stay recognisable as an abort.
    if (err && err.name === 'AbortError') throw err;
    throw fail(`vision: openrouter returned a non-JSON response: ${err && err.message ? err.message : String(err)}`, err);
  }

  // Failures that happen while the model is generating arrive as an `error`
  // object inside a 200 response; report them instead of "empty content".
  if (json?.error) {
    const { code, message } = typeof json.error === 'object' ? json.error : { message: String(json.error) };
    const detail = String(message || JSON.stringify(json.error)).slice(0, 500);
    throw fail(`vision: openrouter error${code != null ? ` ${code}` : ''}: ${detail}`);
  }

  const choice = json?.choices?.[0];
  const content = choice?.message?.content;
  if (typeof content !== 'string' || !content.trim()) {
    throw fail('vision: openrouter returned empty content');
  }

  // finish_reason "length" means the output budget ran out mid-document.
  const truncated = choice?.finish_reason === 'length';

  const html = extractArticle(content);
  if (!html) {
    if (truncated) {
      throw fail(
        `vision: model output was cut off at the output-token limit (max_tokens ${body.max_tokens}) before </article> — the PDF is too long for a single request`
      );
    }
    throw fail(
      `vision: model output did not contain an <article> element. Output preview:\n${content.slice(0, 300)}`
    );
  }

  const warnings = [];
  // Surface usage so the user sees what each import cost.
  const usage = json?.usage;
  if (usage) {
    const tokens = `${usage.prompt_tokens || 0} in / ${usage.completion_tokens || 0} out`;
    warnings.push(`vision: ${body.model} (${tokens} tokens)`);
  }
  if (truncated) {
    warnings.push('vision: output hit the token limit — content near the end of the document may be missing');
  }
  return { html, warnings };
}
|
|
124
|
+
|
|
125
|
+
// Tiny .env lookup used only as the last fallback for OPENROUTER_API_KEY.
// Reads ./.env relative to the current working directory and returns the value
// of the first `NAME=value` line whose key equals `name`. Accepts an optional
// `export` prefix, whitespace around the key and value, and one pair of
// matching single or double quotes around the value. No interpolation and no
// multi-line values. Resolves to null when the file cannot be read, the key is
// absent, or the value is empty.
async function readDotEnvKey(name) {
  const envFile = path.join(process.cwd(), '.env');
  let contents;
  try {
    contents = await fs.readFile(envFile, 'utf8');
  } catch (_) {
    return null;
  }

  const assignment = /^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*?)\s*$/;
  // The first line that assigns `name` wins, even if its value is empty.
  const hit = contents
    .split('\n')
    .map((row) => row.match(assignment))
    .find((m) => m && m[1] === name);
  if (!hit) return null;

  const raw = hit[2];
  const doubleQuoted = raw.startsWith('"') && raw.endsWith('"');
  const singleQuoted = raw.startsWith("'") && raw.endsWith("'");
  const value = doubleQuoted || singleQuoted ? raw.slice(1, -1) : raw;
  return value || null;
}
|
|
145
|
+
|
|
146
|
+
// Extract the outermost <article>...</article>. Models often wrap output in
// ```html fences or add a "Here is the HTML:" preamble despite the system
// prompt; pull out only the article element to be robust to that.
//
// Both tags are matched case-insensitively, and the closing tag may carry
// whitespace before `>`, so `<ARTICLE>…</ARTICLE>` and `</article >` are
// accepted just like the lowercase form. Returns the trimmed element text, or
// null when there is no opening tag or no closing tag after it.
function extractArticle(text) {
  // First <article (attributes allowed) …
  const start = text.search(/<article(?:\s[^>]*)?>/i);
  if (start < 0) return null;
  // … through the END of the LAST closing tag.
  let end = -1;
  for (const m of text.matchAll(/<\/article\s*>/gi)) {
    end = m.index + m[0].length;
  }
  // end <= start covers both "no closing tag" and "closing tag precedes opening tag".
  if (end <= start) return null;
  return text.slice(start, end).trim();
}
|
package/src/import.mjs
CHANGED
|
@@ -1,33 +1,57 @@
|
|
|
1
1
|
import { marked } from 'marked';
|
|
2
|
+
import Papa from 'papaparse';
|
|
3
|
+
import mammoth from 'mammoth';
|
|
4
|
+
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
2
5
|
|
|
3
|
-
|
|
6
|
+
// Single entry point for every import format. `bytes` is the raw file content
// (Buffer / Uint8Array): text formats are decoded to utf8 here before being
// handed to their converter, binary formats (docx, pdf) receive the bytes
// untouched. One signature for both avoids a text/binary fork at call sites.
// `ext` is the file extension without the dot; '' is treated like 'txt'.
// Rejects with an Error carrying exitCode 2 for unknown extensions.
export async function convert(ext, bytes) {
  const asText = () => toText(bytes);
  // Handlers are thunks so nothing is decoded or converted until a route matches.
  const routes = new Map([
    ['md', () => convertMd(asText())],
    ['markdown', () => convertMd(asText())],
    ['html', () => convertHtml(asText())],
    ['htm', () => convertHtml(asText())],
    ['csv', () => convertCsv(asText())],
    ['txt', () => convertTxt(asText())],
    ['', () => convertTxt(asText())],
    ['docx', () => convertDocx(bytes)],
    ['pdf', () => convertPdf(bytes)],
  ]);

  const handler = routes.get(ext);
  if (handler) return handler();

  const e = new Error(`unsupported format: .${ext} (supported: .md, .markdown, .html, .htm, .csv, .txt, .docx, .pdf)`);
  e.exitCode = 2;
  throw e;
}
|
|
21
33
|
|
|
34
|
+
// Decode converter input to a utf8 string. Strings pass through unchanged;
// Buffers are decoded directly; anything else is first wrapped via Buffer.from.
function toText(bytes) {
  if (typeof bytes === 'string') return bytes;
  const buf = Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes);
  return buf.toString('utf8');
}
|
|
39
|
+
|
|
22
40
|
// Markdown → <article>. The markdown is rendered with marked (GFM on, no
// line-break conversion), the result is passed through sanitizeImportedHtml,
// and the sanitizer's warnings are returned alongside the wrapped HTML.
function convertMd(md) {
  const rendered = marked.parse(md, { gfm: true, breaks: false });
  const sanitized = sanitizeImportedHtml(rendered);
  const inner = sanitized.html.trim();
  return { html: `<article>\n${inner}\n</article>`, warnings: sanitized.warnings };
}
|
|
26
45
|
|
|
27
46
|
function convertHtml(input) {
|
|
28
47
|
const warnings = [];
|
|
29
48
|
|
|
30
|
-
|
|
49
|
+
// Strip HTML comments first. Without this, a comment like <!-- </head> -->
|
|
50
|
+
// would terminate the non-greedy head match prematurely and let head content
|
|
51
|
+
// leak into the body. Comments are dropped — acceptable for an offline import
|
|
52
|
+
// CLI; full preservation would require a real parser.
|
|
53
|
+
let body = input.replace(/<!--[\s\S]*?-->/g, '');
|
|
54
|
+
body = body.replace(/<!DOCTYPE[^>]*>/gi, '').replace(/<\/?html[^>]*>/gi, '');
|
|
31
55
|
|
|
32
56
|
const headMatch = body.match(/<head[^>]*>([\s\S]*?)<\/head>/i);
|
|
33
57
|
let headStyles = '';
|
|
@@ -49,6 +73,42 @@ function convertHtml(input) {
|
|
|
49
73
|
return { html: headStyles + body, warnings };
|
|
50
74
|
}
|
|
51
75
|
|
|
76
|
+
// Cheap plausibility probe: parse at most two non-empty rows and accept the
// input as CSV only when parsing reported no errors, at least one row exists,
// the first row has two or more columns, and — if a second row was read — it
// has the same number of columns as the first.
function looksLikeCsv(text) {
  const { errors, data } = Papa.parse(text, { preview: 2, skipEmptyLines: true, header: false });
  if (errors.length > 0 || data.length === 0) return false;

  const width = data[0].length;
  if (width < 2) return false;

  const secondRowDiffers = data.length === 2 && data[1].length !== width;
  return !secondRowDiffers;
}
|
|
85
|
+
|
|
86
|
+
// CSV → <article><table>. The first parsed row becomes <thead>, all remaining
// rows become <tbody>. Parser errors are reported as warnings (1-based row
// numbers) rather than aborting. Throws an Error with exitCode 2 when the
// input fails the looksLikeCsv probe.
function convertCsv(text) {
  if (!looksLikeCsv(text)) {
    const e = new Error('csv probe failed: input does not look like CSV (need ≥2 columns with consistent column count)');
    e.exitCode = 2;
    throw e;
  }
  // skipEmptyLines drops trailing blank rows that csv exporters often emit.
  // header:false because we own the first-row-as-thead split and want raw rows.
  const result = Papa.parse(text, { skipEmptyLines: true, header: false });
  const warnings = result.errors.map(e => {
    const where = e.row != null ? ` (row ${e.row + 1})` : '';
    return `csv parse: ${e.message}${where}`;
  });
  const rows = result.data;
  if (rows.length === 0) {
    return { html: '<article>\n</article>', warnings };
  }
  // Cell text is untrusted: it must be entity-encoded before it is placed in
  // markup, otherwise a cell such as `<script>…` becomes live HTML. `&` is
  // encoded first so the entities produced for `<` and `>` are not re-encoded.
  const escape = s => String(s == null ? '' : s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const [header, ...body] = rows;
  const thead = `<thead>\n<tr>${header.map(c => `<th>${escape(c)}</th>`).join('')}</tr>\n</thead>`;
  const tbody = body.length === 0
    ? ''
    : `\n<tbody>\n${body.map(row => `<tr>${row.map(c => `<td>${escape(c)}</td>`).join('')}</tr>`).join('\n')}\n</tbody>`;
  return { html: `<article>\n<table>\n${thead}${tbody}\n</table>\n</article>`, warnings };
}
|
|
111
|
+
|
|
52
112
|
function convertTxt(text) {
|
|
53
113
|
const escape = s => s.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>');
|
|
54
114
|
const blocks = text
|
|
@@ -58,3 +118,292 @@ function convertTxt(text) {
|
|
|
58
118
|
.map(b => `<p>${escape(b)}</p>`);
|
|
59
119
|
return { html: `<article>\n${blocks.join('\n')}\n</article>`, warnings: [] };
|
|
60
120
|
}
|
|
121
|
+
|
|
122
|
+
// DOCX → <article>. mammoth renders the document to HTML, the result is run
// through sanitizeMammothUrls, and both the sanitizer's notes and mammoth's own
// messages are returned as `docx: …` warnings. Rejects with an Error carrying
// exitCode 2 when mammoth fails or produces no content.
async function convertDocx(bytes) {
  const fail = (message) => {
    const e = new Error(message);
    e.exitCode = 2;
    return e;
  };

  const buffer = Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes);

  let converted;
  try {
    converted = await mammoth.convertToHtml({ buffer });
  } catch (err) {
    throw fail(`docx: ${err && err.message ? err.message : String(err)}`);
  }

  const raw = (converted.value || '').trim();
  if (!raw) {
    throw fail('docx: produced empty document — input may be corrupt or empty');
  }

  const { html, skipped } = sanitizeMammothUrls(raw);
  const notes = (converted.messages || []).map((m) => m.message);
  // Sanitizer notes come first, then mammoth's messages — all with one prefix.
  const warnings = [...skipped, ...notes].map((w) => `docx: ${w}`);
  return { html: `<article>\n${html}\n</article>`, warnings };
}
|
|
145
|
+
|
|
146
|
+
// Mammoth doesn't filter URL schemes — a docx with a `javascript:` hyperlink
|
|
147
|
+
// would land in the imported document and execute on click (stored XSS in the
|
|
148
|
+
// downloaded rwa container). Strip unsafe schemes from href/src and replace
|
|
149
|
+
// with `#`. Mammoth's HTML writer always uses double-quoted attributes and
|
|
150
|
+
// escapes &, ", <, > inside values, so a regex match against `attr="..."` is
|
|
151
|
+
// sufficient — no quote-escape ambiguity to worry about.
|
|
152
|
+
const _SAFE_HREF_SCHEMES = new Set(['http', 'https', 'mailto', 'tel']);
// Two layers, both required:
//   1) Strip invisibles before parsing — whitespace, control characters
//      (General_Category Cc: \x00-\x1f, \x7f-\x9f) and format characters
//      (General_Category Cf: soft hyphen, ZWSP/ZWNJ/ZWJ, LRM/RLM,
//      LRE/RLE/PDF/LRO/RLO, word joiner, BOM, …). JS `\s` alone does not match
//      the Cc/Cf characters, so a value such as `java<ZWSP>script:…` would
//      otherwise slip past the scheme check.
//      The class is written with Unicode property escapes instead of literal
//      invisible characters so it survives editors, diff viewers and
//      copy/paste intact.
//   2) Parse via WHATWG URL. Resolve against a synthetic base so scheme-less
//      inputs (relative URL, fragment, path) round-trip back to that base and
//      pass.
const _ATTR_STRIP_RE = /[\s\p{Cc}\p{Cf}]/gu;
const _SANITIZER_BASE = 'http://_rwa_sanitizer_base_/';

/**
 * Decide whether a URL-bearing attribute value may be kept.
 *
 * @param {string} attr  'src' enables the data:image/* allowance; any other
 *                       value is treated like href.
 * @param {*} val        Raw attribute value (coerced with String()).
 * @returns {boolean}    true when the value is unparseable, resolves relative
 *                       to the synthetic base, uses an allow-listed scheme, or
 *                       is a data:image/* URL on `src`; false otherwise.
 */
function _attrIsSafe(attr, val) {
  const normalized = String(val).replace(_ATTR_STRIP_RE, '');
  let parsed;
  try { parsed = new URL(normalized, _SANITIZER_BASE); }
  catch { return true; } // unparseable → cannot be an active URL scheme
  if (parsed.origin === 'http://_rwa_sanitizer_base_') return true; // resolved relative — no scheme
  const proto = parsed.protocol.replace(/:$/, '').toLowerCase();
  if (_SAFE_HREF_SCHEMES.has(proto)) return true;
  // Mammoth embeds raster images as data:image/...;base64,... — allow on src.
  // data:image/svg+xml passes here too, but <img src> renders SVG in image-
  // loading mode with no script execution (HTML spec), so the narrow
  // 'data:image/*' allowance is still safe for src. Keep scoped to src only.
  if (attr === 'src' && proto === 'data' && /^data:image\//i.test(parsed.href)) return true;
  return false;
}
|
|
179
|
+
// Neutralise unsafe URLs in mammoth output. Every double-quoted href="…" and
// src="…" value is checked with _attrIsSafe; unsafe values are replaced by "#"
// and a human-readable note (including the detected scheme, or "unknown") is
// recorded. All href attributes are processed before any src attribute, so
// href notes precede src notes. Returns the rewritten HTML and the notes.
function sanitizeMammothUrls(html) {
  const skipped = [];
  const passes = [
    ['href', /href="([^"]*)"/g],
    ['src', /src="([^"]*)"/g],
  ];

  const cleaned = passes.reduce(
    (markup, [attr, pattern]) => markup.replace(pattern, (whole, value) => {
      if (_attrIsSafe(attr, value)) return whole;
      const found = value.match(/^\s*([a-z][a-z0-9+.\-]*):/i);
      const scheme = found ? found[1].toLowerCase() : 'unknown';
      skipped.push(`stripped unsafe ${attr} (scheme: ${scheme}:)`);
      return `${attr}="#"`;
    }),
    html
  );

  return { html: cleaned, skipped };
}
|
|
195
|
+
|
|
196
|
+
// marked v14 explicitly does NOT sanitize HTML — its README points readers at
|
|
197
|
+
// DOMPurify. The seed bootstrap injects INLINE_DOC via m.innerHTML AND
|
|
198
|
+
// re-creates <script> tags so they execute (intended for documents that ship
|
|
199
|
+
// JS), so any active content in the imported HTML runs on container open. An
|
|
200
|
+
// imported .md must not be able to add active content.
|
|
201
|
+
//
|
|
202
|
+
// Regex-based strip (not a parser) for mirror-symmetry with the browser. The
|
|
203
|
+
// rules below are deliberately conservative: when in doubt, strip. Marked's
|
|
204
|
+
// output is well-formed and uses double-quoted attributes, so the regex shape
|
|
205
|
+
// matches reliably. Edge cases (CDATA, malformed nesting) are over-stripped
|
|
206
|
+
// rather than under-stripped — acceptable for an import path.
|
|
207
|
+
const _ACTIVE_TAGS = ['script', 'iframe', 'object', 'embed', 'svg', 'math', 'link', 'meta', 'base'];

/**
 * Strip active content from imported HTML.
 *
 * Removes every element listed in _ACTIVE_TAGS, removes on*= event-handler
 * attributes, and replaces unsafe URL attribute values with "#".
 *
 * The tag and event-handler strips are repeated until the markup stops
 * changing. A single pass is not enough for a delete-based sanitizer: deleting
 * a match can splice the surrounding text into a NEW match, e.g.
 * `<scr<scr<script></script>ipt>ipt>` collapses to `<script>` and
 * `o onx=""nclick="…"` collapses to `onclick="…"`. Each pass only ever shortens
 * the string, so the loop terminates.
 *
 * @param {*} html  Markup to sanitize (coerced with String()).
 * @returns {{ html: string, warnings: string[] }} Sanitized markup plus one
 *   warning per stripped tag kind, one for event handlers, one for URLs.
 */
export function sanitizeImportedHtml(html) {
  const warnings = [];
  let s = String(html);

  // Build the per-tag patterns once; String.prototype.replace resets lastIndex
  // on global regexes, so reusing them across passes is safe.
  const strippers = _ACTIVE_TAGS.map(tag => ({
    tag,
    block: new RegExp('<' + tag + '\\b[^>]*>[\\s\\S]*?<\\/' + tag + '\\s*>', 'gi'),
    solo: new RegExp('<\\/?' + tag + '\\b[^>]*\\/?>', 'gi'),
  }));
  const strippedTags = new Set();
  let onCount = 0;

  let before;
  do {
    before = s;
    // 1) Drop active-content tags (open+close blocks, then self-closing/unmatched).
    for (const { tag, block, solo } of strippers) {
      const next = s.replace(block, '').replace(solo, '');
      if (next !== s) strippedTags.add(tag);
      s = next;
    }
    // 2) Drop on*= event-handler attributes from surviving elements.
    //    Match quoted (double/single) and unquoted-to-whitespace/> forms.
    s = s.replace(/\son[a-z]+\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, () => { onCount++; return ''; });
  } while (s !== before);

  // Report in _ACTIVE_TAGS order so the warning order is stable.
  for (const tag of _ACTIVE_TAGS) {
    if (strippedTags.has(tag)) warnings.push('imported md: stripped <' + tag + '> elements');
  }
  if (onCount) warnings.push('imported md: stripped ' + onCount + ' event-handler attribute(s)');

  // 3) Apply scheme allow-list to surviving URL-bearing attributes. Marked's
  //    output is double-quoted href/src only, but rwa clone feeds ARBITRARY web
  //    HTML here — single-quoted, unquoted, and other URL attributes (action/
  //    formaction/poster/xlink:href) are all common and must be checked too, or
  //    a `href='javascript:…'` survives into the file:// container as a live,
  //    clickable link. data:image/* stays allowed on src. This step replaces
  //    values instead of deleting text, so it cannot splice new markup together
  //    and runs once.
  let urlSkipped = 0;
  s = s.replace(
    /(\s)(xlink:href|formaction|href|src|action|poster)(\s*=\s*)("[^"]*"|'[^']*'|[^\s>]+)/gi,
    (full, ws, name, eq, rawVal) => {
      const lname = name.toLowerCase();
      const attr = (lname === 'src' || lname === 'poster') ? 'src' : 'href';
      const quoted = rawVal[0] === '"' || rawVal[0] === "'";
      const val = quoted ? rawVal.slice(1, -1) : rawVal;
      if (_attrIsSafe(attr, val)) return full;
      urlSkipped++;
      return ws + name + eq + '"#"';
    }
  );
  if (urlSkipped) warnings.push('imported md: neutralised ' + urlSkipped + ' unsafe URL attribute(s)');
  return { html: s, warnings };
}
|
|
246
|
+
|
|
247
|
+
// PDF → <article> using pdf.js text extraction. Each page's text items are
// grouped into lines by extractParagraphs, consecutive lines are joined into
// <p> blocks, and a paragraph break is forced at every page boundary.
// Rejects with an Error carrying exitCode 2 when the file cannot be opened
// (including password protection), a page cannot be read, or no text is found.
async function convertPdf(bytes) {
  const fail = (message, cause) => {
    const e = cause === undefined ? new Error(message) : new Error(message, { cause });
    e.exitCode = 2;
    return e;
  };

  // pdfjs explicitly rejects Node's Buffer (despite Buffer extending Uint8Array)
  // and wants a plain Uint8Array view.
  const src = bytes instanceof Uint8Array ? bytes : Buffer.from(bytes);
  const data = new Uint8Array(src.buffer, src.byteOffset, src.byteLength);
  let doc;
  try {
    doc = await pdfjs.getDocument({ data, isEvalSupported: false }).promise;
  } catch (err) {
    if (err && err.name === 'PasswordException') {
      throw fail('pdf: file is password-protected');
    }
    throw fail(`pdf: ${err && err.message ? err.message : String(err)}`);
  }

  // Extracted text is untrusted: entity-encode it before wrapping it in <p>,
  // otherwise text such as `<script>` in the PDF becomes live markup. `&` goes
  // first so the entities produced for `<` and `>` are not re-encoded.
  const escape = s => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');

  const paragraphs = [];
  try {
    for (let p = 1; p <= doc.numPages; p++) {
      const page = await doc.getPage(p);
      const tc = await page.getTextContent();
      for (const line of extractParagraphs(tc.items)) paragraphs.push(line);
      paragraphs.push(null); // page break: forces flush of next paragraph
    }
  } catch (err) {
    // Same `pdf:` prefix and exit code as the open failure above.
    throw fail(`pdf: ${err && err.message ? err.message : String(err)}`, err);
  } finally {
    // Release the document even when a page fails; cleanup is best-effort.
    await doc.destroy().catch(() => {});
  }

  const blocks = [];
  let buf = [];
  const flush = () => {
    const joined = buf.join(' ').replace(/\s+/g, ' ').trim();
    if (joined) blocks.push(`<p>${escape(joined)}</p>`);
    buf = [];
  };
  for (const line of paragraphs) {
    if (line === null || line === '') { flush(); continue; }
    buf.push(line);
  }
  flush();

  if (blocks.length === 0) {
    throw fail('pdf: no extractable text — this looks like a scanned/image PDF; OCR is not supported');
  }
  return {
    html: `<article>\n${blocks.join('\n')}\n</article>`,
    warnings: ['pdf: layout reconstructed by heuristics — review headings/lists manually'],
  };
}
|
|
299
|
+
|
|
300
|
+
// Group pdf.js text items into paragraph-shaped lines.
|
|
301
|
+
//
|
|
302
|
+
// Two non-obvious problems this handles:
|
|
303
|
+
// (1) Adjacent items inside a word: pdf.js returns text per font run, so a
|
|
304
|
+
// word like "Aufwände" comes out as ["Aufw", "ä", "nde"] when the umlaut
|
|
305
|
+
// glyph lives in a different font table from the ASCII letters. Joining
|
|
306
|
+
// with ' ' produces "Aufw ä nde" — wrong. We concat directly and only
|
|
307
|
+
// synthesize a space when there's a real positional x-gap.
|
|
308
|
+
// (2) Stacked short lines: an address block (Name / Street / City) has small
|
|
309
|
+
// y-gaps that fit inside the within-paragraph threshold, so naive logic
|
|
310
|
+
// would join them into one paragraph "Name Street City". We additionally
|
|
311
|
+
// break paragraphs when the *previous* line ended significantly short of
|
|
312
|
+
// the page's typical right margin (a heuristic for "hard line break").
|
|
313
|
+
//
|
|
314
|
+
// Returns an array of strings; '' marks a paragraph break.
|
|
315
|
+
/**
 * Turn one page's pdf.js text items into an ordered list of line strings,
 * with '' inserted wherever a paragraph break is detected.
 *
 * @param {Array<{str: string, transform?: number[], width?: number, height?: number}>} items
 * @returns {string[]} line texts in top-to-bottom order; '' marks a break.
 */
function extractParagraphs(items) {
  if (!items || items.length === 0) return [];

  // Reduce each item to text + position (transform[4]/[5]) + size.
  const runs = items.map((item) => {
    const t = item.transform;
    return {
      str: item.str,
      y: t ? t[5] : 0,
      x: t ? t[4] : 0,
      w: item.width || 0,
      h: item.height || (t ? Math.abs(t[3]) : 0) || 12,
    };
  });

  // Top-to-bottom (y descending in PDF coordinates), then left-to-right, so
  // the same-y grouping below does not depend on content-stream order.
  runs.sort((a, b) => b.y - a.y || a.x - b.x);

  // Bucket runs into visual lines: a run joins the current line when its y is
  // within half that line's height; the line's y becomes the running midpoint.
  const visualLines = [];
  for (const run of runs) {
    const open = visualLines[visualLines.length - 1];
    if (open && Math.abs(run.y - open.y) <= open.h * 0.5) {
      open.parts.push(run);
      open.y = (open.y + run.y) / 2;
    } else {
      visualLines.push({ y: run.y, h: run.h, parts: [run] });
    }
  }

  // Concatenate each line's runs. A space is synthesised only across a real
  // horizontal gap (> 2 units) and only when neither side already has one.
  const renderLine = (line) => {
    const { parts } = line;
    parts.sort((a, b) => a.x - b.x);
    let text = '';
    parts.forEach((part, i) => {
      if (i > 0) {
        const before = parts[i - 1];
        const gap = part.x - (before.x + before.w);
        const tail = text.slice(-1);
        const head = part.str.charAt(0);
        const needsSpace = gap > 2 && !/\s/.test(tail) && !/\s/.test(head);
        if (needsSpace) text += ' ';
      }
      text += part.str;
    });
    const left = parts.length ? Math.min(...parts.map((p) => p.x)) : 0;
    const right = parts.length ? Math.max(...parts.map((p) => p.x + p.w)) : 0;
    return { text: text.replace(/\s+/g, ' ').trim(), y: line.y, h: line.h, left, right };
  };
  // Whitespace-only lines are dropped before any further processing.
  const visible = visualLines.map(renderLine).filter((line) => line.text);

  // "Typical" right margin: the 90th-percentile right edge of non-empty lines.
  const rights = visible.map((line) => line.right).sort((a, b) => a - b);
  const margin = rights.length ? rights[Math.floor(rights.length * 0.9)] : 0;

  const out = [];
  visible.forEach((line, i) => {
    if (i > 0) {
      const prev = visible[i - 1];
      // Break on a large vertical jump, …
      const yJump = Math.abs(prev.y - line.y) > prev.h * 1.5;
      // … when the previous line stopped well short of the margin (hard break), …
      const prevShortOfMargin = margin > 0 && (margin - prev.right) > prev.h * 1.5;
      // … or when the left edge moved by more than a line height.
      const leftJump = Math.abs(prev.left - line.left) > line.h;
      if (yJump || prevShortOfMargin || leftJump) out.push('');
    }
    out.push(line.text);
  });
  return out;
}
|
package/src/ls.mjs
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// `rwa ls` — collection-scale self-description. Where inspectDoc answers "what is
|
|
2
|
+
// THIS file?", listRewritables answers "what are all these?": it resolves a set
|
|
3
|
+
// of paths (files, directories, or — by default — the current directory) to
|
|
4
|
+
// candidate .html files and reports each one's self-description/1 projection,
|
|
5
|
+
// flagging non-rewritables. The scan is lenient: a missing path or a
|
|
6
|
+
// non-rewritable is a row in the result, never a thrown error — so one bad entry
|
|
7
|
+
// can't abort the inventory of a whole folder.
|
|
8
|
+
|
|
9
|
+
import { readdir, stat } from 'node:fs/promises';
|
|
10
|
+
import { join } from 'node:path';
|
|
11
|
+
import { inspectDoc } from './doc.mjs';
|
|
12
|
+
|
|
13
|
+
const HTML_RE = /\.html?$/i;

/**
 * Flatten the given paths into an ordered candidate list.
 *
 * - a directory yields its direct children whose names end in .html/.htm
 *   (case-insensitive), sorted by name and joined onto the directory path;
 * - any other path that can be stat'd yields itself;
 * - a path that cannot be stat'd, or a directory that cannot be listed, yields
 *   `{ file, missing: true }` so the caller can report it.
 *
 * An empty or absent `paths` scans the current directory ('.').
 *
 * @param {string[]} paths
 * @returns {Promise<Array<{file:string, missing?:boolean}>>}
 */
export async function resolveTargets(paths) {
  // Resolve to undefined instead of throwing, so failures become rows.
  const attempt = async (fn) => {
    try { return await fn(); } catch { return undefined; }
  };

  const inputs = paths?.length ? paths : ['.'];
  const targets = [];
  for (const input of inputs) {
    const info = await attempt(() => stat(input));
    if (!info) {
      targets.push({ file: input, missing: true });
      continue;
    }
    if (!info.isDirectory()) {
      targets.push({ file: input });
      continue;
    }
    const names = await attempt(() => readdir(input));
    if (!names) {
      targets.push({ file: input, missing: true });
      continue;
    }
    const htmlNames = names.filter((n) => HTML_RE.test(n)).sort();
    targets.push(...htmlNames.map((name) => ({ file: join(input, name) })));
  }
  return targets;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Resolve `paths` to candidate files and classify each one. Every candidate
 * produces exactly one row, in candidate order:
 *   { file, status:'rewritable', self }     — inspectDoc succeeded; `self` is its `.self`
 *   { file, status:'not_a_rewritable' }     — inspectDoc threw with subcode 'not_a_rewritable'
 *   { file, status:'error', reason }        — 'not_found' for missing candidates, otherwise
 *                                             the thrown error's subcode or 'read_error'
 *
 * @param {string[]} paths
 * @returns {Promise<Array<object>>}
 */
export async function listRewritables(paths) {
  // Classify one existing candidate; any failure becomes a row, never a throw.
  const classify = async (file) => {
    try {
      const { self } = await inspectDoc(file);
      return { file, status: 'rewritable', self };
    } catch (e) {
      const subcode = e && e.subcode;
      return subcode === 'not_a_rewritable'
        ? { file, status: 'not_a_rewritable' }
        : { file, status: 'error', reason: subcode || 'read_error' };
    }
  };

  const rows = [];
  // Sequential on purpose: rows come out in candidate order, one file at a time.
  for (const { file, missing } of await resolveTargets(paths)) {
    rows.push(missing ? { file, status: 'error', reason: 'not_found' } : await classify(file));
  }
  return rows;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Format classified rows as plain text. When at least one rewritable exists,
 * a KIND / TITLE / AFFORDANCES / FILE table is emitted (the first three columns
 * padded to their widest cell), followed by a blank line. The final line is a
 * summary: the rewritable count, then — only if present — the other files and
 * the errors, each listed by name.
 *
 * @param {Array<object>} rows  rows as produced by listRewritables
 * @returns {string}
 */
export function formatRows(rows) {
  const withStatus = (status) => rows.filter((r) => r.status === status);
  const rewritables = withStatus('rewritable');
  const plain = withStatus('not_a_rewritable');
  const failed = withStatus('error');

  const out = [];
  if (rewritables.length) {
    const padded = ['kind', 'title', 'affordances'];
    const header = { kind: 'KIND', title: 'TITLE', affordances: 'AFFORDANCES', file: 'FILE' };
    const table = rewritables.map(({ file, self }) => ({
      kind: self.kind || '',
      title: self.title || '—',
      affordances: self.affordances.length ? self.affordances.map((a) => a.kind).join(',') : '—',
      file,
    }));
    // Column width = widest of the header label and every cell in that column.
    const widths = Object.fromEntries(
      padded.map((col) => [col, Math.max(header[col].length, ...table.map((c) => c[col].length))])
    );
    const render = (cell) => [...padded.map((col) => cell[col].padEnd(widths[col])), cell.file].join(' ');
    out.push(render(header), ...table.map(render), '');
  }

  const count = rewritables.length;
  const summary = [`${count} rewritable${count === 1 ? '' : 's'}`];
  if (plain.length) {
    summary.push(`${plain.length} other (${plain.map((r) => r.file).join(', ')})`);
  }
  if (failed.length) {
    summary.push(`${failed.length} error (${failed.map((r) => `${r.file}: ${r.reason}`).join(', ')})`);
  }
  out.push(summary.join(', '));
  return out.join('\n');
}
|