agent-reader 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,329 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import path from 'node:path';
3
+ import { createOutputDir } from '../utils/output.js';
4
+ import { normalizeOpenMode } from '../utils/preferences.js';
5
+ import { renderMarkdown } from './renderer.js';
6
+ import { exportDOCX, exportPDF } from './exporter.js';
7
+ import { createSlideshow } from './slideshow.js';
8
+
9
// Extensions (lowercase, leading dot) that detectTargetKind classifies as 'markdown'.
const MARKDOWN_EXTENSIONS = new Set(['.md', '.markdown', '.mdown', '.mdx']);
// Extensions treated as images, both for single files and when scanning a directory.
const IMAGE_EXTENSIONS = new Set(['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp']);
// Extensions classified as 'openable_file'; openTarget reports these as html/pdf/docx.
const OPENABLE_EXTENSIONS = new Set(['.html', '.htm', '.pdf', '.docx']);
12
+
13
/**
 * Returns the extension of a path in lowercase, including the leading dot.
 *
 * @param {string} targetPath - File path or bare file name.
 * @returns {string} Lowercased extension, or '' when the path has none.
 */
function extOf(targetPath) {
  const rawExtension = path.extname(targetPath);
  return rawExtension.toLowerCase();
}
16
+
17
/**
 * Stats a path and classifies it for openTarget.
 *
 * Directories become 'image_directory' when at least one direct child is a
 * regular file with an image extension, otherwise 'directory'. Regular files
 * are classified by extension as 'markdown', 'openable_file', 'image_file' or
 * plain 'file'. Anything that is neither a file nor a directory is 'unknown'.
 *
 * @param {string} targetPath - Path to inspect.
 * @returns {Promise<{kind: string, stat: import('node:fs').Stats, extension: string}>}
 *   Classification, the stat result, and the lowercased extension ('' for
 *   directories and unknown targets).
 * @throws Rejects with the underlying fs error when the path cannot be stat'ed
 *   or the directory cannot be read.
 */
async function detectTargetKind(targetPath) {
  const stat = await fs.stat(targetPath);

  if (stat.isDirectory()) {
    const entries = await fs.readdir(targetPath, { withFileTypes: true });

    // Only direct children are inspected; stop at the first image found.
    let kind = 'directory';
    for (const entry of entries) {
      if (entry.isFile() && IMAGE_EXTENSIONS.has(extOf(entry.name))) {
        kind = 'image_directory';
        break;
      }
    }

    return { kind, stat, extension: '' };
  }

  if (!stat.isFile()) {
    return { kind: 'unknown', stat, extension: '' };
  }

  const extension = extOf(targetPath);

  // Precedence matters only if the sets ever overlap: markdown, openable, image.
  let kind = 'file';
  if (MARKDOWN_EXTENSIONS.has(extension)) {
    kind = 'markdown';
  } else if (OPENABLE_EXTENSIONS.has(extension)) {
    kind = 'openable_file';
  } else if (IMAGE_EXTENSIONS.has(extension)) {
    kind = 'image_file';
  }

  return { kind, stat, extension };
}
51
+
52
/**
 * Base64-encodes a buffer unless the encoded form would exceed a byte limit.
 *
 * The limit is checked before encoding: base64 always emits 4 ASCII
 * characters per started group of 3 input bytes, so the encoded size is known
 * up front and an oversized buffer never allocates a throwaway string.
 *
 * @param {Buffer} buffer - Raw bytes to encode.
 * @param {number} maxContentBytes - Maximum allowed size of the encoded text, in bytes.
 * @param {string[]} warnings - Mutated: receives 'content_too_large' when the limit is exceeded.
 * @returns {string|null} The base64 text, or null when it would exceed the limit.
 */
function toBase64WithLimit(buffer, maxContentBytes, warnings) {
  // Padded base64 length; every character is ASCII, so this is also the UTF-8 byte length.
  const encodedByteLength = Math.ceil(buffer.length / 3) * 4;
  if (encodedByteLength > maxContentBytes) {
    warnings.push('content_too_large');
    return null;
  }
  return buffer.toString('base64');
}
60
+
61
/**
 * Passes text through unless its UTF-8 size exceeds a byte limit.
 *
 * @param {string} text - Text to check.
 * @param {number} maxContentBytes - Maximum allowed UTF-8 size in bytes.
 * @param {string[]} warnings - Mutated: receives 'content_too_large' when the limit is exceeded.
 * @returns {string|null} The unchanged text, or null when it is too large.
 */
function toTextWithLimit(text, maxContentBytes, warnings) {
  const byteSize = Buffer.byteLength(text, 'utf8');
  const tooLarge = byteSize > maxContentBytes;

  if (!tooLarge) {
    return text;
  }

  warnings.push('content_too_large');
  return null;
}
68
+
69
/**
 * Maps a lowercased file extension to the format label used in results.
 *
 * @param {string} extension - Extension including the leading dot.
 * @returns {'pdf'|'docx'|'html'} 'pdf' or 'docx' for those extensions; 'html' for everything else.
 */
function fileFormatFromExtension(extension) {
  switch (extension) {
    case '.pdf':
      return 'pdf';
    case '.docx':
      return 'docx';
    default:
      return 'html';
  }
}
74
+
75
/**
 * Returns a shallow copy of a result payload whose `warnings` list has
 * duplicates removed (first occurrence wins, order preserved).
 *
 * @param {object} payload - Result object; `warnings` may be missing or falsy.
 * @returns {object} New object with the same fields and a fresh, de-duplicated `warnings` array.
 */
function withUniqueWarnings(payload) {
  const result = { ...payload };
  // A Set keeps insertion order, so the first occurrence of each warning stays in place.
  result.warnings = Array.from(new Set(payload.warnings || []));
  return result;
}
81
+
82
/**
 * Opens a filesystem target and returns a result describing the artifact to
 * show: either a path on disk or, when `returnContent` is set and the content
 * fits within `maxContentBytes`, the content itself.
 *
 * Handling by target kind (as reported by detectTargetKind):
 * - directory without images: throws.
 * - html/pdf/docx, image and other regular files: returned as-is ('direct').
 * - directory with images: an HTML slideshow is generated ('ppt').
 * - markdown: exported to DOCX ('word'), PDF ('pdf') or rendered HTML ('web').
 * - anything else: throws.
 *
 * @param {string} targetPath - File or directory to open; resolved to an absolute path.
 * @param {object} [options]
 * @param {string} [options.mode='auto'] - Requested mode, normalized via normalizeOpenMode.
 * @param {string} [options.defaultMode='web'] - Mode used when the requested mode normalizes to 'auto'.
 * @param {string} [options.theme='light'] - Theme forwarded to the slideshow / markdown renderer.
 * @param {string} [options.outDir] - Forwarded to createOutputDir.
 * @param {*} [options.autoPlay] - Forwarded to createSlideshow.
 * @param {string} [options.pageSize='A4'] - Forwarded to exportPDF.
 * @param {boolean} [options.fetchRemote=true] - Forwarded to renderMarkdown.
 * @param {boolean} [options.returnContent=false] - Return content inline instead of a path when it fits.
 * @param {number} [options.maxContentBytes=52428800] - Size limit for inline content (text bytes or base64 bytes).
 * @returns {Promise<object>} Result with `format`, `size`, de-duplicated `warnings`,
 *   `resolved_mode`, either `path` or `content_data`, and `output_dir` for generated output.
 * @throws {Error} When the target is a directory without supported images or an
 *   unsupported target type; filesystem and exporter errors propagate unchanged.
 */
export async function openTarget(targetPath, options = {}) {
  const resolvedPath = path.resolve(targetPath);
  // detectTargetKind stats the path, so a missing target rejects here.
  const target = await detectTargetKind(resolvedPath);

  // Shared, mutable list: helpers below push into it; it is de-duplicated on return.
  const warnings = [];
  const {
    mode = 'auto',
    defaultMode = 'web',
    theme = 'light',
    outDir,
    autoPlay,
    pageSize = 'A4',
    fetchRemote = true,
    returnContent = false,
    maxContentBytes = 50 * 1024 * 1024,
  } = options;

  // An explicit mode wins; 'auto' defers to the configured default mode.
  const requestedMode = normalizeOpenMode(mode, 'auto');
  const preferredMode = requestedMode === 'auto'
    ? normalizeOpenMode(defaultMode, 'web')
    : requestedMode;

  if (target.kind === 'directory') {
    throw new Error(`directory has no supported images: ${resolvedPath}`);
  }

  if (target.kind === 'openable_file' || target.kind === 'file' || target.kind === 'image_file') {
    // Only html/pdf/docx get a concrete format; everything else is an opaque 'file'.
    const format = target.kind === 'openable_file'
      ? fileFormatFromExtension(target.extension)
      : 'file';

    // Opaque files are never inlined, even when returnContent is requested.
    if (!returnContent || format === 'file') {
      return withUniqueWarnings({
        path: resolvedPath,
        format,
        size: target.stat.size,
        warnings,
        resolved_mode: 'direct',
      });
    }

    const fileBuffer = await fs.readFile(resolvedPath);
    if (format === 'html') {
      // HTML is returned as text; size is the UTF-8 byte length of the decoded text.
      const text = toTextWithLimit(fileBuffer.toString('utf8'), maxContentBytes, warnings);
      if (text !== null) {
        return withUniqueWarnings({
          content_data: text,
          format,
          size: Buffer.byteLength(text, 'utf8'),
          warnings,
          resolved_mode: 'direct',
        });
      }
    } else {
      // pdf/docx are returned base64-encoded; size is the raw (unencoded) byte count.
      const encoded = toBase64WithLimit(fileBuffer, maxContentBytes, warnings);
      if (encoded !== null) {
        return withUniqueWarnings({
          content_data: encoded,
          format,
          size: fileBuffer.byteLength,
          warnings,
          resolved_mode: 'direct',
        });
      }
    }

    // Content exceeded the limit (the helper already recorded a warning): fall back to the path.
    return withUniqueWarnings({
      path: resolvedPath,
      format,
      size: target.stat.size,
      warnings,
      resolved_mode: 'direct',
    });
  }

  if (target.kind === 'image_directory') {
    const outputDir = await createOutputDir(path.basename(resolvedPath), outDir);

    // A slideshow is always produced for image directories; other modes are only flagged.
    if (!['ppt', 'auto', 'web'].includes(preferredMode)) {
      warnings.push(`mode_${preferredMode}_fallback_ppt`);
    }

    const slideshow = await createSlideshow(resolvedPath, {
      autoPlay,
      inlineAll: returnContent,
      outDir: outputDir,
      theme,
    });
    warnings.push(...slideshow.warnings);

    // The HTML file is written even when the content is also returned inline.
    const htmlPath = path.join(outputDir, 'slideshow.html');
    await fs.writeFile(htmlPath, slideshow.html, 'utf8');
    const size = (await fs.stat(htmlPath)).size;

    if (returnContent) {
      const text = toTextWithLimit(slideshow.html, maxContentBytes, warnings);
      if (text !== null) {
        return withUniqueWarnings({
          content_data: text,
          format: 'html',
          size: Buffer.byteLength(text, 'utf8'),
          warnings,
          resolved_mode: 'ppt',
          output_dir: outputDir,
        });
      }
    }

    return withUniqueWarnings({
      path: htmlPath,
      format: 'html',
      size,
      warnings,
      resolved_mode: 'ppt',
      output_dir: outputDir,
    });
  }

  // Everything handled above has returned or thrown; only markdown may continue.
  if (target.kind !== 'markdown') {
    throw new Error(`unsupported target type: ${resolvedPath}`);
  }

  const markdown = await fs.readFile(resolvedPath, 'utf8');
  const baseDir = path.dirname(resolvedPath);
  // File name without extension; reused for the output directory and generated file names.
  const name = path.parse(resolvedPath).name;
  const outputDir = await createOutputDir(name, outDir);

  if (preferredMode === 'word') {
    const docx = await exportDOCX(markdown, {
      baseDir,
      outDir: outputDir,
      fileName: `${name}.docx`,
    });
    warnings.push(...docx.warnings);

    if (returnContent) {
      const buffer = await fs.readFile(docx.docxPath);
      const encoded = toBase64WithLimit(buffer, maxContentBytes, warnings);
      if (encoded !== null) {
        return withUniqueWarnings({
          content_data: encoded,
          format: 'docx',
          size: buffer.byteLength,
          warnings,
          resolved_mode: 'word',
          output_dir: outputDir,
        });
      }
    }

    return withUniqueWarnings({
      path: docx.docxPath,
      format: 'docx',
      size: docx.size,
      warnings,
      resolved_mode: 'word',
      output_dir: outputDir,
    });
  }

  if (preferredMode === 'pdf') {
    // NOTE(review): inlineAll is forced on here regardless of returnContent,
    // presumably so the PDF step needs no external asset files — confirm.
    const rendered = await renderMarkdown(markdown, {
      theme,
      baseDir,
      inlineAll: true,
      fetchRemote,
      outDir: outputDir,
      title: name,
    });
    warnings.push(...rendered.warnings);

    // The intermediate HTML is kept on disk and its path is handed to exportPDF.
    const htmlPath = path.join(outputDir, `${name}.html`);
    await fs.writeFile(htmlPath, rendered.html, 'utf8');

    const pdf = await exportPDF(rendered.html, {
      pageSize,
      outDir: outputDir,
      fileName: `${name}.pdf`,
      htmlPath,
    });
    warnings.push(...pdf.warnings);

    if (returnContent) {
      const buffer = await fs.readFile(pdf.pdfPath);
      const encoded = toBase64WithLimit(buffer, maxContentBytes, warnings);
      if (encoded !== null) {
        return withUniqueWarnings({
          content_data: encoded,
          format: 'pdf',
          size: buffer.byteLength,
          warnings,
          resolved_mode: 'pdf',
          output_dir: outputDir,
        });
      }
    }

    return withUniqueWarnings({
      path: pdf.pdfPath,
      format: 'pdf',
      size: pdf.size,
      warnings,
      resolved_mode: 'pdf',
      output_dir: outputDir,
    });
  }

  // Slideshow mode does not apply to markdown: warn and fall through to web rendering.
  if (preferredMode === 'ppt') {
    warnings.push('mode_ppt_for_markdown_fallback_web');
  }

  const rendered = await renderMarkdown(markdown, {
    theme,
    baseDir,
    inlineAll: returnContent,
    fetchRemote,
    outDir: outputDir,
    title: name,
  });
  warnings.push(...rendered.warnings);

  const htmlPath = path.join(outputDir, `${name}.html`);
  await fs.writeFile(htmlPath, rendered.html, 'utf8');
  const htmlSize = (await fs.stat(htmlPath)).size;

  if (returnContent) {
    const text = toTextWithLimit(rendered.html, maxContentBytes, warnings);
    if (text !== null) {
      return withUniqueWarnings({
        content_data: text,
        format: 'html',
        size: Buffer.byteLength(text, 'utf8'),
        warnings,
        resolved_mode: 'web',
        output_dir: outputDir,
      });
    }
  }

  return withUniqueWarnings({
    path: htmlPath,
    format: 'html',
    size: htmlSize,
    warnings,
    resolved_mode: 'web',
    output_dir: outputDir,
  });
}
@@ -0,0 +1,235 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import path from 'node:path';
3
+ import { createRequire } from 'node:module';
4
+ import { fileURLToPath } from 'node:url';
5
+ import MarkdownIt from 'markdown-it';
6
+ import hljs from 'highlight.js';
7
+ import { processAssets } from './assets.js';
8
+ import { sanitize } from './sanitizer.js';
9
+
10
// ES modules have no built-in require(); create one so require.resolve can locate package files.
const require = createRequire(import.meta.url);
// Directory containing this module (ES-module stand-in for CommonJS __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Directory that holds the built-in theme stylesheets (light/dark/print).
const THEME_DIR = path.join(__dirname, 'themes');
// HTML document template whose placeholders are filled in by fillTemplate.
const TEMPLATE_PATH = path.resolve(__dirname, '../templates/document.html');

// Absolute paths of the stylesheets shipped by the installed packages.
const GITHUB_MARKDOWN_CSS_PATH = require.resolve('github-markdown-css/github-markdown.css');
const HIGHLIGHT_CSS_PATH = require.resolve('highlight.js/styles/github.css');

// Extra CSS merged into every document: gives headings inside .markdown-body a scroll margin.
const TOC_BASE_CSS = `
.markdown-body :is(h1, h2, h3, h4, h5, h6) {
scroll-margin-top: 24px;
}
`;
23
+
24
/**
 * Escapes a value for safe inclusion in HTML text or a quoted attribute.
 *
 * @param {*} value - Value to escape; coerced with String().
 * @returns {string} Text with &, <, >, " and ' replaced by HTML entities.
 */
function escapeHtml(value) {
  return String(value)
    // Ampersand must be escaped first so the entities added below are not re-escaped.
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;')
    .replaceAll('"', '&quot;')
    .replaceAll("'", '&#39;');
}
32
+
33
/**
 * Turns heading text into an anchor slug: lowercased, trimmed, stripped of
 * everything except Unicode letters, digits, whitespace and hyphens, with
 * whitespace runs turned into hyphens and hyphen runs collapsed.
 *
 * @param {*} value - Source text; coerced with String().
 * @returns {string} The slug, or 'section' when nothing usable remains.
 */
function slugify(value) {
  const lowered = String(value).toLowerCase().trim();
  const lettersOnly = lowered.replace(/[^\p{L}\p{N}\s-]/gu, '');
  const hyphenated = lettersOnly.replace(/\s+/g, '-');
  const collapsed = hyphenated.replace(/-+/g, '-');

  if (collapsed) {
    return collapsed;
  }
  return 'section';
}
43
+
44
/**
 * Extracts plain text from an inline token.
 *
 * Without children the token's own `content` is used. With children, only
 * 'text', 'code_inline' and 'image' children contribute (images fall back to
 * their `alt` attribute); every other child type contributes nothing.
 *
 * @param {object} [inlineToken] - Inline token, possibly with a `children` array.
 * @returns {string} Concatenated plain text, or '' when there is none.
 */
function inlineTokenToText(inlineToken) {
  const children = inlineToken?.children;
  if (!children?.length) {
    return inlineToken?.content || '';
  }

  const textOf = (child) => {
    switch (child.type) {
      case 'text':
      case 'code_inline':
        return child.content || '';
      case 'image':
        return child.content || child.attrGet('alt') || '';
      default:
        return '';
    }
  };

  return children.reduce((text, child) => text + textOf(child), '');
}
61
+
62
/**
 * Builds a MarkdownIt instance with raw HTML, linkify and typographer enabled,
 * highlight.js code highlighting, and a heading rule that assigns unique `id`
 * attributes while collecting table-of-contents entries.
 *
 * During rendering the heading rule stores its state on the render `env`:
 * `env.__toc` (array of `{ id, text, level }`) and `env.__slugMap` (slug -> count).
 *
 * @returns {MarkdownIt} Configured renderer.
 */
function createMarkdownRenderer() {
  const md = new MarkdownIt({
    html: true,
    linkify: true,
    typographer: true,
    highlight: (code, language) => {
      // Unknown or missing languages are emitted as escaped, un-highlighted code.
      if (!language || !hljs.getLanguage(language)) {
        return `<pre><code class="hljs">${md.utils.escapeHtml(code)}</code></pre>`;
      }

      const { value } = hljs.highlight(code, { language });
      return `<pre><code class="hljs language-${language}">${value}</code></pre>`;
    },
  });

  const renderTokenFallback = (tokens, idx, options, env, self) => self.renderToken(tokens, idx, options);
  const renderHeadingOpen = md.renderer.rules.heading_open || renderTokenFallback;

  md.renderer.rules.heading_open = (tokens, idx, options, env, self) => {
    const heading = tokens[idx];
    // The inline token right after heading_open carries the heading's content.
    const label = inlineTokenToText(tokens[idx + 1]).trim() || `Section ${idx + 1}`;
    const level = Number(heading.tag.slice(1)) || 1;

    env.__toc ||= [];
    env.__slugMap ||= new Map();

    // Repeated slugs get a numeric suffix starting at 2: "intro", "intro-2", ...
    const baseSlug = slugify(label);
    const occurrence = (env.__slugMap.get(baseSlug) || 0) + 1;
    env.__slugMap.set(baseSlug, occurrence);
    const id = occurrence === 1 ? baseSlug : `${baseSlug}-${occurrence}`;

    heading.attrSet('id', id);
    env.__toc.push({ id, text: label, level });

    return renderHeadingOpen(tokens, idx, options, env, self);
  };

  return md;
}
112
+
113
/**
 * Converts a flat, document-ordered list of headings into a nested tree.
 *
 * Each heading becomes a child of the nearest preceding heading with a
 * strictly smaller level; levels are clamped to the range 1..6.
 *
 * @param {Array<{id: string, text: string, level: number}>} tocItems - Headings in document order.
 * @returns {Array<{id: string, text: string, level: number, children: Array}>} Top-level nodes.
 */
function buildTocTree(tocItems) {
  const root = { level: 0, children: [] };
  // Chain of currently open ancestors, root first.
  const ancestors = [root];
  const clampLevel = (raw) => Math.min(6, Math.max(1, Number(raw) || 1));

  for (const { id, text, level } of tocItems) {
    const node = { id, text, level: clampLevel(level), children: [] };

    // Close every open heading that is at the same level or deeper.
    let depth = ancestors.length;
    while (depth > 1 && ancestors[depth - 1].level >= node.level) {
      depth -= 1;
    }
    ancestors.length = depth;

    ancestors[depth - 1].children.push(node);
    ancestors.push(node);
  }

  return root.children;
}
136
+
137
/**
 * Renders table-of-contents nodes as nested <ul> lists of anchor links.
 *
 * @param {Array<{id: string, text: string, level: number, children: Array}>} nodes - Nodes at one nesting level.
 * @returns {string} `<ul>` markup for the nodes and their descendants, or '' when there are none.
 */
function renderTocNodes(nodes) {
  if (!nodes.length) {
    return '';
  }

  let items = '';
  for (const node of nodes) {
    const label = escapeHtml(node.text);
    const anchorId = escapeHtml(node.id);
    const nested = renderTocNodes(node.children);
    items += `<li><a class="toc-link toc-level-${node.level}" href="#${anchorId}" data-toc-id="${anchorId}">${label}</a>${nested}</li>`;
  }

  return `<ul>${items}</ul>`;
}
153
+
154
/**
 * Produces the table-of-contents markup for a document.
 *
 * @param {Array<{id: string, text: string, level: number}>} tocItems - Headings in document order.
 * @returns {string} Nested list markup, or a placeholder paragraph when there are no headings.
 */
function buildTocHtml(tocItems) {
  return tocItems.length
    ? renderTocNodes(buildTocTree(tocItems))
    : '<p class="toc-empty">当前文档没有标题结构</p>';
}
162
+
163
/**
 * Loads and merges all stylesheets for a document into one <style> element.
 *
 * `theme` is either a built-in name ('light', 'dark', 'print') or a path to a
 * custom CSS file. A custom theme is appended after the built-in light theme.
 * Merge order: github-markdown-css, highlight.js, built-in theme, TOC base
 * CSS, custom theme.
 *
 * @param {string} [theme] - Built-in theme name or path to a CSS file; falsy means 'light'.
 * @returns {Promise<string>} A `<style>…</style>` string.
 * @throws Rejects with the fs error when any stylesheet cannot be read.
 */
async function loadStyles(theme) {
  const themeName = theme || 'light';

  const builtInFiles = new Map([
    ['light', 'light.css'],
    ['dark', 'dark.css'],
    ['print', 'print.css'],
  ]);
  const isBuiltIn = builtInFiles.has(themeName);

  // Custom themes are layered on top of the light theme.
  const builtInThemePath = path.join(THEME_DIR, isBuiltIn ? builtInFiles.get(themeName) : 'light.css');
  const customThemePath = isBuiltIn ? null : path.resolve(themeName);

  const pendingReads = [
    fs.readFile(builtInThemePath, 'utf8'),
    fs.readFile(GITHUB_MARKDOWN_CSS_PATH, 'utf8'),
    fs.readFile(HIGHLIGHT_CSS_PATH, 'utf8'),
    customThemePath ? fs.readFile(customThemePath, 'utf8') : Promise.resolve(''),
  ];
  const [builtInCss, githubCss, highlightCss, customCss = ''] = await Promise.all(pendingReads);

  const sheets = [githubCss, highlightCss, builtInCss, TOC_BASE_CSS, customCss];
  return `<style>${sheets.join('\n')}</style>`;
}
186
+
187
/**
 * Fills the document template's placeholders with generated markup.
 *
 * The first occurrence of each of `{{CSS_INJECTION}}`, `{{TOC_CONTENT}}` and
 * `{{CONTENT}}` is substituted. Substitution happens in a single pass with a
 * function replacer, so:
 * - `$` sequences in the inserted text (`$$`, `$&`, `` $` ``, `$'`) are kept
 *   literally instead of being interpreted as replacement patterns, and
 * - placeholder-looking text inside an inserted value (for example a heading
 *   titled `{{CONTENT}}`) is never substituted itself.
 *
 * @param {string} template - Template text containing the placeholders.
 * @param {{cssInjection: string, tocContent: string, content: string}} params - Values to insert.
 * @returns {string} The filled template.
 */
function fillTemplate(template, params) {
  const replacements = new Map([
    ['{{CSS_INJECTION}}', params.cssInjection],
    ['{{TOC_CONTENT}}', params.tocContent],
    ['{{CONTENT}}', params.content],
  ]);
  // Only the first occurrence of each placeholder is filled.
  const consumed = new Set();

  return template.replace(/\{\{(?:CSS_INJECTION|TOC_CONTENT|CONTENT)\}\}/g, (placeholder) => {
    if (consumed.has(placeholder)) {
      return placeholder;
    }
    consumed.add(placeholder);
    return String(replacements.get(placeholder));
  });
}
193
+
194
/**
 * Renders a Markdown string into a complete, styled HTML document.
 *
 * Pipeline: Markdown -> HTML, sanitize, asset processing, table-of-contents
 * generation, then template filling with the merged stylesheets.
 *
 * @param {string} markdownString - Markdown source.
 * @param {object} [options]
 * @param {string} [options.theme='light'] - Built-in theme name or path to a CSS file.
 * @param {string} [options.baseDir] - Forwarded to processAssets.
 * @param {boolean} [options.inlineAll=false] - Forwarded to processAssets.
 * @param {boolean} [options.fetchRemote=true] - Forwarded to processAssets.
 * @param {string} [options.outDir] - Forwarded to processAssets.
 * @returns {Promise<{html: string, assets: *, warnings: *, toc: Array}>} The document HTML,
 *   the assets and warnings reported by processAssets, and the collected heading list.
 */
export async function renderMarkdown(markdownString, options = {}) {
  const {
    theme = 'light',
    baseDir,
    inlineAll = false,
    fetchRemote = true,
    outDir,
  } = options;

  // The heading rule records table-of-contents entries on this env object while rendering.
  const renderEnv = {};
  const rawHtml = createMarkdownRenderer().render(markdownString, renderEnv);
  const safeHtml = sanitize(rawHtml);

  const processed = await processAssets(safeHtml, {
    baseDir,
    outDir,
    inlineAll,
    fetchRemote,
  });

  const toc = Array.isArray(renderEnv.__toc) ? renderEnv.__toc : [];
  const tocContent = buildTocHtml(toc);

  const [template, cssInjection] = await Promise.all([
    fs.readFile(TEMPLATE_PATH, 'utf8'),
    loadStyles(theme),
  ]);

  const html = fillTemplate(template, {
    cssInjection,
    tocContent,
    content: processed.html,
  });

  return {
    html,
    assets: processed.assets,
    warnings: processed.warnings,
    toc,
  };
}
@@ -0,0 +1,79 @@
1
+ import sanitizeHtml from 'sanitize-html';
2
+
3
// Tags that survive sanitization; sanitize() passes this list as `allowedTags`.
// Notably absent: script, style, iframe, form controls and other embedded content.
const ALLOWED_TAGS = [
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
  'p',
  'div',
  'span',
  'a',
  'img',
  'table',
  'thead',
  'tbody',
  'tfoot',
  'tr',
  'th',
  'td',
  'ul',
  'ol',
  'li',
  'blockquote',
  'pre',
  'code',
  'em',
  'strong',
  'del',
  'hr',
  'br',
  'sup',
  'sub',
  'details',
  'summary',
  'section',
  'article',
  'figure',
  'figcaption',
];

// Attributes kept per tag; sanitize() passes this map as `allowedAttributes`.
// The '*' entry applies to every allowed tag; 'data-*' is a wildcard pattern.
// No `style` attribute and no `on*` event handlers are listed.
const ALLOWED_ATTRIBUTES = {
  '*': ['id', 'class', 'title', 'data-*'],
  img: ['src', 'alt', 'width', 'height', 'loading', 'decoding'],
  a: ['href', 'title', 'target', 'rel'],
  code: ['class'],
  pre: ['class'],
  table: ['class'],
  th: ['colspan', 'rowspan'],
  td: ['colspan', 'rowspan'],
};
53
+
54
/**
 * Sanitizes rendered HTML down to the allow-listed tags, attributes and URL
 * schemes, discarding disallowed tags.
 *
 * Links that open in a new tab (`target="_blank"`) get
 * `rel="noopener noreferrer"`, replacing any existing `rel` value.
 *
 * @param {string} htmlString - Untrusted HTML.
 * @returns {string} Sanitized HTML.
 */
export function sanitize(htmlString) {
  const hardenAnchor = (tagName, attribs) => {
    const rewritten = { ...attribs };
    if (rewritten.target === '_blank') {
      rewritten.rel = 'noopener noreferrer';
    }
    return { tagName, attribs: rewritten };
  };

  const sanitizerOptions = {
    allowedTags: ALLOWED_TAGS,
    allowedAttributes: ALLOWED_ATTRIBUTES,
    allowedSchemes: ['http', 'https', 'data', 'mailto'],
    // Per-tag scheme lists: images may use data: URIs, links may use mailto:.
    allowedSchemesByTag: {
      img: ['http', 'https', 'data'],
      a: ['http', 'https', 'mailto'],
    },
    allowProtocolRelative: false,
    disallowedTagsMode: 'discard',
    parser: {
      lowerCaseTags: true,
      lowerCaseAttributeNames: true,
    },
    transformTags: {
      a: hardenAnchor,
    },
  };

  return sanitizeHtml(htmlString, sanitizerOptions);
}