agentic-api 2.0.646 → 2.0.885
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/agents/prompts.d.ts +2 -3
- package/dist/src/agents/prompts.js +21 -118
- package/dist/src/agents/reducer.loaders.d.ts +103 -1
- package/dist/src/agents/reducer.loaders.js +164 -2
- package/dist/src/agents/reducer.types.d.ts +34 -3
- package/dist/src/agents/simulator.d.ts +32 -2
- package/dist/src/agents/simulator.executor.d.ts +15 -5
- package/dist/src/agents/simulator.executor.js +134 -67
- package/dist/src/agents/simulator.js +251 -8
- package/dist/src/agents/simulator.prompts.d.ts +55 -10
- package/dist/src/agents/simulator.prompts.js +305 -61
- package/dist/src/agents/simulator.types.d.ts +62 -1
- package/dist/src/agents/simulator.types.js +5 -0
- package/dist/src/agents/subagent.d.ts +128 -0
- package/dist/src/agents/subagent.js +231 -0
- package/dist/src/agents/worker.executor.d.ts +48 -0
- package/dist/src/agents/worker.executor.js +152 -0
- package/dist/src/execute/helpers.d.ts +3 -0
- package/dist/src/execute/helpers.js +222 -16
- package/dist/src/execute/responses.js +81 -55
- package/dist/src/execute/shared.d.ts +5 -0
- package/dist/src/execute/shared.js +27 -0
- package/dist/src/index.d.ts +2 -1
- package/dist/src/index.js +3 -1
- package/dist/src/llm/openai.js +8 -1
- package/dist/src/llm/pricing.js +2 -0
- package/dist/src/llm/xai.js +11 -6
- package/dist/src/prompts.d.ts +14 -0
- package/dist/src/prompts.js +41 -1
- package/dist/src/rag/rag.manager.d.ts +18 -3
- package/dist/src/rag/rag.manager.js +114 -12
- package/dist/src/rag/types.d.ts +3 -1
- package/dist/src/rules/git/git.e2e.helper.js +51 -4
- package/dist/src/rules/git/git.health.js +89 -56
- package/dist/src/rules/git/index.d.ts +2 -2
- package/dist/src/rules/git/index.js +22 -5
- package/dist/src/rules/git/repo.d.ts +64 -6
- package/dist/src/rules/git/repo.js +572 -141
- package/dist/src/rules/git/repo.pr.d.ts +11 -18
- package/dist/src/rules/git/repo.pr.js +82 -94
- package/dist/src/rules/git/repo.tools.d.ts +5 -0
- package/dist/src/rules/git/repo.tools.js +6 -1
- package/dist/src/rules/types.d.ts +0 -2
- package/dist/src/rules/utils.matter.js +1 -5
- package/dist/src/scrapper.d.ts +138 -25
- package/dist/src/scrapper.js +538 -160
- package/dist/src/stategraph/stategraph.d.ts +6 -2
- package/dist/src/stategraph/stategraph.js +21 -6
- package/dist/src/stategraph/types.d.ts +14 -6
- package/dist/src/types.d.ts +22 -0
- package/dist/src/utils.d.ts +24 -0
- package/dist/src/utils.js +84 -86
- package/package.json +3 -2
- package/dist/src/agents/semantic.d.ts +0 -4
- package/dist/src/agents/semantic.js +0 -19
- package/dist/src/execute/legacy.d.ts +0 -46
- package/dist/src/execute/legacy.js +0 -460
- package/dist/src/pricing.llm.d.ts +0 -5
- package/dist/src/pricing.llm.js +0 -14
package/dist/src/scrapper.js
CHANGED
|
@@ -3,9 +3,10 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
3
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
4
|
};
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.extractCaptcha = extractCaptcha;
|
|
7
6
|
exports.callLLMForParsingPDF = callLLMForParsingPDF;
|
|
8
7
|
exports.html2markdown = html2markdown;
|
|
8
|
+
exports.pdftotext_poppler = pdftotext_poppler;
|
|
9
|
+
exports.pdftotext_mupdf = pdftotext_mupdf;
|
|
9
10
|
exports.pdf2markdown = pdf2markdown;
|
|
10
11
|
const child_process_1 = require("child_process");
|
|
11
12
|
const util_1 = require("util");
|
|
@@ -13,133 +14,351 @@ const path_1 = __importDefault(require("path"));
|
|
|
13
14
|
const fs_1 = __importDefault(require("fs"));
|
|
14
15
|
const jsdom_1 = require("jsdom");
|
|
15
16
|
const readability_1 = require("@mozilla/readability");
|
|
16
|
-
const pricing_1 = require("./llm/pricing");
|
|
17
17
|
const prompts_1 = require("./prompts");
|
|
18
18
|
const utils_1 = require("./utils");
|
|
19
|
-
const execute_1 = require("./execute");
|
|
20
19
|
const utils_matter_1 = require("./rules/utils.matter");
|
|
21
|
-
|
|
20
|
+
const reducer_core_1 = require("./agents/reducer.core");
|
|
22
21
|
const execAsync = (0, util_1.promisify)(child_process_1.exec);
|
|
23
|
-
const execFileAsync = (0, util_1.promisify)(child_process_1.execFile);
|
|
24
22
|
const randomFile = (ext = '') => {
|
|
25
23
|
const random = () => Math.random() * 1000 | 0;
|
|
26
24
|
return `temp-${random()}-${random()}${ext}`;
|
|
27
25
|
};
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
const
|
|
35
|
-
const
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
const
|
|
43
|
-
|
|
26
|
+
/** Formats rows of cell strings as a GFM markdown table. */
|
|
27
|
+
function gfmTable(rows) {
|
|
28
|
+
if (rows.length === 0)
|
|
29
|
+
return '';
|
|
30
|
+
const maxCols = Math.max(...rows.map(r => r.length));
|
|
31
|
+
const pad = (r) => [...r, ...new Array(maxCols - r.length).fill('')];
|
|
32
|
+
const fmt = (cells) => `| ${cells.join(' | ')} |`;
|
|
33
|
+
const [header, ...body] = rows.map(pad);
|
|
34
|
+
return [fmt(header), fmt(header.map(() => '---')), ...body.map(fmt)].join('\n');
|
|
35
|
+
}
|
|
36
|
+
// ─── Geometric text reconstruction ────────────────────────────────────────────
|
|
37
|
+
/** Groups text items into rows by proximity on the Y axis (top-to-bottom). */
|
|
38
|
+
function groupByY(items, tolerance = 3) {
|
|
39
|
+
const buckets = new Map();
|
|
40
|
+
for (const item of items) {
|
|
41
|
+
const key = [...buckets.keys()].find(k => Math.abs(k - item.y) <= tolerance);
|
|
42
|
+
if (key !== undefined) {
|
|
43
|
+
buckets.get(key).push(item);
|
|
44
|
+
}
|
|
45
|
+
else {
|
|
46
|
+
buckets.set(item.y, [item]);
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
return [...buckets.entries()]
|
|
50
|
+
.sort(([ya], [yb]) => yb - ya)
|
|
51
|
+
.map(([y, rowItems]) => ({ y, items: rowItems.sort((a, b) => a.x - b.x) }));
|
|
52
|
+
}
|
|
53
|
+
function detectColumnBoundaries(rows, gap = 15) {
|
|
54
|
+
const allX = rows.flatMap(r => r.items.map(i => i.x)).sort((a, b) => a - b);
|
|
55
|
+
if (allX.length === 0)
|
|
56
|
+
return [];
|
|
57
|
+
const cols = [allX[0]];
|
|
58
|
+
for (let i = 1; i < allX.length; i++) {
|
|
59
|
+
if (allX[i] - allX[i - 1] > gap)
|
|
60
|
+
cols.push(allX[i]);
|
|
61
|
+
}
|
|
62
|
+
return cols;
|
|
63
|
+
}
|
|
64
|
+
function assignToColumns(row, cols) {
|
|
65
|
+
const cells = new Array(cols.length).fill('');
|
|
66
|
+
for (const item of row.items) {
|
|
67
|
+
let colIdx = 0, minDist = Math.abs(item.x - cols[0]);
|
|
68
|
+
for (let i = 1; i < cols.length; i++) {
|
|
69
|
+
const dist = Math.abs(item.x - cols[i]);
|
|
70
|
+
if (dist < minDist) {
|
|
71
|
+
minDist = dist;
|
|
72
|
+
colIdx = i;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
cells[colIdx] = cells[colIdx] ? `${cells[colIdx]} ${item.str}` : item.str;
|
|
76
|
+
}
|
|
77
|
+
return cells;
|
|
44
78
|
}
|
|
45
79
|
/**
|
|
46
|
-
*
|
|
80
|
+
* Reconstructs page text **block by block** and splits it into three zones.
|
|
47
81
|
*
|
|
48
|
-
*
|
|
49
|
-
*
|
|
50
|
-
*
|
|
51
|
-
*
|
|
52
|
-
*
|
|
82
|
+
* mupdf's `segment` option places each column / region in its own top-level
|
|
83
|
+
* block. Processing blocks independently prevents multi-column layouts from
|
|
84
|
+
* being interleaved into false GFM tables.
|
|
85
|
+
*
|
|
86
|
+
* Zone classification by median Y of the block's items vs `pageHeight`:
|
|
87
|
+
* - **header zone** : medianY / pageHeight < 0.12
|
|
88
|
+
* - **footer zone** : medianY / pageHeight > 0.88
|
|
89
|
+
* - **body** : everything else
|
|
90
|
+
*
|
|
91
|
+
* When `pageHeight` is 0 (unknown) every block is treated as body.
|
|
53
92
|
*/
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
const
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
const messages = [
|
|
65
|
-
{ role: "system",
|
|
66
|
-
content: prompts_1.htmlToMarkdownPrompt }
|
|
67
|
-
];
|
|
68
|
-
// console.log('🌶️ DEBUG: callLLMForParsingPDF -- SYSTEM:', messages[0].content);
|
|
69
|
-
// console.log('🌶️ DEBUG: callLLMForParsingPDF -- USER:', messages[1].content);
|
|
70
|
-
// WARNING: o3-mini is buggy with "Marche à suivre nouveau bail.pdf"
|
|
71
|
-
const response = await (0, execute_1.executeQuery)({
|
|
72
|
-
query: `Structure le contenu exhaustif en Markdown sans rien inventer, et avec les liens intégrés correctement.\n Nous sommes le ${today}.\n${linkPrefix}\nLe contenu du document est:\n${pdfDataAsString}`,
|
|
73
|
-
model,
|
|
74
|
-
messages,
|
|
75
|
-
stdout: execute_1.DummyWritable,
|
|
76
|
-
verbose: false
|
|
77
|
-
});
|
|
78
|
-
// Récupérer la réponse markdown
|
|
79
|
-
const markdown = response.content;
|
|
80
|
-
console.log(`Markdown 💰 cost: ${response.usage.cost}`);
|
|
93
|
+
function reconstructFromBlocks(parsed, pageHeight = 0) {
|
|
94
|
+
const empty = { header: '', body: '', footer: '' };
|
|
95
|
+
const root = parsed;
|
|
96
|
+
if (!root || !Array.isArray(root.blocks) || root.blocks.length === 0) {
|
|
97
|
+
const items = flattenMupdfTextItems(parsed);
|
|
98
|
+
return { ...empty, body: reconstructPageText(groupByY(items).reverse()) };
|
|
99
|
+
}
|
|
100
|
+
const headerParts = [];
|
|
101
|
+
const bodyParts = [];
|
|
102
|
+
const footerParts = [];
|
|
81
103
|
//
|
|
82
|
-
//
|
|
83
|
-
|
|
84
|
-
|
|
104
|
+
// Classify at item level (not block level) so that a single large block
|
|
105
|
+
// spanning the full page (common in magazine layouts) is still split
|
|
106
|
+
// into its header/body/footer zones correctly.
|
|
107
|
+
// mupdf Y-from-top: small Y = top of page.
|
|
108
|
+
// header zone : Y / pageHeight < 0.12
|
|
109
|
+
// footer zone : Y / pageHeight > 0.88
|
|
110
|
+
// body : everything else
|
|
111
|
+
const headerItems = [];
|
|
112
|
+
const footerItems = [];
|
|
113
|
+
for (const block of root.blocks) {
|
|
114
|
+
const items = flattenMupdfTextItems(block);
|
|
115
|
+
if (items.length === 0)
|
|
116
|
+
continue;
|
|
117
|
+
if (pageHeight > 0) {
|
|
118
|
+
const hItems = items.filter(i => i.y / pageHeight < 0.12);
|
|
119
|
+
const fItems = items.filter(i => i.y / pageHeight > 0.88);
|
|
120
|
+
const bItems = items.filter(i => { const r = i.y / pageHeight; return r >= 0.12 && r <= 0.88; });
|
|
121
|
+
headerItems.push(...hItems);
|
|
122
|
+
footerItems.push(...fItems);
|
|
123
|
+
if (bItems.length > 0) {
|
|
124
|
+
const text = reconstructPageText(groupByY(bItems).reverse());
|
|
125
|
+
if (text)
|
|
126
|
+
bodyParts.push(text);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
else {
|
|
130
|
+
const text = reconstructPageText(groupByY(items).reverse());
|
|
131
|
+
if (text)
|
|
132
|
+
bodyParts.push(text);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
if (headerItems.length > 0) {
|
|
136
|
+
const text = reconstructPageText(groupByY(headerItems).reverse());
|
|
137
|
+
if (text)
|
|
138
|
+
headerParts.push(text);
|
|
139
|
+
}
|
|
140
|
+
if (footerItems.length > 0) {
|
|
141
|
+
const text = reconstructPageText(groupByY(footerItems).reverse());
|
|
142
|
+
if (text)
|
|
143
|
+
footerParts.push(text);
|
|
144
|
+
}
|
|
145
|
+
return {
|
|
146
|
+
header: headerParts.join('\n').trim(),
|
|
147
|
+
body: bodyParts.join('\n\n').trim(),
|
|
148
|
+
footer: footerParts.join('\n').trim(),
|
|
149
|
+
};
|
|
85
150
|
}
|
|
86
151
|
/**
|
|
87
|
-
*
|
|
152
|
+
* Promotes positional header/footer candidates to `page.header` / `page.footer`
|
|
153
|
+
* only when the same normalised pattern appears on **≥ 3 pages**.
|
|
88
154
|
*
|
|
89
|
-
*
|
|
90
|
-
*
|
|
91
|
-
*
|
|
92
|
-
*
|
|
155
|
+
* Numbers are normalised (`\d{2,4}` → `{N}`) so that incrementing folios
|
|
156
|
+
* (`057`, `058`, `059`) map to the same pattern.
|
|
157
|
+
*
|
|
158
|
+
* False positives (unique first-page titles that happen to sit in the top zone)
|
|
159
|
+
* are re-injected into `page.text` so nothing is silently lost.
|
|
93
160
|
*/
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
161
|
+
/**
|
|
162
|
+
* Parses GFM tables in a block of text and returns their dimensions.
|
|
163
|
+
*
|
|
164
|
+
* Used to populate `Page.tables` so that `callLLMForParsingPDF` can
|
|
165
|
+
* automatically select a stronger model for pages with complex tables
|
|
166
|
+
* (many columns or many rows).
|
|
167
|
+
*/
|
|
168
|
+
function detectTableStats(text) {
|
|
169
|
+
const tables = [];
|
|
170
|
+
const lines = text.split('\n');
|
|
171
|
+
let tableLines = [];
|
|
172
|
+
const flush = () => {
|
|
173
|
+
if (tableLines.length < 2) {
|
|
174
|
+
tableLines = [];
|
|
175
|
+
return;
|
|
176
|
+
}
|
|
177
|
+
//
|
|
178
|
+
// Separator lines (e.g. `| --- | --- |`) are structural — exclude from row count
|
|
179
|
+
const SEPARATOR_RE = /^\|\s*[-:]+[\s|:-]*\|$/;
|
|
180
|
+
const dataLines = tableLines.filter(l => !SEPARATOR_RE.test(l.trim()));
|
|
181
|
+
const cols = (tableLines[0].match(/\|/g) ?? []).length - 1;
|
|
182
|
+
if (cols >= 2 && dataLines.length >= 1) {
|
|
183
|
+
tables.push({ rows: dataLines.length, cols });
|
|
184
|
+
}
|
|
185
|
+
tableLines = [];
|
|
186
|
+
};
|
|
187
|
+
for (const line of lines) {
|
|
188
|
+
if (/^\|/.test(line) && line.trim().endsWith('|')) {
|
|
189
|
+
tableLines.push(line);
|
|
190
|
+
}
|
|
191
|
+
else {
|
|
192
|
+
flush();
|
|
193
|
+
}
|
|
109
194
|
}
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
195
|
+
flush();
|
|
196
|
+
return tables;
|
|
197
|
+
}
|
|
198
|
+
function detectRunningHeaders(pages) {
|
|
199
|
+
const normalize = (s) => s.replace(/\d{2,4}/g, '{N}').trim();
|
|
200
|
+
//
|
|
201
|
+
// Count how many pages share each normalised pattern
|
|
202
|
+
const headerCount = new Map();
|
|
203
|
+
const footerCount = new Map();
|
|
204
|
+
for (const p of pages) {
|
|
205
|
+
if (p._rawHeader)
|
|
206
|
+
headerCount.set(normalize(p._rawHeader), (headerCount.get(normalize(p._rawHeader)) ?? 0) + 1);
|
|
207
|
+
if (p._rawFooter)
|
|
208
|
+
footerCount.set(normalize(p._rawFooter), (footerCount.get(normalize(p._rawFooter)) ?? 0) + 1);
|
|
113
209
|
}
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
210
|
+
return pages.map(({ _rawHeader, _rawFooter, ...page }) => {
|
|
211
|
+
let text = page.text;
|
|
212
|
+
let header;
|
|
213
|
+
let footer;
|
|
214
|
+
if (_rawHeader) {
|
|
215
|
+
if ((headerCount.get(normalize(_rawHeader)) ?? 0) >= 3) {
|
|
216
|
+
header = _rawHeader;
|
|
217
|
+
}
|
|
218
|
+
else {
|
|
219
|
+
//
|
|
220
|
+
// Not a running header — keep in body text
|
|
221
|
+
text = `${_rawHeader}\n\n${text}`;
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
if (_rawFooter) {
|
|
225
|
+
if ((footerCount.get(normalize(_rawFooter)) ?? 0) >= 3) {
|
|
226
|
+
footer = _rawFooter;
|
|
227
|
+
}
|
|
228
|
+
else {
|
|
229
|
+
text = `${text}\n\n${_rawFooter}`;
|
|
124
230
|
}
|
|
125
231
|
}
|
|
232
|
+
return { ...page, text: text.trim(), header, footer };
|
|
233
|
+
});
|
|
234
|
+
}
|
|
235
|
+
/** Reconstructs plain text with heuristic table detection for untagged PDFs. */
|
|
236
|
+
function reconstructPageText(rows) {
|
|
237
|
+
const LIST_MARKER_RE = /^([●•◦▪▸✅✓✗►]|\d{1,3}\.?|[a-zA-Z]\.)$/;
|
|
238
|
+
const isListItem = (r) => r.items.length === 2 && (r.items[0].str.trim().length <= 3 || LIST_MARKER_RE.test(r.items[0].str.trim()));
|
|
239
|
+
const fmtTableRows = (rows2) => {
|
|
240
|
+
const cols = detectColumnBoundaries(rows2);
|
|
241
|
+
if (cols.length < 2)
|
|
242
|
+
return rows2.map(r => r.items.map(i => i.str).join(' ')).join('\n');
|
|
243
|
+
return gfmTable(rows2.map(r => assignToColumns(r, cols)));
|
|
244
|
+
};
|
|
245
|
+
const chunks = [];
|
|
246
|
+
let tableCandidate = [];
|
|
247
|
+
const flush = () => {
|
|
248
|
+
if (tableCandidate.length === 0)
|
|
249
|
+
return;
|
|
250
|
+
const realTable = tableCandidate.filter(r => !isListItem(r) && r.items.length >= 2);
|
|
251
|
+
chunks.push(realTable.length >= 2 ? fmtTableRows(tableCandidate) : tableCandidate.map(r => r.items.map(i => i.str).join(' ')).join('\n'));
|
|
252
|
+
tableCandidate = [];
|
|
253
|
+
};
|
|
254
|
+
for (const row of rows) {
|
|
255
|
+
if (row.items.length >= 2 && !isListItem(row)) {
|
|
256
|
+
tableCandidate.push(row);
|
|
257
|
+
}
|
|
258
|
+
else {
|
|
259
|
+
flush();
|
|
260
|
+
chunks.push(row.items.map(i => i.str).join(' '));
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
flush();
|
|
264
|
+
return chunks.join('\n').replace(/\n{3,}/g, '\n\n').trim();
|
|
265
|
+
}
|
|
266
|
+
// ─── Existing helpers ─────────────────────────────────────────────────────────
|
|
267
|
+
/**
|
|
268
|
+
* Converts extracted PDF content to clean Markdown via LLM.
|
|
269
|
+
*
|
|
270
|
+
* Two paths depending on the `pdfData` type:
|
|
271
|
+
*
|
|
272
|
+
* **`Page[]` (mupdf path)** — `MapLLM.reduce`, one page per chunk.
|
|
273
|
+
* Each page is processed by `mupdfPagePrompt` (heading normalisation, broken-cell
|
|
274
|
+
* fusion, repeated-header removal). No frontmatter is added here; the caller
|
|
275
|
+
* (`pdf2markdown`) prepends the single YAML block.
|
|
276
|
+
*
|
|
277
|
+
* NOTE: `finalReduce` is intentionally disabled — it is reserved for a future
|
|
278
|
+
* "N-page light summary" feature where a second LLM pass synthesises the whole
|
|
279
|
+
* document into a shorter version.
|
|
280
|
+
*
|
|
281
|
+
* A raw `string` (e.g. from `html2markdown`) is automatically wrapped into a
|
|
282
|
+
* single `Page` so both callers share the exact same code path.
|
|
283
|
+
*
|
|
284
|
+
* @param inputfile - Original file path (used for logging only).
|
|
285
|
+
* @param pdfData - Either a `Page[]` array (mupdf) or a raw string.
|
|
286
|
+
* @param links - External links appended as `## Liens` footer (string path).
|
|
287
|
+
* @param model - LLM model alias (default: `'MEDIUM-fast'`).
|
|
288
|
+
*/
|
|
289
|
+
async function callLLMForParsingPDF(inputfile, pdfData, links = [], model = 'LOW-fast') {
|
|
290
|
+
//
|
|
291
|
+
// Normalise input: a raw string becomes a single-page array.
|
|
292
|
+
// Links (html2markdown path) are appended as a Liens footer so the
|
|
293
|
+
// MapLLM digest can embed them naturally in context.
|
|
294
|
+
const pages = Array.isArray(pdfData)
|
|
295
|
+
? pdfData
|
|
296
|
+
: [{
|
|
297
|
+
pageNumber: 1,
|
|
298
|
+
text: links.length > 0
|
|
299
|
+
? `${pdfData}\n\n## Liens\n\n${links.map(l => `- [${l.text}](${l.href})`).join('\n')}`
|
|
300
|
+
: pdfData,
|
|
301
|
+
tables: [],
|
|
302
|
+
images: [],
|
|
303
|
+
}];
|
|
304
|
+
//
|
|
305
|
+
// Auto model upgrade: count total `|` across all pages.
|
|
306
|
+
// A 4-col × 6-row table (4+1) × (6+2) = ≈ 40 `|` — threshold 40 catches any non-trivial table.
|
|
307
|
+
// Only upgrades from LOW-fast; explicit caller models are respected.
|
|
308
|
+
const PIPE_THRESHOLD = 40;
|
|
309
|
+
const totalPipes = pages.reduce((sum, p) => sum + (p.text.match(/\|/g) ?? []).length, 0);
|
|
310
|
+
const effectiveModel = totalPipes > PIPE_THRESHOLD ? 'HIGH-fast' : model;
|
|
311
|
+
if (totalPipes > PIPE_THRESHOLD) {
|
|
312
|
+
console.log(`pdf: ${totalPipes} pipes detected → upgrading model LOW-fast → HIGH-fast`);
|
|
126
313
|
}
|
|
314
|
+
const pageLoader = {
|
|
315
|
+
loadNativeChunk: async (pos) => ({
|
|
316
|
+
content: pages[pos].text,
|
|
317
|
+
eof: pos + 1 >= pages.length,
|
|
318
|
+
position: pos + 1,
|
|
319
|
+
}),
|
|
320
|
+
};
|
|
321
|
+
//
|
|
322
|
+
// finalReduce: false — reserved for future "N-page light summary" feature
|
|
323
|
+
const mapper = new reducer_core_1.MapLLM(pageLoader, { finalReduce: false });
|
|
324
|
+
const result = await mapper.reduce((res, current) => {
|
|
325
|
+
const section = typeof current === 'string' ? current : JSON.stringify(current);
|
|
326
|
+
res.acc = res.acc ? `${res.acc}\n\n---\n\n${section}` : section;
|
|
327
|
+
return res;
|
|
328
|
+
}, {
|
|
329
|
+
acc: '',
|
|
330
|
+
config: {
|
|
331
|
+
digestPrompt: prompts_1.mupdfPagePrompt,
|
|
332
|
+
reducePrompt: '',
|
|
333
|
+
},
|
|
334
|
+
model: effectiveModel,
|
|
335
|
+
verbose: true,
|
|
336
|
+
});
|
|
337
|
+
const raw = typeof result.acc === 'string' ? result.acc : JSON.stringify(result.acc);
|
|
338
|
+
const clean = raw.replace(/<thinking>[\s\S]*?<\/thinking>/g, '').trim();
|
|
339
|
+
return { markdown: clean, cost: 0 };
|
|
127
340
|
}
|
|
341
|
+
/**
|
|
342
|
+
* Extracts hyperlinks from a PDF file by converting it to HTML and parsing the links.
|
|
343
|
+
*
|
|
344
|
+
* @param {string} pdfPath - The file path to the PDF document to extract links from
|
|
345
|
+
* @param {string} output - The directory output where temporary files will be created
|
|
346
|
+
* @returns {Promise<Array<{text: string, href: string}>>} - A promise that resolves to an array of link objects
|
|
347
|
+
*/
|
|
128
348
|
function cleanHTML(html) {
|
|
129
349
|
const dom = new jsdom_1.JSDOM(html);
|
|
130
|
-
// Instancie Readability avec le document
|
|
131
350
|
const reader = new readability_1.Readability(dom.window.document);
|
|
132
351
|
const article = reader.parse();
|
|
133
352
|
return article?.content || '';
|
|
134
353
|
}
|
|
135
354
|
/**
|
|
136
|
-
* Parses an HTML file and converts it to markdown using
|
|
355
|
+
* Parses an HTML file and converts it to markdown using LLM.
|
|
137
356
|
*
|
|
138
357
|
* @param {string} output - The directory path where the output markdown file will be saved.
|
|
139
358
|
* @param {string} file - The path to the HTML file to be parsed.
|
|
140
359
|
* @param {string} service - The service name used as part of the output filename output.
|
|
141
360
|
* @param {string} model - The model to use for parsing (default: "MEDIUM-fast")
|
|
142
|
-
* @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the
|
|
361
|
+
* @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the API call.
|
|
143
362
|
*/
|
|
144
363
|
async function html2markdown(output, file, service, model = "MEDIUM-fast") {
|
|
145
364
|
const filename = (0, utils_1.toSlug)(path_1.default.basename(file, path_1.default.extname(file)));
|
|
@@ -150,89 +369,248 @@ async function html2markdown(output, file, service, model = "MEDIUM-fast") {
|
|
|
150
369
|
fs_1.default.writeFileSync(path_1.default.join(output, `${outputfile + filename}.md`), markdown, { encoding: 'utf8', flag: 'w' });
|
|
151
370
|
return { markdown, cost };
|
|
152
371
|
}
|
|
372
|
+
// ─── PDF text extraction ──────────────────────────────────────────────────────
|
|
153
373
|
/**
|
|
154
|
-
*
|
|
155
|
-
*
|
|
156
|
-
*
|
|
157
|
-
*
|
|
158
|
-
*
|
|
159
|
-
*
|
|
160
|
-
*
|
|
161
|
-
*
|
|
162
|
-
*
|
|
163
|
-
*
|
|
374
|
+
* Extracts plain text from a PDF using the system `pdftotext` binary (poppler-utils).
|
|
375
|
+
*
|
|
376
|
+
* - Pages are delimited by form-feed (\f) characters in the binary's output.
|
|
377
|
+
* - Excessive blank lines are normalised (3+ → 2).
|
|
378
|
+
* - Images are NOT extracted (always []).
|
|
379
|
+
*
|
|
380
|
+
* NOTE: Better alternative is `pdftotext_pdfjs` which uses Mozilla's PDF engine
|
|
381
|
+
* to extract text + images + links in a single Node.js-native pass, with better
|
|
382
|
+
* table reconstruction for complex layouts. See `pdftotext_pdfjs` for details.
|
|
383
|
+
*
|
|
384
|
+
* @param {string} pdfPath - Absolute path to the PDF file.
|
|
385
|
+
* @param {string} outputDir - Directory used for temporary files.
|
|
386
|
+
* @returns {Promise<Page[]>} One `Page` per PDF page, text-only.
|
|
164
387
|
*/
|
|
165
|
-
async function
|
|
388
|
+
async function pdftotext_poppler(pdfPath, outputDir) {
|
|
389
|
+
const tempOut = path_1.default.join(outputDir, `${randomFile()}.txt`);
|
|
390
|
+
try {
|
|
391
|
+
//
|
|
392
|
+
// Omit -nopgbrk so pdftotext emits \f between pages
|
|
393
|
+
await execAsync(`pdftotext -nodiag "${pdfPath}" "${tempOut}"`);
|
|
394
|
+
const rawText = fs_1.default.readFileSync(tempOut, 'utf8');
|
|
395
|
+
//
|
|
396
|
+
// \f (form feed = \x0C) is the page delimiter; filter empty trailing entries
|
|
397
|
+
return rawText
|
|
398
|
+
.split('\f')
|
|
399
|
+
.filter(p => p.trim() !== '')
|
|
400
|
+
.map((raw, idx) => {
|
|
401
|
+
const text = raw.replace(/\n{3,}/g, '\n\n').trim();
|
|
402
|
+
return { pageNumber: idx + 1, text, tables: detectTableStats(text), images: [] };
|
|
403
|
+
});
|
|
404
|
+
}
|
|
405
|
+
finally {
|
|
406
|
+
if (fs_1.default.existsSync(tempOut))
|
|
407
|
+
fs_1.default.unlinkSync(tempOut);
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
/**
|
|
411
|
+
* Walks the mupdf `asJSON()` tree and returns all non-empty text lines as
|
|
412
|
+
* `RawTextItem[]` (same shape used by the pdfjs geometric reconstruction).
|
|
413
|
+
*
|
|
414
|
+
* Coordinate space: mupdf uses screen coords (Y from top, increases downward).
|
|
415
|
+
* `line.y` is the **baseline** y — the same semantic as pdfjs `transform[5]`.
|
|
416
|
+
* Callers must call `groupByY(items).reverse()` to get top-to-bottom order.
|
|
417
|
+
*/
|
|
418
|
+
function flattenMupdfTextItems(node) {
|
|
419
|
+
const items = [];
|
|
420
|
+
function walk(n) {
|
|
421
|
+
if (!n || typeof n !== 'object')
|
|
422
|
+
return;
|
|
423
|
+
const obj = n;
|
|
424
|
+
if (obj.type === 'text' && Array.isArray(obj.lines)) {
|
|
425
|
+
for (const line of obj.lines) {
|
|
426
|
+
const text = line.text;
|
|
427
|
+
if (!text?.trim())
|
|
428
|
+
continue;
|
|
429
|
+
const bbox = line.bbox;
|
|
430
|
+
if (!bbox)
|
|
431
|
+
continue;
|
|
432
|
+
//
|
|
433
|
+
// PDFs with fonts missing ToUnicode CMaps produce U+FFFD replacement
|
|
434
|
+
// characters for every undecodable glyph. Drop lines where more than
|
|
435
|
+
// 40% of characters are replacement chars (decorative/unreadable text)
|
|
436
|
+
// and strip residual runs from otherwise readable lines.
|
|
437
|
+
const replacements = (text.match(/\uFFFD/g) ?? []).length;
|
|
438
|
+
if (replacements / text.length > 0.4)
|
|
439
|
+
continue;
|
|
440
|
+
const cleaned = text.replace(/\uFFFD+/g, '').trim();
|
|
441
|
+
if (!cleaned)
|
|
442
|
+
continue;
|
|
443
|
+
items.push({
|
|
444
|
+
str: cleaned,
|
|
445
|
+
//
|
|
446
|
+
// Use line.x/y (baseline) when available, fall back to bbox top-left
|
|
447
|
+
x: typeof line.x === 'number' ? line.x : bbox.x,
|
|
448
|
+
y: typeof line.y === 'number' ? line.y : bbox.y + bbox.h,
|
|
449
|
+
width: bbox.w,
|
|
450
|
+
height: bbox.h,
|
|
451
|
+
});
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
//
|
|
455
|
+
// Recurse: structure nodes expose `contents`, top-level page exposes `blocks`
|
|
456
|
+
if (Array.isArray(obj.contents))
|
|
457
|
+
for (const c of obj.contents)
|
|
458
|
+
walk(c);
|
|
459
|
+
if (Array.isArray(obj.blocks))
|
|
460
|
+
for (const b of obj.blocks)
|
|
461
|
+
walk(b);
|
|
462
|
+
}
|
|
463
|
+
walk(node);
|
|
464
|
+
return items;
|
|
465
|
+
}
|
|
466
|
+
/**
|
|
467
|
+
* Resolves the absolute path to the `mupdf-extract.mjs` ESM worker script.
|
|
468
|
+
*
|
|
469
|
+
* Works in both ts-jest context (`__dirname` = `src/`) and compiled context
|
|
470
|
+
* (`__dirname` = `dist/src/`) since both paths resolve to `<package>/`.
|
|
471
|
+
*/
|
|
472
|
+
function resolveMupdfScript() {
|
|
473
|
+
//
|
|
474
|
+
// ts-jest: __dirname = …/agentic-api/src → 1 level up → package root
|
|
475
|
+
const fromSrc = path_1.default.resolve(__dirname, '..', 'mupdf-extract.mjs');
|
|
166
476
|
//
|
|
167
|
-
//
|
|
477
|
+
// post-build copy (cp mupdf-extract.mjs dist/):
|
|
478
|
+
// __dirname = …/agentic-api/dist/src → 1 level up → dist/
|
|
479
|
+
const fromDistFlat = path_1.default.resolve(__dirname, '..', 'mupdf-extract.mjs');
|
|
480
|
+
//
|
|
481
|
+
// fallback — repo root deployed without cp step:
|
|
482
|
+
// __dirname = …/agentic-api/dist/src → 2 levels up → package root
|
|
483
|
+
const fromDistRoot = path_1.default.resolve(__dirname, '..', '..', 'mupdf-extract.mjs');
|
|
484
|
+
for (const candidate of [fromSrc, fromDistFlat, fromDistRoot]) {
|
|
485
|
+
if (fs_1.default.existsSync(candidate))
|
|
486
|
+
return candidate;
|
|
487
|
+
}
|
|
488
|
+
throw new Error(`mupdf-extract.mjs not found. Searched:\n` +
|
|
489
|
+
` ${fromSrc}\n ${fromDistFlat}\n ${fromDistRoot}`);
|
|
490
|
+
}
|
|
491
|
+
/**
|
|
492
|
+
* Extracts text, reconstructed tables, links, and optionally page-raster images
|
|
493
|
+
* from a PDF using the **mupdf** npm package (WASM build of the MuPDF C library).
|
|
494
|
+
*
|
|
495
|
+
* Key advantages over the poppler engine:
|
|
496
|
+
* - `table-hunt` detects tables geometrically even in **untagged** PDFs.
|
|
497
|
+
* - `segment` splits the page into logical reading-order blocks.
|
|
498
|
+
* - Significantly faster than pdfjs for large documents.
|
|
499
|
+
* - No shell binary dependency (pure WASM, runs anywhere Node.js does).
|
|
500
|
+
*
|
|
501
|
+
* Images (opt-in via `withImages: true`): each page is rasterised at 1.5× scale
|
|
502
|
+
* (≈ 113 DPI). The `imageFormat` option controls encoding:
|
|
503
|
+
*
|
|
504
|
+
* | format | size/page (base64) | notes |
|
|
505
|
+
* |-------------|-------------------|--------------------------------|
|
|
506
|
+
* | `'rgb'` | ≈ 4.4 MB | raw RGB, lossless, large |
|
|
507
|
+
* | `'gray'` | ≈ 1.5 MB | raw grayscale, 3× smaller |
|
|
508
|
+
* | `'jpeg'` | ≈ 100–200 KB | JPEG quality 75, 31× smaller |
|
|
509
|
+
*
|
|
510
|
+
* Disabled by default because image data quickly exhausts stdout buffers for
|
|
511
|
+
* large documents. Use `jpeg` for production with vision models.
|
|
512
|
+
*
|
|
513
|
+
* NOTE: `mupdf` is ESM-only. Extraction is delegated to a standalone
|
|
514
|
+
* `mupdf-extract.mjs` worker spawned via `execAsync`, which avoids any
|
|
515
|
+
* ESM/CJS interoperability issues in the main process and under ts-jest.
|
|
516
|
+
*
|
|
517
|
+
* @param {string} pdfPath - Absolute path to the PDF file.
|
|
518
|
+
* @param {object} [options]
|
|
519
|
+
* @param {boolean} [options.withImages=false] - Rasterise each page.
|
|
520
|
+
* @param {'rgb'|'gray'|'jpeg'} [options.imageFormat='rgb'] - Pixel encoding.
|
|
521
|
+
* @returns {Promise<Page[]>} One `Page` per PDF page with text, GFM tables, and optional images.
|
|
522
|
+
*/
|
|
523
|
+
async function pdftotext_mupdf(pdfPath, options = {}) {
    const scriptPath = resolveMupdfScript();
    const fmt = options.imageFormat ?? 'rgb';
    const imageFlags = options.withImages ? ` --with-images --image-format=${fmt}` : '';
    //
    // maxBuffer scales with expected image size:
    // jpeg ≈ 150KB/page, gray ≈ 1.5MB/page, rgb ≈ 4.4MB/page (base64).
    // 32 MB is plenty for text-only or jpeg; rgb on large docs needs more.
    const maxBuffer = options.withImages && fmt === 'rgb' ? 256 * 1024 * 1024 : 32 * 1024 * 1024;
    //
    // NOTE(review): `pdfPath` and `scriptPath` are interpolated into a shell
    // command. The surrounding `"` quotes defend against spaces but not against
    // embedded quotes/backticks/`$` — callers must pass trusted paths. Consider
    // an execFile-style spawn if untrusted input is ever possible.
    const { stdout } = await execAsync(`node "${scriptPath}" "${pdfPath}"${imageFlags}`, { maxBuffer });
    let result;
    try {
        result = JSON.parse(stdout);
    }
    catch (err) {
        // Surface a useful message instead of a bare SyntaxError when the worker
        // emits malformed output (e.g. crashed mid-write or logged to stdout).
        throw new Error(`mupdf worker returned invalid JSON for ${pdfPath}`, { cause: err });
    }
    if (!Array.isArray(result.pages)) {
        throw new Error(`mupdf worker output is missing the "pages" array for ${pdfPath}`);
    }
    //
    // 1. Build raw pages — header/footer candidates kept separately
    const rawPages = result.pages.map(p => {
        let parsed;
        try {
            parsed = JSON.parse(p.json);
        }
        catch {
            // Per-page structured layout is best-effort: fall back to empty.
            parsed = {};
        }
        //
        // Block-by-block: each segment processed independently (multi-column safe).
        // pageHeight classifies top/bottom 12% as header/footer candidate zones.
        const { header: _rawHeader, body, footer: _rawFooter } = reconstructFromBlocks(parsed, p.pageHeight);
        let text = body;
        // Guard: the worker may omit `links` on pages without annotations.
        if (Array.isArray(p.links) && p.links.length > 0) {
            text += '\n\n## Liens\n\n' + p.links.map(u => `- ${u}`).join('\n');
        }
        const images = p.image
            ? [{
                    type: p.image.type,
                    width: p.image.width,
                    height: p.image.height,
                    data: Buffer.from(p.image.data, 'base64'),
                }]
            : [];
        return { pageNumber: p.pageNumber, text, _rawHeader, _rawFooter, tables: detectTableStats(text), images };
    });
    //
    // 2. Promote header/footer candidates that repeat on ≥ 3 pages.
    // Unique occurrences (e.g. first-page title) are re-injected into body.
    return detectRunningHeaders(rawPages);
}
|
|
567
|
+
// ─── PDF → Markdown ───────────────────────────────────────────────────────────
|
|
568
|
+
/**
 * Converts a PDF to a structured Markdown file.
 *
 * Pipeline:
 * 1. `pdftotext_mupdf` (or poppler) → `Page[]`
 * 2. `callLLMForParsingPDF` — MapLLM.reduce, one page per chunk
 * 3. Prepend a **single** YAML frontmatter block and write to `outputDir`.
 *
 * Model choice: `LOW-fast` is sufficient — mupdf output is already clean GFM;
 * the LLM only normalises headings and removes repeated headers/footers.
 * Use `MEDIUM-fast` for complex layouts that need heavier restructuring.
 *
 * @param outputDir - Directory for the output `.md` file.
 * @param pdf - Absolute path to the PDF file.
 * @param matter - Document metadata; defaults derived from the filename.
 * @param model - LLM model alias (default: `'LOW-fast'`).
 * @param engine - Extraction backend (default: `'mupdf'`).
 * @returns `{ markdown, outputPath }` — frontmatter-prefixed markdown and output path.
 */
|
|
587
|
+
async function pdf2markdown(outputDir, pdf, matter, model = 'LOW-fast', engine = 'mupdf') {
    // Metadata defaults: service falls back to 'unknown', title to the filename stem.
    const service = matter?.service || 'unknown';
    const title = matter?.title || path_1.default.basename(pdf, path_1.default.extname(pdf));
    const frontMatter = {
        title,
        service,
        author: matter?.author || '',
        role: matter?.role || 'rule',
    };
    // Output file name: `<service-slug>-<title-slug>.md` inside outputDir.
    const serviceSlug = (0, utils_1.toSlug)(service.toLowerCase());
    const titleSlug = (0, utils_1.toSlug)(title);
    const outputPath = path_1.default.join(outputDir, `${serviceSlug}-${titleSlug}.md`);
    try {
        //
        // 1. Extract pages (GFM text, tables, per-page link footer)
        let pages;
        if (engine === 'mupdf') {
            pages = await pdftotext_mupdf(pdf);
        }
        else {
            pages = await pdftotext_poppler(pdf, outputDir);
        }
        //
        // 2. LLM: format each page as clean Markdown (no frontmatter inside)
        const llmResult = await callLLMForParsingPDF(pdf, pages, [], model);
        //
        // 3. Prepend single YAML frontmatter and write
        const markdown = (0, utils_matter_1.matterSerialize)(llmResult.markdown, frontMatter);
        fs_1.default.writeFileSync(outputPath, markdown);
        return { markdown, outputPath };
    }
    catch (error) {
        console.error('Error during PDF parsing:', error);
        throw error;
    }
}
|