agentic-api 2.0.684 → 2.0.885
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/agents/prompts.d.ts +2 -3
- package/dist/src/agents/prompts.js +13 -109
- package/dist/src/agents/reducer.loaders.d.ts +46 -15
- package/dist/src/agents/reducer.loaders.js +76 -21
- package/dist/src/agents/reducer.types.d.ts +30 -3
- package/dist/src/agents/simulator.d.ts +3 -2
- package/dist/src/agents/simulator.executor.d.ts +8 -2
- package/dist/src/agents/simulator.executor.js +62 -26
- package/dist/src/agents/simulator.js +100 -11
- package/dist/src/agents/simulator.prompts.d.ts +48 -21
- package/dist/src/agents/simulator.prompts.js +289 -122
- package/dist/src/agents/simulator.types.d.ts +33 -1
- package/dist/src/agents/subagent.d.ts +128 -0
- package/dist/src/agents/subagent.js +231 -0
- package/dist/src/agents/worker.executor.d.ts +48 -0
- package/dist/src/agents/worker.executor.js +152 -0
- package/dist/src/execute/helpers.d.ts +3 -0
- package/dist/src/execute/helpers.js +221 -15
- package/dist/src/execute/responses.js +78 -51
- package/dist/src/execute/shared.d.ts +5 -0
- package/dist/src/execute/shared.js +27 -0
- package/dist/src/index.d.ts +2 -1
- package/dist/src/index.js +3 -1
- package/dist/src/llm/openai.js +8 -1
- package/dist/src/llm/pricing.js +2 -0
- package/dist/src/llm/xai.js +11 -6
- package/dist/src/prompts.d.ts +14 -0
- package/dist/src/prompts.js +41 -1
- package/dist/src/rag/rag.manager.d.ts +18 -3
- package/dist/src/rag/rag.manager.js +91 -5
- package/dist/src/rules/git/git.e2e.helper.js +3 -0
- package/dist/src/rules/git/git.health.js +88 -57
- package/dist/src/rules/git/index.d.ts +1 -1
- package/dist/src/rules/git/index.js +13 -5
- package/dist/src/rules/git/repo.d.ts +25 -6
- package/dist/src/rules/git/repo.js +430 -146
- package/dist/src/rules/git/repo.pr.js +45 -13
- package/dist/src/rules/git/repo.tools.d.ts +5 -0
- package/dist/src/rules/git/repo.tools.js +6 -1
- package/dist/src/rules/types.d.ts +0 -2
- package/dist/src/rules/utils.matter.js +1 -5
- package/dist/src/scrapper.d.ts +138 -25
- package/dist/src/scrapper.js +538 -160
- package/dist/src/stategraph/stategraph.d.ts +4 -0
- package/dist/src/stategraph/stategraph.js +16 -0
- package/dist/src/stategraph/types.d.ts +13 -1
- package/dist/src/types.d.ts +21 -0
- package/dist/src/utils.d.ts +24 -0
- package/dist/src/utils.js +84 -86
- package/package.json +3 -2
- package/dist/src/agents/semantic.d.ts +0 -4
- package/dist/src/agents/semantic.js +0 -19
- package/dist/src/execute/legacy.d.ts +0 -46
- package/dist/src/execute/legacy.js +0 -460
- package/dist/src/pricing.llm.d.ts +0 -5
- package/dist/src/pricing.llm.js +0 -14
|
@@ -50,6 +50,7 @@ const path_1 = require("path");
|
|
|
50
50
|
const fs = __importStar(require("fs/promises"));
|
|
51
51
|
const repo_tools_1 = require("./repo.tools");
|
|
52
52
|
const repo_1 = require("./repo");
|
|
53
|
+
const utils_matter_1 = require("../utils.matter");
|
|
53
54
|
/**
|
|
54
55
|
* Synchronise une branche PR avec son mergeBase pour corriger les références orphelines
|
|
55
56
|
*
|
|
@@ -154,9 +155,12 @@ async function gitSyncPR(git, branch, user) {
|
|
|
154
155
|
throw new errors_1.GitOperationError(`Failed to sync PR ${branch} with merge base ${mergeBase}: ${mergeError}`, 'pr_sync', { branch, mergeBase, mergeError, fallbackError });
|
|
155
156
|
}
|
|
156
157
|
}
|
|
157
|
-
// Préserver les fichiers originaux de la PR (ne pas recalculer après merge)
|
|
158
|
-
//
|
|
159
|
-
//
|
|
158
|
+
// Préserver les fichiers originaux de la PR (ne pas recalculer après merge).
|
|
159
|
+
// NOTE IMPORTANTE:
|
|
160
|
+
// - gitSyncPR fait un merge technique pour maintenir la branche de validation à jour.
|
|
161
|
+
// - Ce merge NE DOIT PAS redéfinir le scope métier de la PR.
|
|
162
|
+
// - metadata.files reste la source de vérité (pilotée par add/edit/rename/delete).
|
|
163
|
+
// - Un diff Git post-merge peut être ambigu (base mouvante, merge commit, refs).
|
|
160
164
|
const updatedMetadata = {
|
|
161
165
|
...pr.metadata,
|
|
162
166
|
files: pr.metadata.files, // Préserver les fichiers originaux
|
|
@@ -236,13 +240,33 @@ async function gitIsPRClosedRobust(git, branch, config) {
|
|
|
236
240
|
async function gitGetPRMetadata(git, branch, config) {
|
|
237
241
|
const gitConf = (0, repo_tools_1.gitLoad)(config);
|
|
238
242
|
try {
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
243
|
+
const expectedPRId = Number.parseInt(branch.replace(gitConf.validationPrefix, ''), 10);
|
|
244
|
+
// D'abord essayer de chercher rapidement sur HEAD / historique récent.
|
|
245
|
+
const metadata = await (0, repo_tools_1.gitReadNote)(git, branch, gitConf.gitNotes.namespace, 20);
|
|
246
|
+
if (metadata) {
|
|
247
|
+
// Si ce n'est pas une branche de validation standard, on retourne tel quel.
|
|
248
|
+
if (Number.isNaN(expectedPRId)) {
|
|
249
|
+
return metadata;
|
|
250
|
+
}
|
|
251
|
+
// Branche de validation: rejeter une note dont l'ID ne correspond pas.
|
|
252
|
+
if (metadata.id === expectedPRId) {
|
|
253
|
+
return metadata;
|
|
254
|
+
}
|
|
255
|
+
if (gitConf.verbose) {
|
|
256
|
+
console.warn(`⚠️ gitGetPRMetadata(${branch}): note récente id=${metadata.id}, attendu=${expectedPRId}. Recherche approfondie...`);
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
// Fallback robuste: scanner plus large et ne retenir que la note du bon PR.
|
|
260
|
+
if (!Number.isNaN(expectedPRId)) {
|
|
261
|
+
const log = await git.log(['-n', '200', branch]);
|
|
262
|
+
for (const commit of log.all) {
|
|
263
|
+
const note = await (0, repo_tools_1.gitReadNote)(git, commit.hash, gitConf.gitNotes.namespace, 1);
|
|
264
|
+
if (note && note.id === expectedPRId) {
|
|
265
|
+
return note;
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
return null;
|
|
246
270
|
}
|
|
247
271
|
catch (error) {
|
|
248
272
|
// This typically happens if the branch doesn't exist, which is a valid case.
|
|
@@ -323,12 +347,13 @@ async function gitGetClosedPRs(git, gitConfig) {
|
|
|
323
347
|
}
|
|
324
348
|
async function gitLoadPR(git, branch) {
|
|
325
349
|
try {
|
|
350
|
+
const gitConf = (0, repo_tools_1.gitLoad)();
|
|
326
351
|
// Load metadata from
|
|
327
352
|
const metadata = await gitGetPRMetadata(git, branch);
|
|
328
353
|
if (!metadata) {
|
|
329
354
|
throw new errors_1.GitOperationError(`PR not found for branch ${branch}`, 'pr_load', { branch });
|
|
330
355
|
}
|
|
331
|
-
const files = metadata.files || (await (0, repo_tools_1.gitGetDiffFiles)(git, branch, metadata?.mergeBase, '.md'));
|
|
356
|
+
const files = metadata.files || (await (0, repo_tools_1.gitGetDiffFiles)(git, branch, metadata?.mergeBase || gitConf.draftBranch, '.md'));
|
|
332
357
|
// Récupérer les infos du dernier commit de la branche
|
|
333
358
|
const log = await git.log({ from: branch, to: branch, maxCount: 1 });
|
|
334
359
|
const lastCommit = log.latest;
|
|
@@ -619,9 +644,16 @@ async function gitNewValidationRequest(git, files, description, author, options
|
|
|
619
644
|
if (!content[idx]) {
|
|
620
645
|
continue;
|
|
621
646
|
}
|
|
622
|
-
|
|
647
|
+
let fileContent = content[idx];
|
|
648
|
+
// Assurer l'ID documentaire lors de la création d'une PR avec contenus fournis.
|
|
649
|
+
if (gitConfig.withID !== false) {
|
|
650
|
+
const parsed = (0, utils_matter_1.matterParse)(fileContent);
|
|
651
|
+
parsed.matter = (0, repo_1.gitEnsureMatterID)(parsed.matter, gitConfig, newBranchName, files[idx]);
|
|
652
|
+
fileContent = (0, utils_matter_1.matterSerialize)(parsed.content, parsed.matter);
|
|
653
|
+
}
|
|
654
|
+
// console.log('writeFile',files[idx],fileContent);
|
|
623
655
|
// This writeFile is in-memory and stages the file.
|
|
624
|
-
await _writeFileAndCommit(git, files[idx],
|
|
656
|
+
await _writeFileAndCommit(git, files[idx], fileContent, author, gitConfig, initialCommitMessage);
|
|
625
657
|
}
|
|
626
658
|
const metadata = {
|
|
627
659
|
id: nextID,
|
|
@@ -123,6 +123,11 @@ export declare function gitGetAllBranches(git: SimpleGit, options?: {
|
|
|
123
123
|
* @param baseBranch Branche de base (par défaut: main)
|
|
124
124
|
* @param filter Filtre optionnel (ex: '.md')
|
|
125
125
|
* @returns Liste des fichiers modifiés
|
|
126
|
+
*
|
|
127
|
+
* NOTE IMPORTANTE:
|
|
128
|
+
* - Ce résultat est une vue TECHNIQUE du diff Git (commits/références), pas une vérité métier PR.
|
|
129
|
+
* - Ne pas utiliser seul pour reconstruire metadata.files d'une PR.
|
|
130
|
+
* - La source de vérité métier reste metadata.files maintenu par add/edit/rename/delete.
|
|
126
131
|
*/
|
|
127
132
|
export declare function gitGetDiffFiles(git: SimpleGit, targetBranch: string, baseBranch?: string, filter?: string): Promise<string[]>;
|
|
128
133
|
/**
|
|
@@ -129,7 +129,7 @@ function gitLoad(defaultConfig) {
|
|
|
129
129
|
}
|
|
130
130
|
// console.log(`🌶️ gitLoad: First Loading git config`,defaultConfig);
|
|
131
131
|
const verbose = defaultConfig?.verbose || process.env.GIT_VERBOSE === 'true';
|
|
132
|
-
const remoteUrl = defaultConfig?.remoteUrl
|
|
132
|
+
const remoteUrl = defaultConfig?.remoteUrl ?? process.env.GIT_REMOTE_URL;
|
|
133
133
|
const repoPath = defaultConfig?.repoPath || process.env.GIT_REPO_PATH;
|
|
134
134
|
const uploadPath = defaultConfig?.uploadPath || process.env.GIT_UPLOAD_PATH;
|
|
135
135
|
const draftBranch = defaultConfig?.draftBranch || process.env.DEFAULT_BRANCH_DRAFT;
|
|
@@ -625,6 +625,11 @@ async function gitGetAllBranches(git, options = {}) {
|
|
|
625
625
|
* @param baseBranch Branche de base (par défaut: main)
|
|
626
626
|
* @param filter Filtre optionnel (ex: '.md')
|
|
627
627
|
* @returns Liste des fichiers modifiés
|
|
628
|
+
*
|
|
629
|
+
* NOTE IMPORTANTE:
|
|
630
|
+
* - Ce résultat est une vue TECHNIQUE du diff Git (commits/références), pas une vérité métier PR.
|
|
631
|
+
* - Ne pas utiliser seul pour reconstruire metadata.files d'une PR.
|
|
632
|
+
* - La source de vérité métier reste metadata.files maintenu par add/edit/rename/delete.
|
|
628
633
|
*/
|
|
629
634
|
async function gitGetDiffFiles(git, targetBranch, baseBranch, filter) {
|
|
630
635
|
const gitConfig = gitLoad();
|
|
@@ -122,8 +122,6 @@ export interface FrontMatter {
|
|
|
122
122
|
id?: number;
|
|
123
123
|
/** Titre descriptif de la règle */
|
|
124
124
|
title: string;
|
|
125
|
-
/** FIXME(oldfile): champ legacy transitoire pour notification UI rename */
|
|
126
|
-
oldfile?: string;
|
|
127
125
|
/** Auteur original de la règle (format git: "Name <email>") */
|
|
128
126
|
author?: string;
|
|
129
127
|
/** Email du validateur assigné à cette règle */
|
|
@@ -121,13 +121,9 @@ function matterParse(markdown) {
|
|
|
121
121
|
* ✅ NOUVELLE FONCTION pour reconstruire le contenu complet
|
|
122
122
|
*/
|
|
123
123
|
function matterSerializeFromRule(rule) {
|
|
124
|
-
|
|
125
|
-
const matter = { ...rule.matter };
|
|
126
|
-
delete matter.oldfile;
|
|
127
|
-
return matterSerialize(rule.content, matter);
|
|
124
|
+
return matterSerialize(rule.content, rule.matter);
|
|
128
125
|
}
|
|
129
126
|
function matterSerialize(content, matter) {
|
|
130
|
-
// Créer un objet propre pour le front-matter (exclure oldfile)
|
|
131
127
|
const cleanMatter = { ...matter };
|
|
132
128
|
const result = Object.keys(cleanMatter).reduce((acc, key) => {
|
|
133
129
|
const value = cleanMatter[key];
|
package/dist/src/scrapper.d.ts
CHANGED
|
@@ -1,48 +1,161 @@
|
|
|
1
1
|
import { FrontMatter } from "./rules/types";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
2
|
+
/** Raw image data extracted from a PDF page. */
|
|
3
|
+
export interface PageImage {
|
|
4
|
+
/** Raw pixel buffer: RGBA, RGB, grayscale bytes, or JPEG-encoded bytes. */
|
|
5
|
+
data: Buffer;
|
|
6
|
+
/** Pixel format / encoding of the buffer content. */
|
|
7
|
+
type: 'jpeg' | 'rgb' | 'rgba' | 'grayscale';
|
|
8
|
+
width: number;
|
|
9
|
+
height: number;
|
|
10
|
+
}
|
|
11
|
+
/** Dimensions of a single GFM table detected on a page. */
|
|
12
|
+
export interface PageTable {
|
|
13
|
+
/** Number of data rows (header and separator lines excluded). */
|
|
14
|
+
rows: number;
|
|
15
|
+
/** Number of columns inferred from the header line. */
|
|
16
|
+
cols: number;
|
|
17
|
+
}
|
|
18
|
+
/** Structured representation of a single PDF page. */
|
|
19
|
+
export interface Page {
|
|
20
|
+
pageNumber: number;
|
|
21
|
+
/** Cleaned body text, with reconstructed GFM tables. Running headers/footers removed. */
|
|
22
|
+
text: string;
|
|
23
|
+
/**
|
|
24
|
+
* Running page header detected across ≥ 3 consecutive pages (e.g. chapter title,
|
|
25
|
+
* magazine section name). Undefined for the poppler engine or single-page PDFs.
|
|
26
|
+
*/
|
|
27
|
+
header?: string;
|
|
28
|
+
/**
|
|
29
|
+
* Running page footer detected across ≥ 3 consecutive pages (e.g. folio number,
|
|
30
|
+
* document title). Undefined for the poppler engine or single-page PDFs.
|
|
31
|
+
*/
|
|
32
|
+
footer?: string;
|
|
33
|
+
/**
|
|
34
|
+
* Dimensions of each GFM table found in `text`.
|
|
35
|
+
* Used by `callLLMForParsingPDF` to select an appropriate model:
|
|
36
|
+
* pages with wide (cols > 3) or long (rows > 10) tables are upgraded
|
|
37
|
+
* from `LOW-fast` to `MEDIUM-fast` automatically.
|
|
38
|
+
*/
|
|
39
|
+
tables: PageTable[];
|
|
40
|
+
/** Images extracted from the page. Always empty for the poppler engine. */
|
|
41
|
+
images: PageImage[];
|
|
42
|
+
}
|
|
43
|
+
/** Extraction backend selection. */
|
|
44
|
+
export type PdftotextEngine = 'poppler' | 'mupdf';
|
|
6
45
|
/**
|
|
7
|
-
*
|
|
46
|
+
* Converts extracted PDF content to clean Markdown via LLM.
|
|
8
47
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
48
|
+
* Two paths depending on the `pdfData` type:
|
|
49
|
+
*
|
|
50
|
+
* **`Page[]` (mupdf path)** — `MapLLM.reduce`, one page per chunk.
|
|
51
|
+
* Each page is processed by `mupdfPagePrompt` (heading normalisation, broken-cell
|
|
52
|
+
* fusion, repeated-header removal). No frontmatter is added here; the caller
|
|
53
|
+
* (`pdf2markdown`) prepends the single YAML block.
|
|
54
|
+
*
|
|
55
|
+
* NOTE: `finalReduce` is intentionally disabled — it is reserved for a future
|
|
56
|
+
* "N-page light summary" feature where a second LLM pass synthesises the whole
|
|
57
|
+
* document into a shorter version.
|
|
58
|
+
*
|
|
59
|
+
* A raw `string` (e.g. from `html2markdown`) is automatically wrapped into a
|
|
60
|
+
* single `Page` so both callers share the exact same code path.
|
|
61
|
+
*
|
|
62
|
+
* @param inputfile - Original file path (used for logging only).
|
|
63
|
+
* @param pdfData - Either a `Page[]` array (mupdf) or a raw string.
|
|
64
|
+
* @param links - External links appended as `## Liens` footer (string path).
|
|
65
|
+
* @param model - LLM model alias (default: `'MEDIUM-fast'`).
|
|
14
66
|
*/
|
|
15
|
-
export declare function callLLMForParsingPDF(inputfile: string, pdfData:
|
|
67
|
+
export declare function callLLMForParsingPDF(inputfile: string, pdfData: Page[] | string, links?: {
|
|
68
|
+
text: string;
|
|
69
|
+
href: string;
|
|
70
|
+
}[], model?: string): Promise<{
|
|
16
71
|
markdown: string;
|
|
17
72
|
cost: number;
|
|
18
73
|
}>;
|
|
19
74
|
/**
|
|
20
|
-
* Parses an HTML file and converts it to markdown using
|
|
75
|
+
* Parses an HTML file and converts it to markdown using LLM.
|
|
21
76
|
*
|
|
22
77
|
* @param {string} output - The directory path where the output markdown file will be saved.
|
|
23
78
|
* @param {string} file - The path to the HTML file to be parsed.
|
|
24
79
|
* @param {string} service - The service name used as part of the output filename output.
|
|
25
80
|
* @param {string} model - The model to use for parsing (default: "MEDIUM-fast")
|
|
26
|
-
* @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the
|
|
81
|
+
* @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the API call.
|
|
27
82
|
*/
|
|
28
83
|
export declare function html2markdown(output: string, file: string, service: string, model?: string): Promise<{
|
|
29
84
|
markdown: string;
|
|
30
85
|
cost: number;
|
|
31
86
|
}>;
|
|
32
87
|
/**
|
|
33
|
-
*
|
|
34
|
-
*
|
|
35
|
-
*
|
|
36
|
-
*
|
|
37
|
-
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
40
|
-
*
|
|
41
|
-
*
|
|
42
|
-
*
|
|
88
|
+
* Extracts plain text from a PDF using the system `pdftotext` binary (poppler-utils).
|
|
89
|
+
*
|
|
90
|
+
* - Pages are delimited by form-feed (\f) characters in the binary's output.
|
|
91
|
+
* - Excessive blank lines are normalised (3+ → 2).
|
|
92
|
+
* - Images are NOT extracted (always []).
|
|
93
|
+
*
|
|
94
|
+
* NOTE: Better alternative is `pdftotext_pdfjs` which uses Mozilla's PDF engine
|
|
95
|
+
* to extract text + images + links in a single Node.js-native pass, with better
|
|
96
|
+
* table reconstruction for complex layouts. See `pdftotext_pdfjs` for details.
|
|
97
|
+
*
|
|
98
|
+
* @param {string} pdfPath - Absolute path to the PDF file.
|
|
99
|
+
* @param {string} outputDir - Directory used for temporary files.
|
|
100
|
+
* @returns {Promise<Page[]>} One `Page` per PDF page, text-only.
|
|
101
|
+
*/
|
|
102
|
+
export declare function pdftotext_poppler(pdfPath: string, outputDir: string): Promise<Page[]>;
|
|
103
|
+
/**
|
|
104
|
+
* Extracts text, reconstructed tables, links, and optionally page-raster images
|
|
105
|
+
* from a PDF using the **mupdf** npm package (WASM build of the MuPDF C library).
|
|
106
|
+
*
|
|
107
|
+
* Key advantages over the poppler engine:
|
|
108
|
+
* - `table-hunt` detects tables geometrically even in **untagged** PDFs.
|
|
109
|
+
* - `segment` splits the page into logical reading-order blocks.
|
|
110
|
+
* - Significantly faster than pdfjs for large documents.
|
|
111
|
+
* - No shell binary dependency (pure WASM, runs anywhere Node.js does).
|
|
112
|
+
*
|
|
113
|
+
* Images (opt-in via `withImages: true`): each page is rasterised at 1.5× scale
|
|
114
|
+
* (≈ 113 DPI). The `imageFormat` option controls encoding:
|
|
115
|
+
*
|
|
116
|
+
* | format | size/page (base64) | notes |
|
|
117
|
+
* |-------------|-------------------|--------------------------------|
|
|
118
|
+
* | `'rgb'` | ≈ 4.4 MB | raw RGB, lossless, large |
|
|
119
|
+
* | `'gray'` | ≈ 1.5 MB | raw grayscale, 3× smaller |
|
|
120
|
+
* | `'jpeg'` | ≈ 100–200 KB | JPEG quality 75, 31× smaller |
|
|
121
|
+
*
|
|
122
|
+
* Disabled by default because image data quickly exhausts stdout buffers for
|
|
123
|
+
* large documents. Use `jpeg` for production with vision models.
|
|
124
|
+
*
|
|
125
|
+
* NOTE: `mupdf` is ESM-only. Extraction is delegated to a standalone
|
|
126
|
+
* `mupdf-extract.mjs` worker spawned via `execAsync`, which avoids any
|
|
127
|
+
* ESM/CJS interoperability issues in the main process and under ts-jest.
|
|
128
|
+
*
|
|
129
|
+
* @param {string} pdfPath - Absolute path to the PDF file.
|
|
130
|
+
* @param {object} [options]
|
|
131
|
+
* @param {boolean} [options.withImages=false] - Rasterise each page.
|
|
132
|
+
* @param {'rgb'|'gray'|'jpeg'} [options.imageFormat='rgb'] - Pixel encoding.
|
|
133
|
+
* @returns {Promise<Page[]>} One `Page` per PDF page with text, GFM tables, and optional images.
|
|
43
134
|
*/
|
|
44
|
-
export declare function
|
|
135
|
+
export declare function pdftotext_mupdf(pdfPath: string, options?: {
|
|
136
|
+
withImages?: boolean;
|
|
137
|
+
imageFormat?: 'rgb' | 'gray' | 'jpeg';
|
|
138
|
+
}): Promise<Page[]>;
|
|
139
|
+
/**
|
|
140
|
+
* Converts a PDF to a structured Markdown file.
|
|
141
|
+
*
|
|
142
|
+
* Pipeline:
|
|
143
|
+
* 1. `pdftotext_mupdf` (or poppler) → `Page[]`
|
|
144
|
+
* 2. `callLLMForParsingPDF` — MapLLM.reduce, one page per chunk
|
|
145
|
+
* 3. Prepend a **single** YAML frontmatter block and write to `outputDir`.
|
|
146
|
+
*
|
|
147
|
+
* Model choice: `LOW-fast` is sufficient — mupdf output is already clean GFM;
|
|
148
|
+
* the LLM only normalises headings and removes repeated headers/footers.
|
|
149
|
+
* Use `MEDIUM-fast` for complex layouts that need heavier restructuring.
|
|
150
|
+
*
|
|
151
|
+
* @param outputDir - Directory for the output `.md` file.
|
|
152
|
+
* @param pdf - Absolute path to the PDF file.
|
|
153
|
+
* @param matter - Document metadata; defaults derived from filename.
|
|
154
|
+
* @param model - LLM model alias (default: `'LOW-fast'`).
|
|
155
|
+
* @param engine - Extraction backend (default: `'mupdf'`).
|
|
156
|
+
* @returns `{ markdown, outputPath }` — frontmatter-prefixed markdown and output path.
|
|
157
|
+
*/
|
|
158
|
+
export declare function pdf2markdown(outputDir: string, pdf: string, matter: FrontMatter | null, model?: string, engine?: PdftotextEngine): Promise<{
|
|
45
159
|
markdown: string;
|
|
46
|
-
cost: number;
|
|
47
160
|
outputPath: string;
|
|
48
161
|
}>;
|