agentic-api 2.0.646 → 2.0.885

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/dist/src/agents/prompts.d.ts +2 -3
  2. package/dist/src/agents/prompts.js +21 -118
  3. package/dist/src/agents/reducer.loaders.d.ts +103 -1
  4. package/dist/src/agents/reducer.loaders.js +164 -2
  5. package/dist/src/agents/reducer.types.d.ts +34 -3
  6. package/dist/src/agents/simulator.d.ts +32 -2
  7. package/dist/src/agents/simulator.executor.d.ts +15 -5
  8. package/dist/src/agents/simulator.executor.js +134 -67
  9. package/dist/src/agents/simulator.js +251 -8
  10. package/dist/src/agents/simulator.prompts.d.ts +55 -10
  11. package/dist/src/agents/simulator.prompts.js +305 -61
  12. package/dist/src/agents/simulator.types.d.ts +62 -1
  13. package/dist/src/agents/simulator.types.js +5 -0
  14. package/dist/src/agents/subagent.d.ts +128 -0
  15. package/dist/src/agents/subagent.js +231 -0
  16. package/dist/src/agents/worker.executor.d.ts +48 -0
  17. package/dist/src/agents/worker.executor.js +152 -0
  18. package/dist/src/execute/helpers.d.ts +3 -0
  19. package/dist/src/execute/helpers.js +222 -16
  20. package/dist/src/execute/responses.js +81 -55
  21. package/dist/src/execute/shared.d.ts +5 -0
  22. package/dist/src/execute/shared.js +27 -0
  23. package/dist/src/index.d.ts +2 -1
  24. package/dist/src/index.js +3 -1
  25. package/dist/src/llm/openai.js +8 -1
  26. package/dist/src/llm/pricing.js +2 -0
  27. package/dist/src/llm/xai.js +11 -6
  28. package/dist/src/prompts.d.ts +14 -0
  29. package/dist/src/prompts.js +41 -1
  30. package/dist/src/rag/rag.manager.d.ts +18 -3
  31. package/dist/src/rag/rag.manager.js +114 -12
  32. package/dist/src/rag/types.d.ts +3 -1
  33. package/dist/src/rules/git/git.e2e.helper.js +51 -4
  34. package/dist/src/rules/git/git.health.js +89 -56
  35. package/dist/src/rules/git/index.d.ts +2 -2
  36. package/dist/src/rules/git/index.js +22 -5
  37. package/dist/src/rules/git/repo.d.ts +64 -6
  38. package/dist/src/rules/git/repo.js +572 -141
  39. package/dist/src/rules/git/repo.pr.d.ts +11 -18
  40. package/dist/src/rules/git/repo.pr.js +82 -94
  41. package/dist/src/rules/git/repo.tools.d.ts +5 -0
  42. package/dist/src/rules/git/repo.tools.js +6 -1
  43. package/dist/src/rules/types.d.ts +0 -2
  44. package/dist/src/rules/utils.matter.js +1 -5
  45. package/dist/src/scrapper.d.ts +138 -25
  46. package/dist/src/scrapper.js +538 -160
  47. package/dist/src/stategraph/stategraph.d.ts +6 -2
  48. package/dist/src/stategraph/stategraph.js +21 -6
  49. package/dist/src/stategraph/types.d.ts +14 -6
  50. package/dist/src/types.d.ts +22 -0
  51. package/dist/src/utils.d.ts +24 -0
  52. package/dist/src/utils.js +84 -86
  53. package/package.json +3 -2
  54. package/dist/src/agents/semantic.d.ts +0 -4
  55. package/dist/src/agents/semantic.js +0 -19
  56. package/dist/src/execute/legacy.d.ts +0 -46
  57. package/dist/src/execute/legacy.js +0 -460
  58. package/dist/src/pricing.llm.d.ts +0 -5
  59. package/dist/src/pricing.llm.js +0 -14
@@ -3,9 +3,10 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
3
3
  return (mod && mod.__esModule) ? mod : { "default": mod };
4
4
  };
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
- exports.extractCaptcha = extractCaptcha;
7
6
  exports.callLLMForParsingPDF = callLLMForParsingPDF;
8
7
  exports.html2markdown = html2markdown;
8
+ exports.pdftotext_poppler = pdftotext_poppler;
9
+ exports.pdftotext_mupdf = pdftotext_mupdf;
9
10
  exports.pdf2markdown = pdf2markdown;
10
11
  const child_process_1 = require("child_process");
11
12
  const util_1 = require("util");
@@ -13,133 +14,351 @@ const path_1 = __importDefault(require("path"));
13
14
  const fs_1 = __importDefault(require("fs"));
14
15
  const jsdom_1 = require("jsdom");
15
16
  const readability_1 = require("@mozilla/readability");
16
- const pricing_1 = require("./llm/pricing");
17
17
  const prompts_1 = require("./prompts");
18
18
  const utils_1 = require("./utils");
19
- const execute_1 = require("./execute");
20
19
  const utils_matter_1 = require("./rules/utils.matter");
21
- // Promisify exec for easier async/await usage
20
+ const reducer_core_1 = require("./agents/reducer.core");
22
21
  const execAsync = (0, util_1.promisify)(child_process_1.exec);
23
- const execFileAsync = (0, util_1.promisify)(child_process_1.execFile);
24
22
  const randomFile = (ext = '') => {
25
23
  const random = () => Math.random() * 1000 | 0;
26
24
  return `temp-${random()}-${random()}${ext}`;
27
25
  };
28
- async function extractCaptcha(base64Image, openai) {
29
- const content = [
30
- { type: 'text', text: "Extrais uniquement le nombre" },
31
- { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${base64Image}` } },
32
- ];
33
- // Cost per captcha $0.0000696
34
- const model = "gpt-4.1";
35
- const response = await openai.chat.completions.create({
36
- model,
37
- messages: [{ role: "user", content }],
38
- max_completion_tokens: 50,
39
- });
40
- const cost = (0, pricing_1.calculateCost)(model, response.usage);
41
- // Récupérer la réponse markdown
42
- const number = response.choices[0].message.content;
43
- return { number, cost };
26
+ /** Formats rows of cell strings as a GFM markdown table. */
27
+ function gfmTable(rows) {
28
+ if (rows.length === 0)
29
+ return '';
30
+ const maxCols = Math.max(...rows.map(r => r.length));
31
+ const pad = (r) => [...r, ...new Array(maxCols - r.length).fill('')];
32
+ const fmt = (cells) => `| ${cells.join(' | ')} |`;
33
+ const [header, ...body] = rows.map(pad);
34
+ return [fmt(header), fmt(header.map(() => '---')), ...body.map(fmt)].join('\n');
35
+ }
36
+ // ─── Geometric text reconstruction ────────────────────────────────────────────
37
+ /** Groups text items into rows by proximity on the Y axis (top-to-bottom). */
38
+ function groupByY(items, tolerance = 3) {
39
+ const buckets = new Map();
40
+ for (const item of items) {
41
+ const key = [...buckets.keys()].find(k => Math.abs(k - item.y) <= tolerance);
42
+ if (key !== undefined) {
43
+ buckets.get(key).push(item);
44
+ }
45
+ else {
46
+ buckets.set(item.y, [item]);
47
+ }
48
+ }
49
+ return [...buckets.entries()]
50
+ .sort(([ya], [yb]) => yb - ya)
51
+ .map(([y, rowItems]) => ({ y, items: rowItems.sort((a, b) => a.x - b.x) }));
52
+ }
53
+ function detectColumnBoundaries(rows, gap = 15) {
54
+ const allX = rows.flatMap(r => r.items.map(i => i.x)).sort((a, b) => a - b);
55
+ if (allX.length === 0)
56
+ return [];
57
+ const cols = [allX[0]];
58
+ for (let i = 1; i < allX.length; i++) {
59
+ if (allX[i] - allX[i - 1] > gap)
60
+ cols.push(allX[i]);
61
+ }
62
+ return cols;
63
+ }
64
+ function assignToColumns(row, cols) {
65
+ const cells = new Array(cols.length).fill('');
66
+ for (const item of row.items) {
67
+ let colIdx = 0, minDist = Math.abs(item.x - cols[0]);
68
+ for (let i = 1; i < cols.length; i++) {
69
+ const dist = Math.abs(item.x - cols[i]);
70
+ if (dist < minDist) {
71
+ minDist = dist;
72
+ colIdx = i;
73
+ }
74
+ }
75
+ cells[colIdx] = cells[colIdx] ? `${cells[colIdx]} ${item.str}` : item.str;
76
+ }
77
+ return cells;
44
78
  }
45
79
  /**
46
- * Calls GPT to parse a PDF file and convert it to markdown format.
80
+ * Reconstructs page text **block by block** and splits it into three zones.
47
81
  *
48
- * @param {string} inputfile - The name of the PDF file being processed
49
- * @param {any} pdfData - The extracted content from the PDF file
50
- * @param {any[]} links - Optional array of links extracted from the PDF to be integrated into the markdown
51
- * @param {string} model - The model to use for parsing (default: "MEDIUM-fast")
52
- * @returns {Promise<{markdown: string, cost: number}>} - The parsed markdown content and the cost of the API call
82
+ * mupdf's `segment` option places each column / region in its own top-level
83
+ * block. Processing blocks independently prevents multi-column layouts from
84
+ * being interleaved into false GFM tables.
85
+ *
86
+ * Zone classification by median Y of the block's items vs `pageHeight`:
87
+ * - **header zone** : medianY / pageHeight < 0.12
88
+ * - **footer zone** : medianY / pageHeight > 0.88
89
+ * - **body** : everything else
90
+ *
91
+ * When `pageHeight` is 0 (unknown) every block is treated as body.
53
92
  */
54
- async function callLLMForParsingPDF(inputfile, pdfData, links = [], model = "MEDIUM-fast") {
55
- // Convertir le contenu en chaîne de caractères (attention à la taille potentielle !)
56
- const pdfDataAsString = JSON.stringify(pdfData, null, 2);
57
- // Format: YYYY-MM-DD
58
- const today = new Date().toISOString().substring(0, 10);
59
- const linkLabel = 'Voici une liste de liens que tu dois intègrer directement dans le texte si, et seulement si, celui-ci correspond précisément à un texte existant dans le document.';
60
- const linkPrefix = linkLabel + links.reduce((acc, link) => {
61
- return acc + `- [${link.text}](${link.href})\n`;
62
- }, '');
63
- // Créer le prompt pour décrire la tâche au LLM
64
- const messages = [
65
- { role: "system",
66
- content: prompts_1.htmlToMarkdownPrompt }
67
- ];
68
- // console.log('🌶️ DEBUG: callLLMForParsingPDF -- SYSTEM:', messages[0].content);
69
- // console.log('🌶️ DEBUG: callLLMForParsingPDF -- USER:', messages[1].content);
70
- // WARNING: o3-mini is buggy with "Marche à suivre nouveau bail.pdf"
71
- const response = await (0, execute_1.executeQuery)({
72
- query: `Structure le contenu exhaustif en Markdown sans rien inventer, et avec les liens intégrés correctement.\n Nous sommes le ${today}.\n${linkPrefix}\nLe contenu du document est:\n${pdfDataAsString}`,
73
- model,
74
- messages,
75
- stdout: execute_1.DummyWritable,
76
- verbose: false
77
- });
78
- // Récupérer la réponse markdown
79
- const markdown = response.content;
80
- console.log(`Markdown 💰 cost: ${response.usage.cost}`);
93
+ function reconstructFromBlocks(parsed, pageHeight = 0) {
94
+ const empty = { header: '', body: '', footer: '' };
95
+ const root = parsed;
96
+ if (!root || !Array.isArray(root.blocks) || root.blocks.length === 0) {
97
+ const items = flattenMupdfTextItems(parsed);
98
+ return { ...empty, body: reconstructPageText(groupByY(items).reverse()) };
99
+ }
100
+ const headerParts = [];
101
+ const bodyParts = [];
102
+ const footerParts = [];
81
103
  //
82
- // add a regex to extract the markdown content between <thinking></thinking> tags
83
- const markdownWithoutThinking = markdown.replace(/<thinking>[\s\S]*?<\/thinking>/g, '');
84
- return { markdown: markdownWithoutThinking, cost: response.usage.cost };
104
+ // Classify at item level (not block level) so that a single large block
105
+ // spanning the full page (common in magazine layouts) is still split
106
+ // into its header/body/footer zones correctly.
107
+ // mupdf Y-from-top: small Y = top of page.
108
+ // header zone : Y / pageHeight < 0.12
109
+ // footer zone : Y / pageHeight > 0.88
110
+ // body : everything else
111
+ const headerItems = [];
112
+ const footerItems = [];
113
+ for (const block of root.blocks) {
114
+ const items = flattenMupdfTextItems(block);
115
+ if (items.length === 0)
116
+ continue;
117
+ if (pageHeight > 0) {
118
+ const hItems = items.filter(i => i.y / pageHeight < 0.12);
119
+ const fItems = items.filter(i => i.y / pageHeight > 0.88);
120
+ const bItems = items.filter(i => { const r = i.y / pageHeight; return r >= 0.12 && r <= 0.88; });
121
+ headerItems.push(...hItems);
122
+ footerItems.push(...fItems);
123
+ if (bItems.length > 0) {
124
+ const text = reconstructPageText(groupByY(bItems).reverse());
125
+ if (text)
126
+ bodyParts.push(text);
127
+ }
128
+ }
129
+ else {
130
+ const text = reconstructPageText(groupByY(items).reverse());
131
+ if (text)
132
+ bodyParts.push(text);
133
+ }
134
+ }
135
+ if (headerItems.length > 0) {
136
+ const text = reconstructPageText(groupByY(headerItems).reverse());
137
+ if (text)
138
+ headerParts.push(text);
139
+ }
140
+ if (footerItems.length > 0) {
141
+ const text = reconstructPageText(groupByY(footerItems).reverse());
142
+ if (text)
143
+ footerParts.push(text);
144
+ }
145
+ return {
146
+ header: headerParts.join('\n').trim(),
147
+ body: bodyParts.join('\n\n').trim(),
148
+ footer: footerParts.join('\n').trim(),
149
+ };
85
150
  }
86
151
  /**
87
- * Extracts hyperlinks from a PDF file by converting it to HTML and parsing the links.
152
+ * Promotes positional header/footer candidates to `page.header` / `page.footer`
153
+ * only when the same normalised pattern appears on **≥ 3 pages**.
88
154
  *
89
- * @param {string} pdfPath - The file path to the PDF document to extract links from
90
- * @param {string} output - The directory output where temporary files will be created
91
- * @returns {Promise<Array<{text: string, href: string}>>} - A promise that resolves to an array of link objects,
92
- * each containing the link text and href attributes
155
+ * Numbers are normalised (`\d{2,4}` `{N}`) so that incrementing folios
156
+ * (`057`, `058`, `059`) map to the same pattern.
157
+ *
158
+ * False positives (unique first-page titles that happen to sit in the top zone)
159
+ * are re-injected into `page.text` so nothing is silently lost.
93
160
  */
94
- async function extractLinksFromPDF(pdfPath, output) {
95
- const tempOut = path_1.default.join(output, `${randomFile()}`);
96
- try {
97
- // STEP 2: Convert the cleaned PDF to XML using pdftohtml.
98
- // La commande génère un fichier XML à partir du PDF nettoyé.
99
- const pdftohtmlCommand = `pdftohtml -s -nodrm -c "${pdfPath}" "${tempOut}"`;
100
- await execAsync(pdftohtmlCommand);
101
- const html = fs_1.default.readFileSync(tempOut + '-html.html', "utf8");
102
- const dom = new jsdom_1.JSDOM(html);
103
- const links = Array.from(dom.window.document.querySelectorAll('a')).map(link => ({
104
- text: link.textContent?.trim() || link.href,
105
- href: link.href
106
- }));
107
- process.stdout.write("Extracting links: " + links.length + " ");
108
- return links;
161
+ /**
162
+ * Parses GFM tables in a block of text and returns their dimensions.
163
+ *
164
+ * Used to populate `Page.tables` so that `callLLMForParsingPDF` can
165
+ * automatically select a stronger model for pages with complex tables
166
+ * (many columns or many rows).
167
+ */
168
+ function detectTableStats(text) {
169
+ const tables = [];
170
+ const lines = text.split('\n');
171
+ let tableLines = [];
172
+ const flush = () => {
173
+ if (tableLines.length < 2) {
174
+ tableLines = [];
175
+ return;
176
+ }
177
+ //
178
+ // Separator lines (e.g. `| --- | --- |`) are structural — exclude from row count
179
+ const SEPARATOR_RE = /^\|\s*[-:]+[\s|:-]*\|$/;
180
+ const dataLines = tableLines.filter(l => !SEPARATOR_RE.test(l.trim()));
181
+ const cols = (tableLines[0].match(/\|/g) ?? []).length - 1;
182
+ if (cols >= 2 && dataLines.length >= 1) {
183
+ tables.push({ rows: dataLines.length, cols });
184
+ }
185
+ tableLines = [];
186
+ };
187
+ for (const line of lines) {
188
+ if (/^\|/.test(line) && line.trim().endsWith('|')) {
189
+ tableLines.push(line);
190
+ }
191
+ else {
192
+ flush();
193
+ }
109
194
  }
110
- catch (error) {
111
- console.error('❌ Error extracting links from PDF:', error);
112
- return [];
195
+ flush();
196
+ return tables;
197
+ }
198
+ function detectRunningHeaders(pages) {
199
+ const normalize = (s) => s.replace(/\d{2,4}/g, '{N}').trim();
200
+ //
201
+ // Count how many pages share each normalised pattern
202
+ const headerCount = new Map();
203
+ const footerCount = new Map();
204
+ for (const p of pages) {
205
+ if (p._rawHeader)
206
+ headerCount.set(normalize(p._rawHeader), (headerCount.get(normalize(p._rawHeader)) ?? 0) + 1);
207
+ if (p._rawFooter)
208
+ footerCount.set(normalize(p._rawFooter), (footerCount.get(normalize(p._rawFooter)) ?? 0) + 1);
113
209
  }
114
- finally {
115
- if (fs_1.default.existsSync(tempOut + '-html.html')) {
116
- fs_1.default.unlinkSync(tempOut + '-html.html');
117
- // Clean up any PNG files that might have been generated
118
- const pngFiles = fs_1.default.readdirSync(path_1.default.dirname(tempOut))
119
- .filter(file => file.startsWith(path_1.default.basename(tempOut)) && file.endsWith('.png'));
120
- for (const pngFile of pngFiles) {
121
- const pngPath = path_1.default.join(path_1.default.dirname(tempOut), pngFile);
122
- if (fs_1.default.existsSync(pngPath))
123
- fs_1.default.unlinkSync(pngPath);
210
+ return pages.map(({ _rawHeader, _rawFooter, ...page }) => {
211
+ let text = page.text;
212
+ let header;
213
+ let footer;
214
+ if (_rawHeader) {
215
+ if ((headerCount.get(normalize(_rawHeader)) ?? 0) >= 3) {
216
+ header = _rawHeader;
217
+ }
218
+ else {
219
+ //
220
+ // Not a running header — keep in body text
221
+ text = `${_rawHeader}\n\n${text}`;
222
+ }
223
+ }
224
+ if (_rawFooter) {
225
+ if ((footerCount.get(normalize(_rawFooter)) ?? 0) >= 3) {
226
+ footer = _rawFooter;
227
+ }
228
+ else {
229
+ text = `${text}\n\n${_rawFooter}`;
124
230
  }
125
231
  }
232
+ return { ...page, text: text.trim(), header, footer };
233
+ });
234
+ }
235
+ /** Reconstructs plain text with heuristic table detection for untagged PDFs. */
236
+ function reconstructPageText(rows) {
237
+ const LIST_MARKER_RE = /^([●•◦▪▸✅✓✗►]|\d{1,3}\.?|[a-zA-Z]\.)$/;
238
+ const isListItem = (r) => r.items.length === 2 && (r.items[0].str.trim().length <= 3 || LIST_MARKER_RE.test(r.items[0].str.trim()));
239
+ const fmtTableRows = (rows2) => {
240
+ const cols = detectColumnBoundaries(rows2);
241
+ if (cols.length < 2)
242
+ return rows2.map(r => r.items.map(i => i.str).join(' ')).join('\n');
243
+ return gfmTable(rows2.map(r => assignToColumns(r, cols)));
244
+ };
245
+ const chunks = [];
246
+ let tableCandidate = [];
247
+ const flush = () => {
248
+ if (tableCandidate.length === 0)
249
+ return;
250
+ const realTable = tableCandidate.filter(r => !isListItem(r) && r.items.length >= 2);
251
+ chunks.push(realTable.length >= 2 ? fmtTableRows(tableCandidate) : tableCandidate.map(r => r.items.map(i => i.str).join(' ')).join('\n'));
252
+ tableCandidate = [];
253
+ };
254
+ for (const row of rows) {
255
+ if (row.items.length >= 2 && !isListItem(row)) {
256
+ tableCandidate.push(row);
257
+ }
258
+ else {
259
+ flush();
260
+ chunks.push(row.items.map(i => i.str).join(' '));
261
+ }
262
+ }
263
+ flush();
264
+ return chunks.join('\n').replace(/\n{3,}/g, '\n\n').trim();
265
+ }
266
+ // ─── Existing helpers ─────────────────────────────────────────────────────────
267
+ /**
268
+ * Converts extracted PDF content to clean Markdown via LLM.
269
+ *
270
+ * Two paths depending on the `pdfData` type:
271
+ *
272
+ * **`Page[]` (mupdf path)** — `MapLLM.reduce`, one page per chunk.
273
+ * Each page is processed by `mupdfPagePrompt` (heading normalisation, broken-cell
274
+ * fusion, repeated-header removal). No frontmatter is added here; the caller
275
+ * (`pdf2markdown`) prepends the single YAML block.
276
+ *
277
+ * NOTE: `finalReduce` is intentionally disabled — it is reserved for a future
278
+ * "N-page light summary" feature where a second LLM pass synthesises the whole
279
+ * document into a shorter version.
280
+ *
281
+ * A raw `string` (e.g. from `html2markdown`) is automatically wrapped into a
282
+ * single `Page` so both callers share the exact same code path.
283
+ *
284
+ * @param inputfile - Original file path (used for logging only).
285
+ * @param pdfData - Either a `Page[]` array (mupdf) or a raw string.
286
+ * @param links - External links appended as `## Liens` footer (string path).
287
+ * @param model - LLM model alias (default: `'MEDIUM-fast'`).
288
+ */
289
+ async function callLLMForParsingPDF(inputfile, pdfData, links = [], model = 'LOW-fast') {
290
+ //
291
+ // Normalise input: a raw string becomes a single-page array.
292
+ // Links (html2markdown path) are appended as a Liens footer so the
293
+ // MapLLM digest can embed them naturally in context.
294
+ const pages = Array.isArray(pdfData)
295
+ ? pdfData
296
+ : [{
297
+ pageNumber: 1,
298
+ text: links.length > 0
299
+ ? `${pdfData}\n\n## Liens\n\n${links.map(l => `- [${l.text}](${l.href})`).join('\n')}`
300
+ : pdfData,
301
+ tables: [],
302
+ images: [],
303
+ }];
304
+ //
305
+ // Auto model upgrade: count total `|` across all pages.
306
+ // A 4-col × 6-row table (4+1) × (6+2) = ≈ 40 `|` — threshold 40 catches any non-trivial table.
307
+ // Only upgrades from LOW-fast; explicit caller models are respected.
308
+ const PIPE_THRESHOLD = 40;
309
+ const totalPipes = pages.reduce((sum, p) => sum + (p.text.match(/\|/g) ?? []).length, 0);
310
+ const effectiveModel = totalPipes > PIPE_THRESHOLD ? 'HIGH-fast' : model;
311
+ if (totalPipes > PIPE_THRESHOLD) {
312
+ console.log(`pdf: ${totalPipes} pipes detected → upgrading model LOW-fast → HIGH-fast`);
126
313
  }
314
+ const pageLoader = {
315
+ loadNativeChunk: async (pos) => ({
316
+ content: pages[pos].text,
317
+ eof: pos + 1 >= pages.length,
318
+ position: pos + 1,
319
+ }),
320
+ };
321
+ //
322
+ // finalReduce: false — reserved for future "N-page light summary" feature
323
+ const mapper = new reducer_core_1.MapLLM(pageLoader, { finalReduce: false });
324
+ const result = await mapper.reduce((res, current) => {
325
+ const section = typeof current === 'string' ? current : JSON.stringify(current);
326
+ res.acc = res.acc ? `${res.acc}\n\n---\n\n${section}` : section;
327
+ return res;
328
+ }, {
329
+ acc: '',
330
+ config: {
331
+ digestPrompt: prompts_1.mupdfPagePrompt,
332
+ reducePrompt: '',
333
+ },
334
+ model: effectiveModel,
335
+ verbose: true,
336
+ });
337
+ const raw = typeof result.acc === 'string' ? result.acc : JSON.stringify(result.acc);
338
+ const clean = raw.replace(/<thinking>[\s\S]*?<\/thinking>/g, '').trim();
339
+ return { markdown: clean, cost: 0 };
127
340
  }
341
+ /**
342
+ * Extracts hyperlinks from a PDF file by converting it to HTML and parsing the links.
343
+ *
344
+ * @param {string} pdfPath - The file path to the PDF document to extract links from
345
+ * @param {string} output - The directory output where temporary files will be created
346
+ * @returns {Promise<Array<{text: string, href: string}>>} - A promise that resolves to an array of link objects
347
+ */
128
348
  function cleanHTML(html) {
129
349
  const dom = new jsdom_1.JSDOM(html);
130
- // Instancie Readability avec le document
131
350
  const reader = new readability_1.Readability(dom.window.document);
132
351
  const article = reader.parse();
133
352
  return article?.content || '';
134
353
  }
135
354
  /**
136
- * Parses an HTML file and converts it to markdown using GPT.
355
+ * Parses an HTML file and converts it to markdown using LLM.
137
356
  *
138
357
  * @param {string} output - The directory path where the output markdown file will be saved.
139
358
  * @param {string} file - The path to the HTML file to be parsed.
140
359
  * @param {string} service - The service name used as part of the output filename output.
141
360
  * @param {string} model - The model to use for parsing (default: "MEDIUM-fast")
142
- * @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the GPT API call.
361
+ * @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the API call.
143
362
  */
144
363
  async function html2markdown(output, file, service, model = "MEDIUM-fast") {
145
364
  const filename = (0, utils_1.toSlug)(path_1.default.basename(file, path_1.default.extname(file)));
@@ -150,89 +369,248 @@ async function html2markdown(output, file, service, model = "MEDIUM-fast") {
150
369
  fs_1.default.writeFileSync(path_1.default.join(output, `${outputfile + filename}.md`), markdown, { encoding: 'utf8', flag: 'w' });
151
370
  return { markdown, cost };
152
371
  }
372
+ // ─── PDF text extraction ──────────────────────────────────────────────────────
153
373
  /**
154
- * Parse un PDF en effectuant :
155
- * 1. Le nettoyage du PDF avec Ghostscript.
156
- * 2. Sa conversion en XML via pdftohtml.
157
- * 3. (Optionnellement) Le passage du contenu converti au modèle LLM pour analyser la structure.
158
- *
159
- * @param {string} outputDir - Dossier de sortie pour le fichier markdown.
160
- * @param {string} pdf - Chemin vers le fichier PDF à analyser.
161
- * @param {FrontMatter|null} matter - Métadonnées du document (title, service, author, role). Si null, utilise le nom du PDF pour le titre.
162
- * @param {string} model - Le modèle à utiliser (défaut: "MEDIUM-fast").
163
- * @returns {Promise<{markdown: string, cost: number, outputPath: string}>} - Le markdown structuré, le coût et le chemin du fichier de sortie.
374
+ * Extracts plain text from a PDF using the system `pdftotext` binary (poppler-utils).
375
+ *
376
+ * - Pages are delimited by form-feed (\f) characters in the binary's output.
377
+ * - Excessive blank lines are normalised (3+ 2).
378
+ * - Images are NOT extracted (always []).
379
+ *
380
+ * NOTE: Better alternative is `pdftotext_pdfjs` which uses Mozilla's PDF engine
381
+ * to extract text + images + links in a single Node.js-native pass, with better
382
+ * table reconstruction for complex layouts. See `pdftotext_pdfjs` for details.
383
+ *
384
+ * @param {string} pdfPath - Absolute path to the PDF file.
385
+ * @param {string} outputDir - Directory used for temporary files.
386
+ * @returns {Promise<Page[]>} One `Page` per PDF page, text-only.
164
387
  */
165
- async function pdf2markdown(outputDir, pdf, matter, model = "MEDIUM-fast") {
388
+ async function pdftotext_poppler(pdfPath, outputDir) {
389
+ const tempOut = path_1.default.join(outputDir, `${randomFile()}.txt`);
390
+ try {
391
+ //
392
+ // Omit -nopgbrk so pdftotext emits \f between pages
393
+ await execAsync(`pdftotext -nodiag "${pdfPath}" "${tempOut}"`);
394
+ const rawText = fs_1.default.readFileSync(tempOut, 'utf8');
395
+ //
396
+ // \f (form feed = \x0C) is the page delimiter; filter empty trailing entries
397
+ return rawText
398
+ .split('\f')
399
+ .filter(p => p.trim() !== '')
400
+ .map((raw, idx) => {
401
+ const text = raw.replace(/\n{3,}/g, '\n\n').trim();
402
+ return { pageNumber: idx + 1, text, tables: detectTableStats(text), images: [] };
403
+ });
404
+ }
405
+ finally {
406
+ if (fs_1.default.existsSync(tempOut))
407
+ fs_1.default.unlinkSync(tempOut);
408
+ }
409
+ }
410
+ /**
411
+ * Walks the mupdf `asJSON()` tree and returns all non-empty text lines as
412
+ * `RawTextItem[]` (same shape used by the pdfjs geometric reconstruction).
413
+ *
414
+ * Coordinate space: mupdf uses screen coords (Y from top, increases downward).
415
+ * `line.y` is the **baseline** y — the same semantic as pdfjs `transform[5]`.
416
+ * Callers must call `groupByY(items).reverse()` to get top-to-bottom order.
417
+ */
418
+ function flattenMupdfTextItems(node) {
419
+ const items = [];
420
+ function walk(n) {
421
+ if (!n || typeof n !== 'object')
422
+ return;
423
+ const obj = n;
424
+ if (obj.type === 'text' && Array.isArray(obj.lines)) {
425
+ for (const line of obj.lines) {
426
+ const text = line.text;
427
+ if (!text?.trim())
428
+ continue;
429
+ const bbox = line.bbox;
430
+ if (!bbox)
431
+ continue;
432
+ //
433
+ // PDFs with fonts missing ToUnicode CMaps produce U+FFFD replacement
434
+ // characters for every undecodable glyph. Drop lines where more than
435
+ // 40% of characters are replacement chars (decorative/unreadable text)
436
+ // and strip residual runs from otherwise readable lines.
437
+ const replacements = (text.match(/\uFFFD/g) ?? []).length;
438
+ if (replacements / text.length > 0.4)
439
+ continue;
440
+ const cleaned = text.replace(/\uFFFD+/g, '').trim();
441
+ if (!cleaned)
442
+ continue;
443
+ items.push({
444
+ str: cleaned,
445
+ //
446
+ // Use line.x/y (baseline) when available, fall back to bbox top-left
447
+ x: typeof line.x === 'number' ? line.x : bbox.x,
448
+ y: typeof line.y === 'number' ? line.y : bbox.y + bbox.h,
449
+ width: bbox.w,
450
+ height: bbox.h,
451
+ });
452
+ }
453
+ }
454
+ //
455
+ // Recurse: structure nodes expose `contents`, top-level page exposes `blocks`
456
+ if (Array.isArray(obj.contents))
457
+ for (const c of obj.contents)
458
+ walk(c);
459
+ if (Array.isArray(obj.blocks))
460
+ for (const b of obj.blocks)
461
+ walk(b);
462
+ }
463
+ walk(node);
464
+ return items;
465
+ }
466
+ /**
467
+ * Resolves the absolute path to the `mupdf-extract.mjs` ESM worker script.
468
+ *
469
+ * Works in both ts-jest context (`__dirname` = `src/`) and compiled context
470
+ * (`__dirname` = `dist/src/`) since both paths resolve to `<package>/`.
471
+ */
472
+ function resolveMupdfScript() {
473
+ //
474
+ // ts-jest: __dirname = …/agentic-api/src → 1 level up → package root
475
+ const fromSrc = path_1.default.resolve(__dirname, '..', 'mupdf-extract.mjs');
166
476
  //
167
- // Extract matter values with defaults
477
+ // post-build copy (cp mupdf-extract.mjs dist/):
478
+ // __dirname = …/agentic-api/dist/src → 1 level up → dist/
479
+ const fromDistFlat = path_1.default.resolve(__dirname, '..', 'mupdf-extract.mjs');
480
+ //
481
+ // fallback — repo root deployed without cp step:
482
+ // __dirname = …/agentic-api/dist/src → 2 levels up → package root
483
+ const fromDistRoot = path_1.default.resolve(__dirname, '..', '..', 'mupdf-extract.mjs');
484
+ for (const candidate of [fromSrc, fromDistFlat, fromDistRoot]) {
485
+ if (fs_1.default.existsSync(candidate))
486
+ return candidate;
487
+ }
488
+ throw new Error(`mupdf-extract.mjs not found. Searched:\n` +
489
+ ` ${fromSrc}\n ${fromDistFlat}\n ${fromDistRoot}`);
490
+ }
491
+ /**
492
+ * Extracts text, reconstructed tables, links, and optionally page-raster images
493
+ * from a PDF using the **mupdf** npm package (WASM build of the MuPDF C library).
494
+ *
495
+ * Key advantages over the poppler engine:
496
+ * - `table-hunt` detects tables geometrically even in **untagged** PDFs.
497
+ * - `segment` splits the page into logical reading-order blocks.
498
+ * - Significantly faster than pdfjs for large documents.
499
+ * - No shell binary dependency (pure WASM, runs anywhere Node.js does).
500
+ *
501
+ * Images (opt-in via `withImages: true`): each page is rasterised at 1.5× scale
502
+ * (≈ 113 DPI). The `imageFormat` option controls encoding:
503
+ *
504
+ * | format | size/page (base64) | notes |
505
+ * |-------------|-------------------|--------------------------------|
506
+ * | `'rgb'` | ≈ 4.4 MB | raw RGB, lossless, large |
507
+ * | `'gray'` | ≈ 1.5 MB | raw grayscale, 3× smaller |
508
+ * | `'jpeg'` | ≈ 100–200 KB | JPEG quality 75, 31× smaller |
509
+ *
510
+ * Disabled by default because image data quickly exhausts stdout buffers for
511
+ * large documents. Use `jpeg` for production with vision models.
512
+ *
513
+ * NOTE: `mupdf` is ESM-only. Extraction is delegated to a standalone
514
+ * `mupdf-extract.mjs` worker spawned via `execAsync`, which avoids any
515
+ * ESM/CJS interoperability issues in the main process and under ts-jest.
516
+ *
517
+ * @param {string} pdfPath - Absolute path to the PDF file.
518
+ * @param {object} [options]
519
+ * @param {boolean} [options.withImages=false] - Rasterise each page.
520
+ * @param {'rgb'|'gray'|'jpeg'} [options.imageFormat='rgb'] - Pixel encoding.
521
+ * @returns {Promise<Page[]>} One `Page` per PDF page with text, GFM tables, and optional images.
522
+ */
523
async function pdftotext_mupdf(pdfPath, options = {}) {
    //
    // Escape the characters that remain special inside POSIX double quotes
    // (`"`, `\`, `$`, backtick) so a pathological file path cannot break out
    // of the quoted argument — execAsync runs the command through a shell.
    const shellQuote = (s) => `"${String(s).replace(/([\\"$`])/g, '\\$1')}"`;
    const scriptPath = resolveMupdfScript();
    const fmt = options.imageFormat ?? 'rgb';
    const imageFlags = options.withImages ? ` --with-images --image-format=${fmt}` : '';
    //
    // maxBuffer scales with expected image size:
    //   jpeg ≈ 150KB/page, gray ≈ 1.5MB/page, rgb ≈ 4.4MB/page (base64).
    // 32 MB is plenty for text-only or jpeg; rgb on large docs needs more.
    const maxBuffer = options.withImages && fmt === 'rgb' ? 256 * 1024 * 1024 : 32 * 1024 * 1024;
    const { stdout } = await execAsync(`node ${shellQuote(scriptPath)} ${shellQuote(pdfPath)}${imageFlags}`, { maxBuffer });
    const result = JSON.parse(stdout);
    //
    // 1. Build raw pages — header/footer candidates kept separately
    const rawPages = result.pages.map(p => {
        let parsed;
        try {
            parsed = JSON.parse(p.json);
        }
        catch {
            // Malformed per-page JSON from the worker: degrade to an empty
            // page rather than aborting the whole document.
            parsed = {};
        }
        //
        // Block-by-block: each segment processed independently (multi-column safe).
        // pageHeight classifies top/bottom 12% as header/footer candidate zones.
        const { header: _rawHeader, body, footer: _rawFooter } = reconstructFromBlocks(parsed, p.pageHeight);
        let text = body;
        //
        // Guard with ?. — NOTE(review): the worker is presumed to always emit
        // a `links` array, but a missing field must not crash extraction.
        if ((p.links?.length ?? 0) > 0) {
            text += '\n\n## Liens\n\n' + p.links.map(u => `- ${u}`).join('\n');
        }
        // Optional page raster (only present when --with-images was passed);
        // base64 payload is decoded to a Buffer for downstream consumers.
        const images = p.image
            ? [{
                    type: p.image.type,
                    width: p.image.width,
                    height: p.image.height,
                    data: Buffer.from(p.image.data, 'base64'),
                }]
            : [];
        return { pageNumber: p.pageNumber, text, _rawHeader, _rawFooter, tables: detectTableStats(text), images };
    });
    //
    // 2. Promote header/footer candidates that repeat on ≥ 3 pages.
    //    Unique occurrences (e.g. first-page title) are re-injected into body.
    return detectRunningHeaders(rawPages);
}
567
+ // ─── PDF → Markdown ───────────────────────────────────────────────────────────
568
/**
 * Converts a PDF to a structured Markdown file.
 *
 * Pipeline:
 * 1. `pdftotext_mupdf` (or poppler) → `Page[]`
 * 2. `callLLMForParsingPDF` — MapLLM.reduce, one page per chunk
 * 3. Prepend a **single** YAML frontmatter block and write to `outputDir`.
 *
 * Model choice: `LOW-fast` is sufficient — mupdf output is already clean GFM;
 * the LLM only normalises headings and removes repeated headers/footers.
 * Use `MEDIUM-fast` for complex layouts that need heavier restructuring.
 *
 * @param outputDir - Directory for the output `.md` file.
 * @param pdf - Absolute path to the PDF file.
 * @param matter - Document metadata; defaults derived from filename.
 * @param model - LLM model alias (default: `'LOW-fast'`).
 * @param engine - Extraction backend (default: `'mupdf'`).
 * @returns `{ markdown, cost, outputPath }` — frontmatter-prefixed markdown,
 *          LLM spend reported by `callLLMForParsingPDF`, and the output path.
 */
async function pdf2markdown(outputDir, pdf, matter, model = 'LOW-fast', engine = 'mupdf') {
    const service = matter?.service || 'unknown';
    const title = matter?.title || path_1.default.basename(pdf, path_1.default.extname(pdf));
    //
    // Build complete FrontMatter with defaults
    const frontMatter = {
        title,
        service,
        author: matter?.author || '',
        role: matter?.role || 'rule',
    };
    const outputPath = path_1.default.join(outputDir, `${(0, utils_1.toSlug)(service.toLowerCase())}-${(0, utils_1.toSlug)(title)}.md`);
    try {
        //
        // 1. Extract pages (GFM text, tables, per-page link footer)
        const pages = engine === 'mupdf'
            ? await pdftotext_mupdf(pdf)
            : await pdftotext_poppler(pdf, outputDir);
        //
        // 2. LLM: format each page as clean Markdown (no frontmatter inside).
        //    `cost` is kept in the result: the previous public API returned it
        //    and callers reading `.cost` must keep working.
        const { markdown: body, cost } = await callLLMForParsingPDF(pdf, pages, [], model);
        //
        // 3. Prepend single YAML frontmatter and write
        const markdown = (0, utils_matter_1.matterSerialize)(body, frontMatter);
        fs_1.default.writeFileSync(outputPath, markdown);
        return { markdown, cost, outputPath };
    }
    catch (error) {
        console.error('Error during PDF parsing:', error);
        throw error;
    }
}