agentic-api 2.0.646 → 2.0.885

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/dist/src/agents/prompts.d.ts +2 -3
  2. package/dist/src/agents/prompts.js +21 -118
  3. package/dist/src/agents/reducer.loaders.d.ts +103 -1
  4. package/dist/src/agents/reducer.loaders.js +164 -2
  5. package/dist/src/agents/reducer.types.d.ts +34 -3
  6. package/dist/src/agents/simulator.d.ts +32 -2
  7. package/dist/src/agents/simulator.executor.d.ts +15 -5
  8. package/dist/src/agents/simulator.executor.js +134 -67
  9. package/dist/src/agents/simulator.js +251 -8
  10. package/dist/src/agents/simulator.prompts.d.ts +55 -10
  11. package/dist/src/agents/simulator.prompts.js +305 -61
  12. package/dist/src/agents/simulator.types.d.ts +62 -1
  13. package/dist/src/agents/simulator.types.js +5 -0
  14. package/dist/src/agents/subagent.d.ts +128 -0
  15. package/dist/src/agents/subagent.js +231 -0
  16. package/dist/src/agents/worker.executor.d.ts +48 -0
  17. package/dist/src/agents/worker.executor.js +152 -0
  18. package/dist/src/execute/helpers.d.ts +3 -0
  19. package/dist/src/execute/helpers.js +222 -16
  20. package/dist/src/execute/responses.js +81 -55
  21. package/dist/src/execute/shared.d.ts +5 -0
  22. package/dist/src/execute/shared.js +27 -0
  23. package/dist/src/index.d.ts +2 -1
  24. package/dist/src/index.js +3 -1
  25. package/dist/src/llm/openai.js +8 -1
  26. package/dist/src/llm/pricing.js +2 -0
  27. package/dist/src/llm/xai.js +11 -6
  28. package/dist/src/prompts.d.ts +14 -0
  29. package/dist/src/prompts.js +41 -1
  30. package/dist/src/rag/rag.manager.d.ts +18 -3
  31. package/dist/src/rag/rag.manager.js +114 -12
  32. package/dist/src/rag/types.d.ts +3 -1
  33. package/dist/src/rules/git/git.e2e.helper.js +51 -4
  34. package/dist/src/rules/git/git.health.js +89 -56
  35. package/dist/src/rules/git/index.d.ts +2 -2
  36. package/dist/src/rules/git/index.js +22 -5
  37. package/dist/src/rules/git/repo.d.ts +64 -6
  38. package/dist/src/rules/git/repo.js +572 -141
  39. package/dist/src/rules/git/repo.pr.d.ts +11 -18
  40. package/dist/src/rules/git/repo.pr.js +82 -94
  41. package/dist/src/rules/git/repo.tools.d.ts +5 -0
  42. package/dist/src/rules/git/repo.tools.js +6 -1
  43. package/dist/src/rules/types.d.ts +0 -2
  44. package/dist/src/rules/utils.matter.js +1 -5
  45. package/dist/src/scrapper.d.ts +138 -25
  46. package/dist/src/scrapper.js +538 -160
  47. package/dist/src/stategraph/stategraph.d.ts +6 -2
  48. package/dist/src/stategraph/stategraph.js +21 -6
  49. package/dist/src/stategraph/types.d.ts +14 -6
  50. package/dist/src/types.d.ts +22 -0
  51. package/dist/src/utils.d.ts +24 -0
  52. package/dist/src/utils.js +84 -86
  53. package/package.json +3 -2
  54. package/dist/src/agents/semantic.d.ts +0 -4
  55. package/dist/src/agents/semantic.js +0 -19
  56. package/dist/src/execute/legacy.d.ts +0 -46
  57. package/dist/src/execute/legacy.js +0 -460
  58. package/dist/src/pricing.llm.d.ts +0 -5
  59. package/dist/src/pricing.llm.js +0 -14
@@ -3,9 +3,10 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
3
3
  return (mod && mod.__esModule) ? mod : { "default": mod };
4
4
  };
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
- exports.extractCaptcha = extractCaptcha;
7
6
  exports.callLLMForParsingPDF = callLLMForParsingPDF;
8
7
  exports.html2markdown = html2markdown;
8
+ exports.pdftotext_poppler = pdftotext_poppler;
9
+ exports.pdftotext_mupdf = pdftotext_mupdf;
9
10
  exports.pdf2markdown = pdf2markdown;
10
11
  const child_process_1 = require("child_process");
11
12
  const util_1 = require("util");
@@ -13,133 +14,351 @@ const path_1 = __importDefault(require("path"));
13
14
  const fs_1 = __importDefault(require("fs"));
14
15
  const jsdom_1 = require("jsdom");
15
16
  const readability_1 = require("@mozilla/readability");
16
- const pricing_1 = require("./llm/pricing");
17
17
  const prompts_1 = require("./prompts");
18
18
  const utils_1 = require("./utils");
19
- const execute_1 = require("./execute");
20
19
  const utils_matter_1 = require("./rules/utils.matter");
21
- // Promisify exec for easier async/await usage
20
+ const reducer_core_1 = require("./agents/reducer.core");
22
21
  const execAsync = (0, util_1.promisify)(child_process_1.exec);
23
- const execFileAsync = (0, util_1.promisify)(child_process_1.execFile);
24
22
  const randomFile = (ext = '') => {
25
23
  const random = () => Math.random() * 1000 | 0;
26
24
  return `temp-${random()}-${random()}${ext}`;
27
25
  };
28
- async function extractCaptcha(base64Image, openai) {
29
- const content = [
30
- { type: 'text', text: "Extrais uniquement le nombre" },
31
- { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${base64Image}` } },
32
- ];
33
- // Cost per captcha $0.0000696
34
- const model = "gpt-4.1";
35
- const response = await openai.chat.completions.create({
36
- model,
37
- messages: [{ role: "user", content }],
38
- max_completion_tokens: 50,
39
- });
40
- const cost = (0, pricing_1.calculateCost)(model, response.usage);
41
- // Récupérer la réponse markdown
42
- const number = response.choices[0].message.content;
43
- return { number, cost };
26
+ /** Formats rows of cell strings as a GFM markdown table. */
27
+ function gfmTable(rows) {
28
+ if (rows.length === 0)
29
+ return '';
30
+ const maxCols = Math.max(...rows.map(r => r.length));
31
+ const pad = (r) => [...r, ...new Array(maxCols - r.length).fill('')];
32
+ const fmt = (cells) => `| ${cells.join(' | ')} |`;
33
+ const [header, ...body] = rows.map(pad);
34
+ return [fmt(header), fmt(header.map(() => '---')), ...body.map(fmt)].join('\n');
35
+ }
36
+ // ─── Geometric text reconstruction ────────────────────────────────────────────
37
+ /** Groups text items into rows by proximity on the Y axis (top-to-bottom). */
38
+ function groupByY(items, tolerance = 3) {
39
+ const buckets = new Map();
40
+ for (const item of items) {
41
+ const key = [...buckets.keys()].find(k => Math.abs(k - item.y) <= tolerance);
42
+ if (key !== undefined) {
43
+ buckets.get(key).push(item);
44
+ }
45
+ else {
46
+ buckets.set(item.y, [item]);
47
+ }
48
+ }
49
+ return [...buckets.entries()]
50
+ .sort(([ya], [yb]) => yb - ya)
51
+ .map(([y, rowItems]) => ({ y, items: rowItems.sort((a, b) => a.x - b.x) }));
52
+ }
53
+ function detectColumnBoundaries(rows, gap = 15) {
54
+ const allX = rows.flatMap(r => r.items.map(i => i.x)).sort((a, b) => a - b);
55
+ if (allX.length === 0)
56
+ return [];
57
+ const cols = [allX[0]];
58
+ for (let i = 1; i < allX.length; i++) {
59
+ if (allX[i] - allX[i - 1] > gap)
60
+ cols.push(allX[i]);
61
+ }
62
+ return cols;
63
+ }
64
+ function assignToColumns(row, cols) {
65
+ const cells = new Array(cols.length).fill('');
66
+ for (const item of row.items) {
67
+ let colIdx = 0, minDist = Math.abs(item.x - cols[0]);
68
+ for (let i = 1; i < cols.length; i++) {
69
+ const dist = Math.abs(item.x - cols[i]);
70
+ if (dist < minDist) {
71
+ minDist = dist;
72
+ colIdx = i;
73
+ }
74
+ }
75
+ cells[colIdx] = cells[colIdx] ? `${cells[colIdx]} ${item.str}` : item.str;
76
+ }
77
+ return cells;
44
78
  }
45
79
  /**
46
- * Calls GPT to parse a PDF file and convert it to markdown format.
80
+ * Reconstructs page text **block by block** and splits it into three zones.
47
81
  *
48
- * @param {string} inputfile - The name of the PDF file being processed
49
- * @param {any} pdfData - The extracted content from the PDF file
50
- * @param {any[]} links - Optional array of links extracted from the PDF to be integrated into the markdown
51
- * @param {string} model - The model to use for parsing (default: "MEDIUM-fast")
52
- * @returns {Promise<{markdown: string, cost: number}>} - The parsed markdown content and the cost of the API call
82
+ * mupdf's `segment` option places each column / region in its own top-level
83
+ * block. Processing blocks independently prevents multi-column layouts from
84
+ * being interleaved into false GFM tables.
85
+ *
86
+ * Zone classification by median Y of the block's items vs `pageHeight`:
87
+ * - **header zone** : medianY / pageHeight < 0.12
88
+ * - **footer zone** : medianY / pageHeight > 0.88
89
+ * - **body** : everything else
90
+ *
91
+ * When `pageHeight` is 0 (unknown) every block is treated as body.
53
92
  */
54
- async function callLLMForParsingPDF(inputfile, pdfData, links = [], model = "MEDIUM-fast") {
55
- // Convertir le contenu en chaîne de caractères (attention à la taille potentielle !)
56
- const pdfDataAsString = JSON.stringify(pdfData, null, 2);
57
- // Format: YYYY-MM-DD
58
- const today = new Date().toISOString().substring(0, 10);
59
- const linkLabel = 'Voici une liste de liens que tu dois intègrer directement dans le texte si, et seulement si, celui-ci correspond précisément à un texte existant dans le document.';
60
- const linkPrefix = linkLabel + links.reduce((acc, link) => {
61
- return acc + `- [${link.text}](${link.href})\n`;
62
- }, '');
63
- // Créer le prompt pour décrire la tâche au LLM
64
- const messages = [
65
- { role: "system",
66
- content: prompts_1.htmlToMarkdownPrompt }
67
- ];
68
- // console.log('🌶️ DEBUG: callLLMForParsingPDF -- SYSTEM:', messages[0].content);
69
- // console.log('🌶️ DEBUG: callLLMForParsingPDF -- USER:', messages[1].content);
70
- // WARNING: o3-mini is buggy with "Marche à suivre nouveau bail.pdf"
71
- const response = await (0, execute_1.executeQuery)({
72
- query: `Structure le contenu exhaustif en Markdown sans rien inventer, et avec les liens intégrés correctement.\n Nous sommes le ${today}.\n${linkPrefix}\nLe contenu du document est:\n${pdfDataAsString}`,
73
- model,
74
- messages,
75
- stdout: execute_1.DummyWritable,
76
- verbose: false
77
- });
78
- // Récupérer la réponse markdown
79
- const markdown = response.content;
80
- console.log(`Markdown 💰 cost: ${response.usage.cost}`);
93
+ function reconstructFromBlocks(parsed, pageHeight = 0) {
94
+ const empty = { header: '', body: '', footer: '' };
95
+ const root = parsed;
96
+ if (!root || !Array.isArray(root.blocks) || root.blocks.length === 0) {
97
+ const items = flattenMupdfTextItems(parsed);
98
+ return { ...empty, body: reconstructPageText(groupByY(items).reverse()) };
99
+ }
100
+ const headerParts = [];
101
+ const bodyParts = [];
102
+ const footerParts = [];
81
103
  //
82
- // add a regex to extract the markdown content between <thinking></thinking> tags
83
- const markdownWithoutThinking = markdown.replace(/<thinking>[\s\S]*?<\/thinking>/g, '');
84
- return { markdown: markdownWithoutThinking, cost: response.usage.cost };
104
+ // Classify at item level (not block level) so that a single large block
105
+ // spanning the full page (common in magazine layouts) is still split
106
+ // into its header/body/footer zones correctly.
107
+ // mupdf Y-from-top: small Y = top of page.
108
+ // header zone : Y / pageHeight < 0.12
109
+ // footer zone : Y / pageHeight > 0.88
110
+ // body : everything else
111
+ const headerItems = [];
112
+ const footerItems = [];
113
+ for (const block of root.blocks) {
114
+ const items = flattenMupdfTextItems(block);
115
+ if (items.length === 0)
116
+ continue;
117
+ if (pageHeight > 0) {
118
+ const hItems = items.filter(i => i.y / pageHeight < 0.12);
119
+ const fItems = items.filter(i => i.y / pageHeight > 0.88);
120
+ const bItems = items.filter(i => { const r = i.y / pageHeight; return r >= 0.12 && r <= 0.88; });
121
+ headerItems.push(...hItems);
122
+ footerItems.push(...fItems);
123
+ if (bItems.length > 0) {
124
+ const text = reconstructPageText(groupByY(bItems).reverse());
125
+ if (text)
126
+ bodyParts.push(text);
127
+ }
128
+ }
129
+ else {
130
+ const text = reconstructPageText(groupByY(items).reverse());
131
+ if (text)
132
+ bodyParts.push(text);
133
+ }
134
+ }
135
+ if (headerItems.length > 0) {
136
+ const text = reconstructPageText(groupByY(headerItems).reverse());
137
+ if (text)
138
+ headerParts.push(text);
139
+ }
140
+ if (footerItems.length > 0) {
141
+ const text = reconstructPageText(groupByY(footerItems).reverse());
142
+ if (text)
143
+ footerParts.push(text);
144
+ }
145
+ return {
146
+ header: headerParts.join('\n').trim(),
147
+ body: bodyParts.join('\n\n').trim(),
148
+ footer: footerParts.join('\n').trim(),
149
+ };
85
150
  }
86
151
  /**
87
- * Extracts hyperlinks from a PDF file by converting it to HTML and parsing the links.
152
+ * Promotes positional header/footer candidates to `page.header` / `page.footer`
153
+ * only when the same normalised pattern appears on **≥ 3 pages**.
88
154
  *
89
- * @param {string} pdfPath - The file path to the PDF document to extract links from
90
- * @param {string} output - The directory output where temporary files will be created
91
- * @returns {Promise<Array<{text: string, href: string}>>} - A promise that resolves to an array of link objects,
92
- * each containing the link text and href attributes
155
+ * Numbers are normalised (`\d{2,4}` `{N}`) so that incrementing folios
156
+ * (`057`, `058`, `059`) map to the same pattern.
157
+ *
158
+ * False positives (unique first-page titles that happen to sit in the top zone)
159
+ * are re-injected into `page.text` so nothing is silently lost.
93
160
  */
94
- async function extractLinksFromPDF(pdfPath, output) {
95
- const tempOut = path_1.default.join(output, `${randomFile()}`);
96
- try {
97
- // STEP 2: Convert the cleaned PDF to XML using pdftohtml.
98
- // La commande génère un fichier XML à partir du PDF nettoyé.
99
- const pdftohtmlCommand = `pdftohtml -s -nodrm -c "${pdfPath}" "${tempOut}"`;
100
- await execAsync(pdftohtmlCommand);
101
- const html = fs_1.default.readFileSync(tempOut + '-html.html', "utf8");
102
- const dom = new jsdom_1.JSDOM(html);
103
- const links = Array.from(dom.window.document.querySelectorAll('a')).map(link => ({
104
- text: link.textContent?.trim() || link.href,
105
- href: link.href
106
- }));
107
- process.stdout.write("Extracting links: " + links.length + " ");
108
- return links;
161
+ /**
162
+ * Parses GFM tables in a block of text and returns their dimensions.
163
+ *
164
+ * Used to populate `Page.tables` so that `callLLMForParsingPDF` can
165
+ * automatically select a stronger model for pages with complex tables
166
+ * (many columns or many rows).
167
+ */
168
+ function detectTableStats(text) {
169
+ const tables = [];
170
+ const lines = text.split('\n');
171
+ let tableLines = [];
172
+ const flush = () => {
173
+ if (tableLines.length < 2) {
174
+ tableLines = [];
175
+ return;
176
+ }
177
+ //
178
+ // Separator lines (e.g. `| --- | --- |`) are structural — exclude from row count
179
+ const SEPARATOR_RE = /^\|\s*[-:]+[\s|:-]*\|$/;
180
+ const dataLines = tableLines.filter(l => !SEPARATOR_RE.test(l.trim()));
181
+ const cols = (tableLines[0].match(/\|/g) ?? []).length - 1;
182
+ if (cols >= 2 && dataLines.length >= 1) {
183
+ tables.push({ rows: dataLines.length, cols });
184
+ }
185
+ tableLines = [];
186
+ };
187
+ for (const line of lines) {
188
+ if (/^\|/.test(line) && line.trim().endsWith('|')) {
189
+ tableLines.push(line);
190
+ }
191
+ else {
192
+ flush();
193
+ }
109
194
  }
110
- catch (error) {
111
- console.error('❌ Error extracting links from PDF:', error);
112
- return [];
195
+ flush();
196
+ return tables;
197
+ }
198
+ function detectRunningHeaders(pages) {
199
+ const normalize = (s) => s.replace(/\d{2,4}/g, '{N}').trim();
200
+ //
201
+ // Count how many pages share each normalised pattern
202
+ const headerCount = new Map();
203
+ const footerCount = new Map();
204
+ for (const p of pages) {
205
+ if (p._rawHeader)
206
+ headerCount.set(normalize(p._rawHeader), (headerCount.get(normalize(p._rawHeader)) ?? 0) + 1);
207
+ if (p._rawFooter)
208
+ footerCount.set(normalize(p._rawFooter), (footerCount.get(normalize(p._rawFooter)) ?? 0) + 1);
113
209
  }
114
- finally {
115
- if (fs_1.default.existsSync(tempOut + '-html.html')) {
116
- fs_1.default.unlinkSync(tempOut + '-html.html');
117
- // Clean up any PNG files that might have been generated
118
- const pngFiles = fs_1.default.readdirSync(path_1.default.dirname(tempOut))
119
- .filter(file => file.startsWith(path_1.default.basename(tempOut)) && file.endsWith('.png'));
120
- for (const pngFile of pngFiles) {
121
- const pngPath = path_1.default.join(path_1.default.dirname(tempOut), pngFile);
122
- if (fs_1.default.existsSync(pngPath))
123
- fs_1.default.unlinkSync(pngPath);
210
+ return pages.map(({ _rawHeader, _rawFooter, ...page }) => {
211
+ let text = page.text;
212
+ let header;
213
+ let footer;
214
+ if (_rawHeader) {
215
+ if ((headerCount.get(normalize(_rawHeader)) ?? 0) >= 3) {
216
+ header = _rawHeader;
217
+ }
218
+ else {
219
+ //
220
+ // Not a running header — keep in body text
221
+ text = `${_rawHeader}\n\n${text}`;
222
+ }
223
+ }
224
+ if (_rawFooter) {
225
+ if ((footerCount.get(normalize(_rawFooter)) ?? 0) >= 3) {
226
+ footer = _rawFooter;
227
+ }
228
+ else {
229
+ text = `${text}\n\n${_rawFooter}`;
124
230
  }
125
231
  }
232
+ return { ...page, text: text.trim(), header, footer };
233
+ });
234
+ }
235
+ /** Reconstructs plain text with heuristic table detection for untagged PDFs. */
236
+ function reconstructPageText(rows) {
237
+ const LIST_MARKER_RE = /^([●•◦▪▸✅✓✗►]|\d{1,3}\.?|[a-zA-Z]\.)$/;
238
+ const isListItem = (r) => r.items.length === 2 && (r.items[0].str.trim().length <= 3 || LIST_MARKER_RE.test(r.items[0].str.trim()));
239
+ const fmtTableRows = (rows2) => {
240
+ const cols = detectColumnBoundaries(rows2);
241
+ if (cols.length < 2)
242
+ return rows2.map(r => r.items.map(i => i.str).join(' ')).join('\n');
243
+ return gfmTable(rows2.map(r => assignToColumns(r, cols)));
244
+ };
245
+ const chunks = [];
246
+ let tableCandidate = [];
247
+ const flush = () => {
248
+ if (tableCandidate.length === 0)
249
+ return;
250
+ const realTable = tableCandidate.filter(r => !isListItem(r) && r.items.length >= 2);
251
+ chunks.push(realTable.length >= 2 ? fmtTableRows(tableCandidate) : tableCandidate.map(r => r.items.map(i => i.str).join(' ')).join('\n'));
252
+ tableCandidate = [];
253
+ };
254
+ for (const row of rows) {
255
+ if (row.items.length >= 2 && !isListItem(row)) {
256
+ tableCandidate.push(row);
257
+ }
258
+ else {
259
+ flush();
260
+ chunks.push(row.items.map(i => i.str).join(' '));
261
+ }
262
+ }
263
+ flush();
264
+ return chunks.join('\n').replace(/\n{3,}/g, '\n\n').trim();
265
+ }
266
+ // ─── Existing helpers ─────────────────────────────────────────────────────────
267
+ /**
268
+ * Converts extracted PDF content to clean Markdown via LLM.
269
+ *
270
+ * Two paths depending on the `pdfData` type:
271
+ *
272
+ * **`Page[]` (mupdf path)** — `MapLLM.reduce`, one page per chunk.
273
+ * Each page is processed by `mupdfPagePrompt` (heading normalisation, broken-cell
274
+ * fusion, repeated-header removal). No frontmatter is added here; the caller
275
+ * (`pdf2markdown`) prepends the single YAML block.
276
+ *
277
+ * NOTE: `finalReduce` is intentionally disabled — it is reserved for a future
278
+ * "N-page light summary" feature where a second LLM pass synthesises the whole
279
+ * document into a shorter version.
280
+ *
281
+ * A raw `string` (e.g. from `html2markdown`) is automatically wrapped into a
282
+ * single `Page` so both callers share the exact same code path.
283
+ *
284
+ * @param inputfile - Original file path (used for logging only).
285
+ * @param pdfData - Either a `Page[]` array (mupdf) or a raw string.
286
+ * @param links - External links appended as `## Liens` footer (string path).
287
+ * @param model - LLM model alias (default: `'MEDIUM-fast'`).
288
+ */
289
+ async function callLLMForParsingPDF(inputfile, pdfData, links = [], model = 'LOW-fast') {
290
+ //
291
+ // Normalise input: a raw string becomes a single-page array.
292
+ // Links (html2markdown path) are appended as a Liens footer so the
293
+ // MapLLM digest can embed them naturally in context.
294
+ const pages = Array.isArray(pdfData)
295
+ ? pdfData
296
+ : [{
297
+ pageNumber: 1,
298
+ text: links.length > 0
299
+ ? `${pdfData}\n\n## Liens\n\n${links.map(l => `- [${l.text}](${l.href})`).join('\n')}`
300
+ : pdfData,
301
+ tables: [],
302
+ images: [],
303
+ }];
304
+ //
305
+ // Auto model upgrade: count total `|` across all pages.
306
+ // A 4-col × 6-row table (4+1) × (6+2) = ≈ 40 `|` — threshold 40 catches any non-trivial table.
307
+ // Only upgrades from LOW-fast; explicit caller models are respected.
308
+ const PIPE_THRESHOLD = 40;
309
+ const totalPipes = pages.reduce((sum, p) => sum + (p.text.match(/\|/g) ?? []).length, 0);
310
+ const effectiveModel = totalPipes > PIPE_THRESHOLD ? 'HIGH-fast' : model;
311
+ if (totalPipes > PIPE_THRESHOLD) {
312
+ console.log(`pdf: ${totalPipes} pipes detected → upgrading model LOW-fast → HIGH-fast`);
126
313
  }
314
+ const pageLoader = {
315
+ loadNativeChunk: async (pos) => ({
316
+ content: pages[pos].text,
317
+ eof: pos + 1 >= pages.length,
318
+ position: pos + 1,
319
+ }),
320
+ };
321
+ //
322
+ // finalReduce: false — reserved for future "N-page light summary" feature
323
+ const mapper = new reducer_core_1.MapLLM(pageLoader, { finalReduce: false });
324
+ const result = await mapper.reduce((res, current) => {
325
+ const section = typeof current === 'string' ? current : JSON.stringify(current);
326
+ res.acc = res.acc ? `${res.acc}\n\n---\n\n${section}` : section;
327
+ return res;
328
+ }, {
329
+ acc: '',
330
+ config: {
331
+ digestPrompt: prompts_1.mupdfPagePrompt,
332
+ reducePrompt: '',
333
+ },
334
+ model: effectiveModel,
335
+ verbose: true,
336
+ });
337
+ const raw = typeof result.acc === 'string' ? result.acc : JSON.stringify(result.acc);
338
+ const clean = raw.replace(/<thinking>[\s\S]*?<\/thinking>/g, '').trim();
339
+ return { markdown: clean, cost: 0 };
127
340
  }
341
+ /**
342
+ * Extracts hyperlinks from a PDF file by converting it to HTML and parsing the links.
343
+ *
344
+ * @param {string} pdfPath - The file path to the PDF document to extract links from
345
+ * @param {string} output - The directory output where temporary files will be created
346
+ * @returns {Promise<Array<{text: string, href: string}>>} - A promise that resolves to an array of link objects
347
+ */
128
348
  function cleanHTML(html) {
129
349
  const dom = new jsdom_1.JSDOM(html);
130
- // Instancie Readability avec le document
131
350
  const reader = new readability_1.Readability(dom.window.document);
132
351
  const article = reader.parse();
133
352
  return article?.content || '';
134
353
  }
135
354
  /**
136
- * Parses an HTML file and converts it to markdown using GPT.
355
+ * Parses an HTML file and converts it to markdown using LLM.
137
356
  *
138
357
  * @param {string} output - The directory path where the output markdown file will be saved.
139
358
  * @param {string} file - The path to the HTML file to be parsed.
140
359
  * @param {string} service - The service name used as part of the output filename output.
141
360
  * @param {string} model - The model to use for parsing (default: "MEDIUM-fast")
142
- * @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the GPT API call.
361
+ * @returns {Promise<{markdown: string, cost: number}>} - The generated markdown content and the cost of the API call.
143
362
  */
144
363
  async function html2markdown(output, file, service, model = "MEDIUM-fast") {
145
364
  const filename = (0, utils_1.toSlug)(path_1.default.basename(file, path_1.default.extname(file)));
@@ -150,89 +369,248 @@ async function html2markdown(output, file, service, model = "MEDIUM-fast") {
150
369
  fs_1.default.writeFileSync(path_1.default.join(output, `${outputfile + filename}.md`), markdown, { encoding: 'utf8', flag: 'w' });
151
370
  return { markdown, cost };
152
371
  }
372
+ // ─── PDF text extraction ──────────────────────────────────────────────────────
153
373
  /**
154
- * Parse un PDF en effectuant :
155
- * 1. Le nettoyage du PDF avec Ghostscript.
156
- * 2. Sa conversion en XML via pdftohtml.
157
- * 3. (Optionnellement) Le passage du contenu converti au modèle LLM pour analyser la structure.
158
- *
159
- * @param {string} outputDir - Dossier de sortie pour le fichier markdown.
160
- * @param {string} pdf - Chemin vers le fichier PDF à analyser.
161
- * @param {FrontMatter|null} matter - Métadonnées du document (title, service, author, role). Si null, utilise le nom du PDF pour le titre.
162
- * @param {string} model - Le modèle à utiliser (défaut: "MEDIUM-fast").
163
- * @returns {Promise<{markdown: string, cost: number, outputPath: string}>} - Le markdown structuré, le coût et le chemin du fichier de sortie.
374
+ * Extracts plain text from a PDF using the system `pdftotext` binary (poppler-utils).
375
+ *
376
+ * - Pages are delimited by form-feed (\f) characters in the binary's output.
377
+ * - Excessive blank lines are normalised (3+ 2).
378
+ * - Images are NOT extracted (always []).
379
+ *
380
+ * NOTE: Better alternative is `pdftotext_pdfjs` which uses Mozilla's PDF engine
381
+ * to extract text + images + links in a single Node.js-native pass, with better
382
+ * table reconstruction for complex layouts. See `pdftotext_pdfjs` for details.
383
+ *
384
+ * @param {string} pdfPath - Absolute path to the PDF file.
385
+ * @param {string} outputDir - Directory used for temporary files.
386
+ * @returns {Promise<Page[]>} One `Page` per PDF page, text-only.
164
387
  */
165
- async function pdf2markdown(outputDir, pdf, matter, model = "MEDIUM-fast") {
388
+ async function pdftotext_poppler(pdfPath, outputDir) {
389
+ const tempOut = path_1.default.join(outputDir, `${randomFile()}.txt`);
390
+ try {
391
+ //
392
+ // Omit -nopgbrk so pdftotext emits \f between pages
393
+ await execAsync(`pdftotext -nodiag "${pdfPath}" "${tempOut}"`);
394
+ const rawText = fs_1.default.readFileSync(tempOut, 'utf8');
395
+ //
396
+ // \f (form feed = \x0C) is the page delimiter; filter empty trailing entries
397
+ return rawText
398
+ .split('\f')
399
+ .filter(p => p.trim() !== '')
400
+ .map((raw, idx) => {
401
+ const text = raw.replace(/\n{3,}/g, '\n\n').trim();
402
+ return { pageNumber: idx + 1, text, tables: detectTableStats(text), images: [] };
403
+ });
404
+ }
405
+ finally {
406
+ if (fs_1.default.existsSync(tempOut))
407
+ fs_1.default.unlinkSync(tempOut);
408
+ }
409
+ }
410
+ /**
411
+ * Walks the mupdf `asJSON()` tree and returns all non-empty text lines as
412
+ * `RawTextItem[]` (same shape used by the pdfjs geometric reconstruction).
413
+ *
414
+ * Coordinate space: mupdf uses screen coords (Y from top, increases downward).
415
+ * `line.y` is the **baseline** y — the same semantic as pdfjs `transform[5]`.
416
+ * Callers must call `groupByY(items).reverse()` to get top-to-bottom order.
417
+ */
418
+ function flattenMupdfTextItems(node) {
419
+ const items = [];
420
+ function walk(n) {
421
+ if (!n || typeof n !== 'object')
422
+ return;
423
+ const obj = n;
424
+ if (obj.type === 'text' && Array.isArray(obj.lines)) {
425
+ for (const line of obj.lines) {
426
+ const text = line.text;
427
+ if (!text?.trim())
428
+ continue;
429
+ const bbox = line.bbox;
430
+ if (!bbox)
431
+ continue;
432
+ //
433
+ // PDFs with fonts missing ToUnicode CMaps produce U+FFFD replacement
434
+ // characters for every undecodable glyph. Drop lines where more than
435
+ // 40% of characters are replacement chars (decorative/unreadable text)
436
+ // and strip residual runs from otherwise readable lines.
437
+ const replacements = (text.match(/\uFFFD/g) ?? []).length;
438
+ if (replacements / text.length > 0.4)
439
+ continue;
440
+ const cleaned = text.replace(/\uFFFD+/g, '').trim();
441
+ if (!cleaned)
442
+ continue;
443
+ items.push({
444
+ str: cleaned,
445
+ //
446
+ // Use line.x/y (baseline) when available, fall back to bbox top-left
447
+ x: typeof line.x === 'number' ? line.x : bbox.x,
448
+ y: typeof line.y === 'number' ? line.y : bbox.y + bbox.h,
449
+ width: bbox.w,
450
+ height: bbox.h,
451
+ });
452
+ }
453
+ }
454
+ //
455
+ // Recurse: structure nodes expose `contents`, top-level page exposes `blocks`
456
+ if (Array.isArray(obj.contents))
457
+ for (const c of obj.contents)
458
+ walk(c);
459
+ if (Array.isArray(obj.blocks))
460
+ for (const b of obj.blocks)
461
+ walk(b);
462
+ }
463
+ walk(node);
464
+ return items;
465
+ }
466
+ /**
467
+ * Resolves the absolute path to the `mupdf-extract.mjs` ESM worker script.
468
+ *
469
+ * Works in both ts-jest context (`__dirname` = `src/`) and compiled context
470
+ * (`__dirname` = `dist/src/`) since both paths resolve to `<package>/`.
471
+ */
472
+ function resolveMupdfScript() {
473
+ //
474
+ // ts-jest: __dirname = …/agentic-api/src → 1 level up → package root
475
+ const fromSrc = path_1.default.resolve(__dirname, '..', 'mupdf-extract.mjs');
166
476
  //
167
- // Extract matter values with defaults
477
+ // post-build copy (cp mupdf-extract.mjs dist/):
478
+ // __dirname = …/agentic-api/dist/src → 1 level up → dist/
479
+ const fromDistFlat = path_1.default.resolve(__dirname, '..', 'mupdf-extract.mjs');
480
+ //
481
+ // fallback — repo root deployed without cp step:
482
+ // __dirname = …/agentic-api/dist/src → 2 levels up → package root
483
+ const fromDistRoot = path_1.default.resolve(__dirname, '..', '..', 'mupdf-extract.mjs');
484
+ for (const candidate of [fromSrc, fromDistFlat, fromDistRoot]) {
485
+ if (fs_1.default.existsSync(candidate))
486
+ return candidate;
487
+ }
488
+ throw new Error(`mupdf-extract.mjs not found. Searched:\n` +
489
+ ` ${fromSrc}\n ${fromDistFlat}\n ${fromDistRoot}`);
490
+ }
491
+ /**
492
+ * Extracts text, reconstructed tables, links, and optionally page-raster images
493
+ * from a PDF using the **mupdf** npm package (WASM build of the MuPDF C library).
494
+ *
495
+ * Key advantages over the poppler engine:
496
+ * - `table-hunt` detects tables geometrically even in **untagged** PDFs.
497
+ * - `segment` splits the page into logical reading-order blocks.
498
+ * - Significantly faster than pdfjs for large documents.
499
+ * - No shell binary dependency (pure WASM, runs anywhere Node.js does).
500
+ *
501
+ * Images (opt-in via `withImages: true`): each page is rasterised at 1.5× scale
502
+ * (≈ 113 DPI). The `imageFormat` option controls encoding:
503
+ *
504
+ * | format | size/page (base64) | notes |
505
+ * |-------------|-------------------|--------------------------------|
506
+ * | `'rgb'` | ≈ 4.4 MB | raw RGB, lossless, large |
507
+ * | `'gray'` | ≈ 1.5 MB | raw grayscale, 3× smaller |
508
+ * | `'jpeg'` | ≈ 100–200 KB | JPEG quality 75, 31× smaller |
509
+ *
510
+ * Disabled by default because image data quickly exhausts stdout buffers for
511
+ * large documents. Use `jpeg` for production with vision models.
512
+ *
513
+ * NOTE: `mupdf` is ESM-only. Extraction is delegated to a standalone
514
+ * `mupdf-extract.mjs` worker spawned via `execAsync`, which avoids any
515
+ * ESM/CJS interoperability issues in the main process and under ts-jest.
516
+ *
517
+ * @param {string} pdfPath - Absolute path to the PDF file.
518
+ * @param {object} [options]
519
+ * @param {boolean} [options.withImages=false] - Rasterise each page.
520
+ * @param {'rgb'|'gray'|'jpeg'} [options.imageFormat='rgb'] - Pixel encoding.
521
+ * @returns {Promise<Page[]>} One `Page` per PDF page with text, GFM tables, and optional images.
522
+ */
523
async function pdftotext_mupdf(pdfPath, options = {}) {
    //
    // Escape the characters that remain special inside POSIX double quotes
    // (`"`, `\`, `$`, backtick) so a pathological file path cannot break out
    // of the quoted argument — execAsync runs the command through a shell.
    const shellQuote = (s) => `"${String(s).replace(/([\\"$`])/g, '\\$1')}"`;
    const scriptPath = resolveMupdfScript();
    const fmt = options.imageFormat ?? 'rgb';
    const imageFlags = options.withImages ? ` --with-images --image-format=${fmt}` : '';
    //
    // maxBuffer scales with expected image size:
    //   jpeg ≈ 150KB/page, gray ≈ 1.5MB/page, rgb ≈ 4.4MB/page (base64).
    // 32 MB is plenty for text-only or jpeg; rgb on large docs needs more.
    const maxBuffer = options.withImages && fmt === 'rgb' ? 256 * 1024 * 1024 : 32 * 1024 * 1024;
    const { stdout } = await execAsync(`node ${shellQuote(scriptPath)} ${shellQuote(pdfPath)}${imageFlags}`, { maxBuffer });
    const result = JSON.parse(stdout);
    //
    // 1. Build raw pages — header/footer candidates kept separately
    const rawPages = result.pages.map(p => {
        let parsed;
        try {
            parsed = JSON.parse(p.json);
        }
        catch {
            // Malformed per-page JSON from the worker: degrade to an empty
            // page rather than aborting the whole document.
            parsed = {};
        }
        //
        // Block-by-block: each segment processed independently (multi-column safe).
        // pageHeight classifies top/bottom 12% as header/footer candidate zones.
        const { header: _rawHeader, body, footer: _rawFooter } = reconstructFromBlocks(parsed, p.pageHeight);
        let text = body;
        //
        // Guard with ?. — NOTE(review): the worker is presumed to always emit
        // a `links` array, but a missing field must not crash extraction.
        if ((p.links?.length ?? 0) > 0) {
            text += '\n\n## Liens\n\n' + p.links.map(u => `- ${u}`).join('\n');
        }
        // Optional page raster (only present when --with-images was passed);
        // base64 payload is decoded to a Buffer for downstream consumers.
        const images = p.image
            ? [{
                    type: p.image.type,
                    width: p.image.width,
                    height: p.image.height,
                    data: Buffer.from(p.image.data, 'base64'),
                }]
            : [];
        return { pageNumber: p.pageNumber, text, _rawHeader, _rawFooter, tables: detectTableStats(text), images };
    });
    //
    // 2. Promote header/footer candidates that repeat on ≥ 3 pages.
    //    Unique occurrences (e.g. first-page title) are re-injected into body.
    return detectRunningHeaders(rawPages);
}
567
+ // ─── PDF → Markdown ───────────────────────────────────────────────────────────
568
/**
 * Converts a PDF to a structured Markdown file.
 *
 * Pipeline:
 * 1. `pdftotext_mupdf` (or poppler) → `Page[]`
 * 2. `callLLMForParsingPDF` — MapLLM.reduce, one page per chunk
 * 3. Prepend a **single** YAML frontmatter block and write to `outputDir`.
 *
 * Model choice: `LOW-fast` is sufficient — mupdf output is already clean GFM;
 * the LLM only normalises headings and removes repeated headers/footers.
 * Use `MEDIUM-fast` for complex layouts that need heavier restructuring.
 *
 * @param outputDir - Directory for the output `.md` file.
 * @param pdf - Absolute path to the PDF file.
 * @param matter - Document metadata; defaults derived from filename.
 * @param model - LLM model alias (default: `'LOW-fast'`).
 * @param engine - Extraction backend (default: `'mupdf'`).
 * @returns `{ markdown, cost, outputPath }` — frontmatter-prefixed markdown,
 *          LLM spend reported by `callLLMForParsingPDF`, and the output path.
 */
async function pdf2markdown(outputDir, pdf, matter, model = 'LOW-fast', engine = 'mupdf') {
    const service = matter?.service || 'unknown';
    const title = matter?.title || path_1.default.basename(pdf, path_1.default.extname(pdf));
    //
    // Build complete FrontMatter with defaults
    const frontMatter = {
        title,
        service,
        author: matter?.author || '',
        role: matter?.role || 'rule',
    };
    const outputPath = path_1.default.join(outputDir, `${(0, utils_1.toSlug)(service.toLowerCase())}-${(0, utils_1.toSlug)(title)}.md`);
    try {
        //
        // 1. Extract pages (GFM text, tables, per-page link footer)
        const pages = engine === 'mupdf'
            ? await pdftotext_mupdf(pdf)
            : await pdftotext_poppler(pdf, outputDir);
        //
        // 2. LLM: format each page as clean Markdown (no frontmatter inside).
        //    `cost` is kept in the result: the previous public API returned it
        //    and callers reading `.cost` must keep working.
        const { markdown: body, cost } = await callLLMForParsingPDF(pdf, pages, [], model);
        //
        // 3. Prepend single YAML frontmatter and write
        const markdown = (0, utils_matter_1.matterSerialize)(body, frontMatter);
        fs_1.default.writeFileSync(outputPath, markdown);
        return { markdown, cost, outputPath };
    }
    catch (error) {
        console.error('Error during PDF parsing:', error);
        throw error;
    }
}