@msbayindir/context-rag 1.0.0-beta.3 → 1.0.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4,6 +4,7 @@ var zod = require('zod');
4
4
  var crypto = require('crypto');
5
5
  var generativeAi = require('@google/generative-ai');
6
6
  var server = require('@google/generative-ai/server');
7
+ var zodToJsonSchema = require('zod-to-json-schema');
7
8
  var fs = require('fs/promises');
8
9
  var path = require('path');
9
10
  var pdf = require('pdf-parse');
@@ -388,10 +389,10 @@ Analyze the document and return ONLY a JSON response with the following structur
388
389
  "Specific instruction 3 for this document type"
389
390
  ],
390
391
 
391
- "exampleFormats": {
392
- "example1": "How a specific format should look",
393
- "example2": "Another format example"
394
- },
392
+ "exampleFormats": [
393
+ { "element": "table", "format": "Markdown table with headers" },
394
+ { "element": "code", "format": "Code block with language tag" }
395
+ ],
395
396
 
396
397
  "chunkStrategy": {
397
398
  "maxTokens": 800,
@@ -413,92 +414,104 @@ IMPORTANT RULES:
413
414
 
414
415
  {{DOCUMENT_TYPE_HINT}}
415
416
  `;
416
- var BASE_EXTRACTION_TEMPLATE = `You are a document processing AI. Extract content following the EXACT format below.
417
+ var BASE_EXTRACTION_TEMPLATE = `You are a document processing AI. Extract content from the document with HIGH FIDELITY and DETAIL.
417
418
 
418
419
  ## OUTPUT FORMAT (MANDATORY - DO NOT MODIFY)
419
420
 
420
- Use this structure for EVERY content section:
421
+ \u26A0\uFE0F CRITICAL: You MUST use EXACTLY this marker format. Any deviation will cause parsing errors:
421
422
 
423
+ \`\`\`
422
424
  <!-- SECTION type="[TYPE]" page="[PAGE]" confidence="[0.0-1.0]" -->
423
425
  [Content here in Markdown format]
424
426
  <!-- /SECTION -->
427
+ \`\`\`
425
428
 
426
- ### Valid Types:
427
- - TEXT: Regular paragraphs and prose
428
- - TABLE: Data tables in Markdown format
429
- - LIST: Bullet (-) or numbered (1. 2. 3.) lists
430
- - HEADING: Section headers with # ## ### levels
431
- - CODE: Code blocks with language specification
432
- - QUOTE: Quoted text or citations
433
- - IMAGE_REF: Description of images, charts, figures
434
- - QUESTION: Multiple choice questions with options (A, B, C, D, E)
435
-
436
- ### Format Rules:
437
- 1. **Tables**: Use Markdown table format
438
- | Column1 | Column2 | Column3 |
439
- |---------|---------|---------|
440
- | data | data | data |
429
+ ### EXAMPLE OUTPUT (FOLLOW THIS EXACTLY):
430
+ \`\`\`
431
+ <!-- SECTION type="HEADING" page="1" confidence="0.95" -->
432
+ # Introduction to Metabolism
433
+ <!-- /SECTION -->
441
434
 
442
- 2. **Lists**: Use consistent format
443
- - Bullet item
444
- - Another bullet
445
-
446
- OR
447
-
448
- 1. Numbered item
449
- 2. Another numbered
435
+ <!-- SECTION type="TEXT" page="1" confidence="0.92" -->
436
+ Metabolism refers to all chemical reactions in an organism. It creates energy...
437
+ (Extract full paragraphs, do not break them up unnecessarily)
438
+ <!-- /SECTION -->
450
439
 
451
- 3. **Headings**: Maximum 3 levels, use hierarchy
452
- # Main Section
453
- ## Subsection
454
- ### Sub-subsection
440
+ <!-- SECTION type="LIST" page="2" confidence="0.90" -->
441
+ - First item in the list
442
+ - Second item in the list
443
+ - Third item in the list
444
+ <!-- /SECTION -->
455
445
 
456
- 4. **Code**: Specify language
457
- \`\`\`python
458
- code here
459
- \`\`\`
446
+ <!-- SECTION type="TABLE" page="2" confidence="0.88" -->
447
+ | Column1 | Column2 |
448
+ |---------|---------|
449
+ | Data1 | Data2 |
450
+ <!-- /SECTION -->
451
+ \`\`\`
460
452
 
461
- 5. **Images**: Describe visual content
462
- [IMAGE: Description of what the image shows]
453
+ ### Valid Types:
454
+ - TEXT: Regular paragraphs and prose. **PREFER THIS** for standard text.
455
+ - TABLE: **ONLY** for explicit data tables in the source.
456
+ - LIST: **ONLY** for explicit bulleted/numbered lists in source.
457
+ - HEADING: Section headers with # ## ### levels.
458
+ - CODE: Code blocks with language specification.
459
+ - QUOTE: Quoted text or citations.
460
+ - IMAGE_REF: Description of images, charts, figures.
461
+ - QUESTION: Multiple choice questions.
463
462
 
464
- 6. **Questions**: Multiple choice questions with options
465
- **Question 1:** Question text here?
466
- A) Option A text
467
- B) Option B text
468
- C) Option C text
469
- D) Option D text
470
- E) Option E text (if exists)
471
- **Answer:** [Letter] (if answer is provided in document)
463
+ ### Format Rules:
464
+ 1. **Tables**: Use Markdown table format.
465
+ 2. **Lists**: Use consistent format (bullets or numbers).
466
+ 3. **Headings**: Use Markdown headers (#, ##, ###).
467
+ 4. **Code**: Use fenced code blocks with language.
468
+ 5. **Images**: Describe visual content clearly.
472
469
 
473
470
  ## DOCUMENT-SPECIFIC INSTRUCTIONS
474
471
  {{DOCUMENT_INSTRUCTIONS}}
475
472
 
476
473
  ## CRITICAL EXTRACTION RULES (DO NOT VIOLATE)
477
- \u26A0\uFE0F These rules are MANDATORY for legal, medical, and financial document accuracy:
478
474
 
479
475
  1. **NO SUMMARIZATION**: Extract content EXACTLY as written. Do not summarize, paraphrase, or condense.
480
- 2. **NO INTERPRETATION**: Do not interpret, explain, or add commentary to the content.
481
- 3. **PRESERVE ORIGINAL WORDING**: Keep exact terminology, especially for:
482
- - Legal terms, clauses, and article references
483
- - Medical terminology, diagnoses, and prescriptions
484
- - Financial figures, percentages, and calculations
485
- - Technical specifications and measurements
486
- 4. **VERBATIM EXTRACTION**: Copy text word-for-word from the document.
487
- 5. **NO OMISSIONS**: Include all content, even if it seems redundant or repetitive.
488
- 6. **UNCLEAR CONTENT**: If text is unclear or illegible, extract as-is and mark: [UNCLEAR: partial text visible]
489
- 7. **FOREIGN TERMS**: Keep foreign language terms, Latin phrases, and abbreviations exactly as written.
476
+ 2. **PRESERVE FLOW**: **DO NOT** break continuous text into lists unless it is explicitly a list in the source. Keep paragraphs together.
477
+ 3. **AVOID OVER-SEGMENTATION**: Combine related sentences into single TEXT blocks. Do not create a new section for every sentence.
478
+ 4. **PRESERVE ORIGINAL WORDING**: Keep exact terminology, especially for technical, medical, or legal terms.
479
+ 5. **NO INTERPRETATION**: Do not interpret or explain the content. Just extract it.
480
+ 6. **UNCLEAR CONTENT**: If text is unclear, mark: [UNCLEAR: partial text].
481
+ 7. **FOREIGN TERMS**: Keep foreign language terms exactly as written.
490
482
 
491
483
  ## PROCESSING RULES
492
- - Extract ALL content completely, do not summarize or skip
493
- - Preserve original document structure and hierarchy
494
- - Include page references for each section
495
- - Maintain technical accuracy and terminology
496
- - Use appropriate confidence scores based on extraction quality
497
- - If content spans multiple pages, use the starting page number
484
+ - Extract ALL content completely.
485
+ - Preserve original document structure and hierarchy.
486
+ - Include page references for each section.
487
+ - If content spans multiple pages, use the starting page number.
498
488
 
499
489
  ## PAGE RANGE
500
490
  {{PAGE_RANGE}}
501
491
  `;
492
+ var STRUCTURED_EXTRACTION_TEMPLATE = `You are a document processing AI. Extract content from the provided document pages.
493
+
494
+ Your goal is to extract content accurately, preserving the logical structure and semantics.
495
+
496
+ ## INSTRUCTIONS
497
+ {{DOCUMENT_INSTRUCTIONS}}
498
+
499
+ ## CRITICAL RULES (DO NOT VIOLATE)
500
+ 1. **NO SUMMARIZATION**: Extract content EXACTLY as written. Do not summarize, paraphrase, or condense.
501
+ 2. **PRESERVE FLOW**: **DO NOT** break continuous text into lists unless it is explicitly a list in the source. Keep paragraphs together.
502
+ 3. **AVOID OVER-SEGMENTATION**: Combine related sentences into single TEXT blocks. Do not create a new section for every sentence.
503
+ 4. **PRESERVE ORIGINAL WORDING**: Keep exact terminology, especially for technical, medical, or legal terms.
504
+ 5. **NO INTERPRETATION**: Do not interpret or explain the content. Just extract it.
505
+
506
+ ## PAGE RANGE
507
+ {{PAGE_RANGE}}
508
+
509
+ IMPORTANT:
510
+ 1. Extract content strictly from the specified page range.
511
+ 2. Maintain the order of elements as they appear in the document.
512
+ 3. Don't summarize code blocks or tables; extract them fully.
513
+ 4. Follow the specific document instructions provided above.
514
+ `;
502
515
  var DEFAULT_DOCUMENT_INSTRUCTIONS = `
503
516
  - Extract all text content preserving structure
504
517
  - Convert tables to Markdown table format
@@ -507,12 +520,21 @@ var DEFAULT_DOCUMENT_INSTRUCTIONS = `
507
520
  - Note any images with descriptive text
508
521
  - Maintain the logical flow of content
509
522
  `;
510
- function buildExtractionPrompt(documentInstructions, exampleFormats, pageStart, pageEnd) {
523
+ function buildExtractionPrompt(documentInstructions, exampleFormats, pageStart, pageEnd, useStructuredOutput = false) {
511
524
  let instructionsBlock = documentInstructions.map((instruction) => `- ${instruction}`).join("\n");
512
- if (exampleFormats && Object.keys(exampleFormats).length > 0) {
525
+ let formats = [];
526
+ if (Array.isArray(exampleFormats)) {
527
+ formats = exampleFormats;
528
+ } else if (exampleFormats) {
529
+ formats = Object.entries(exampleFormats).map(([key, value]) => ({
530
+ element: key,
531
+ format: value
532
+ }));
533
+ }
534
+ if (formats.length > 0) {
513
535
  instructionsBlock += "\n\n### Example Formats:\n";
514
- for (const [key, value] of Object.entries(exampleFormats)) {
515
- instructionsBlock += `- **${key}**: \`${value}\`
536
+ for (const example of formats) {
537
+ instructionsBlock += `- **${example.element}**: \`${example.format}\`
516
538
  `;
517
539
  }
518
540
  }
@@ -524,7 +546,8 @@ function buildExtractionPrompt(documentInstructions, exampleFormats, pageStart,
524
546
  pageRange = `Process pages ${pageStart}-${pageEnd} of this document.`;
525
547
  }
526
548
  }
527
- return BASE_EXTRACTION_TEMPLATE.replace("{{DOCUMENT_INSTRUCTIONS}}", instructionsBlock || DEFAULT_DOCUMENT_INSTRUCTIONS).replace("{{PAGE_RANGE}}", pageRange);
549
+ const template = useStructuredOutput ? STRUCTURED_EXTRACTION_TEMPLATE : BASE_EXTRACTION_TEMPLATE;
550
+ return template.replace("{{DOCUMENT_INSTRUCTIONS}}", instructionsBlock || DEFAULT_DOCUMENT_INSTRUCTIONS).replace("{{PAGE_RANGE}}", pageRange);
528
551
  }
529
552
  function buildDiscoveryPrompt(documentTypeHint) {
530
553
  let hint = "";
@@ -534,7 +557,7 @@ Hint: The user expects this to be a "${documentTypeHint}" document. Consider thi
534
557
  }
535
558
  return DISCOVERY_TEMPLATE.replace("{{DOCUMENT_TYPE_HINT}}", hint);
536
559
  }
537
- var SECTION_PATTERN = /<!-- SECTION type="(\w+)" page="(\d+)" confidence="([\d.]+)" -->\n?([\s\S]*?)\n?<!-- \/SECTION -->/g;
560
+ var SECTION_PATTERN = /<!-- SECTION (?:type=")?(\w+)"? page="(\d+)" confidence="([\d.]+)" -->\n?([\s\S]*?)\n?<!-- \/SECTION -->/g;
538
561
 
539
562
  // src/types/enums.ts
540
563
  var ChunkTypeEnum = {
@@ -1196,7 +1219,7 @@ var ChunkRepository = class {
1196
1219
  const result = await this.prisma.$queryRaw`
1197
1220
  INSERT INTO context_rag_chunks (
1198
1221
  id, prompt_config_id, document_id, chunk_index, chunk_type,
1199
- search_content, search_vector, display_content,
1222
+ search_content, enriched_content, context_text, search_vector, display_content,
1200
1223
  source_page_start, source_page_end, confidence_score, metadata, created_at
1201
1224
  ) VALUES (
1202
1225
  gen_random_uuid(),
@@ -1205,6 +1228,8 @@ var ChunkRepository = class {
1205
1228
  ${input.chunkIndex},
1206
1229
  ${input.chunkType},
1207
1230
  ${input.searchContent},
1231
+ ${input.enrichedContent ?? null},
1232
+ ${input.contextText ?? null},
1208
1233
  ${embedding}::vector,
1209
1234
  ${input.displayContent},
1210
1235
  ${input.sourcePageStart},
@@ -1235,7 +1260,7 @@ var ChunkRepository = class {
1235
1260
  const result = await tx.$queryRaw`
1236
1261
  INSERT INTO context_rag_chunks (
1237
1262
  id, prompt_config_id, document_id, chunk_index, chunk_type,
1238
- search_content, search_vector, display_content,
1263
+ search_content, enriched_content, context_text, search_vector, display_content,
1239
1264
  source_page_start, source_page_end, confidence_score, metadata, created_at
1240
1265
  ) VALUES (
1241
1266
  gen_random_uuid(),
@@ -1244,6 +1269,8 @@ var ChunkRepository = class {
1244
1269
  ${input.chunkIndex},
1245
1270
  ${input.chunkType},
1246
1271
  ${input.searchContent},
1272
+ ${input.enrichedContent ?? null},
1273
+ ${input.contextText ?? null},
1247
1274
  ${embedding}::vector,
1248
1275
  ${input.displayContent},
1249
1276
  ${input.sourcePageStart},
@@ -1453,6 +1480,121 @@ async function getDatabaseStats(prisma) {
1453
1480
  });
1454
1481
  }
1455
1482
  }
/**
 * Zod schemas describing the JSON payloads requested from the model.
 * Key order inside each object is kept stable because it determines the
 * property order of the JSON schema derived from these definitions.
 */

/** Closed set of section/chunk kinds the model may emit. */
var ChunkTypeSchema = zod.z
  .enum(["TEXT", "TABLE", "LIST", "HEADING", "CODE", "QUOTE", "IMAGE_REF", "QUESTION", "MIXED"])
  .describe("The type of content in this section. PREFER 'TEXT' for regular paragraphs. Only use 'LIST' for actual bulleted/numbered lists. Only use 'TABLE' for actual tables.");

/** One extracted section: its kind, source page, confidence and Markdown body. */
var SectionSchema = zod.z.object({
  // Content type
  type: ChunkTypeSchema,
  // Source page number (1-indexed)
  page: zod.z
    .number()
    .int()
    .min(1)
    .describe("The page number where this content starts."),
  // Extraction confidence score (0.0-1.0)
  confidence: zod.z
    .number()
    .min(0)
    .max(1)
    .describe("Confidence score (0.0-1.0)"),
  // Extracted content in Markdown format (must be non-empty)
  content: zod.z
    .string()
    .min(1)
    .describe("The full extracted content in Markdown. DO NOT summarize. DO NOT break paragraphs into lists. Maintain the original flow.")
});

/** Top-level shape of an extraction response: a flat list of sections. */
var SectionArraySchema = zod.z.array(SectionSchema);

/** Structural element kinds reported by the discovery step. */
var DetectedElementTypeSchema = zod.z.enum(["table", "list", "code", "image", "chart", "form", "heading"]);

/** A detected element kind together with how often / where it was seen. */
var DetectedElementSchema = zod.z.object({
  // Element type
  type: DetectedElementTypeSchema,
  // Approximate count
  count: zod.z.number().int().min(0),
  // Example locations (page numbers)
  examples: zod.z.array(zod.z.number()).optional()
});

/** Chunking recommendation; every field falls back to a default when omitted. */
var ChunkStrategySchema = zod.z.object({
  // Maximum tokens per chunk (100..2000, default 500)
  maxTokens: zod.z.number().int().min(100).max(2000).default(500),
  // Split method
  splitBy: zod.z.enum(["semantic", "page", "paragraph", "section"]).default("semantic"),
  // Preserve tables as single chunks
  preserveTables: zod.z.boolean().default(true),
  // Preserve lists as single chunks
  preserveLists: zod.z.boolean().default(true)
});

/** Full discovery (document analysis) response. */
var DiscoveryResponseSchema = zod.z.object({
  // Detected document type (e.g., 'Medical', 'Legal')
  documentType: zod.z.string().min(1),
  // Human-readable document type name
  documentTypeName: zod.z.string().min(1),
  // Document language (e.g., 'tr', 'en')
  language: zod.z.string().optional(),
  // Document complexity assessment
  complexity: zod.z.enum(["low", "medium", "high"]).optional(),
  // Detected elements in document (defaults to an empty list)
  detectedElements: zod.z.array(DetectedElementSchema).default([]),
  // Document-specific extraction instructions
  specialInstructions: zod.z.array(zod.z.string()),
  // Example formats for each element type
  exampleFormats: zod.z
    .array(
      zod.z.object({
        element: zod.z.string(),
        format: zod.z.string()
      })
    )
    .optional(),
  // Recommended chunk strategy
  chunkStrategy: ChunkStrategySchema.optional(),
  // Detection confidence (0.0-1.0)
  confidence: zod.z.number().min(0).max(1),
  // AI reasoning for the analysis
  reasoning: zod.z.string()
});

/** Response shape for per-chunk context generation. */
var ContextGenerationSchema = zod.z.object({
  // Generated context text
  context: zod.z.string(),
  // Confidence in the generated context
  confidence: zod.z.number().min(0).max(1).optional()
});
/**
 * Convert a Zod schema into the JSON-schema object sent as `responseSchema`.
 *
 * @param {object} zodSchema - Zod schema to convert.
 * @returns {object} Draft-07 style JSON schema after `cleanSchemaForGemini`.
 */
function zodToGeminiSchema(zodSchema) {
  const conversionOptions = {
    target: "jsonSchema7",
    // Critical: Gemini doesn't support $ref in responseSchema, so every
    // sub-schema is inlined instead of referenced.
    $refStrategy: "none",
    definitionPath: "$defs"
  };
  const rawJsonSchema = zodToJsonSchema.zodToJsonSchema(zodSchema, conversionOptions);
  return cleanSchemaForGemini(rawJsonSchema);
}
/**
 * Return a deep copy of a JSON schema with the keywords `additionalProperties`,
 * `$schema`, `title` and `default` removed at every schema level.
 *
 * Keys are only stripped where they act as schema KEYWORDS. Inside maps whose
 * keys are user-defined names (`properties`, `patternProperties`, `$defs`,
 * `definitions`) every entry is kept, so a field that happens to be called
 * `title` or `default` is no longer dropped from the schema while still being
 * listed in `required`.
 *
 * The input is never mutated. Non-object values (primitives, `null`) are
 * returned unchanged; arrays are cleaned element by element.
 *
 * @param {*} schema - JSON schema (or any fragment of one).
 * @returns {*} Cleaned copy of `schema`.
 */
function cleanSchemaForGemini(schema) {
  const strippedKeywords = new Set(["additionalProperties", "$schema", "title", "default"]);
  // Keywords whose value is a name -> sub-schema map rather than a schema.
  const nameMapKeywords = new Set(["properties", "patternProperties", "$defs", "definitions"]);

  const clean = (node, isNameMap) => {
    if (typeof node !== "object" || node === null) {
      return node;
    }
    if (Array.isArray(node)) {
      // Explicit arrow: Array#map would otherwise pass the index as 2nd argument.
      return node.map((item) => clean(item, false));
    }
    const cleaned = {};
    for (const [key, value] of Object.entries(node)) {
      if (isNameMap) {
        // `key` is a user-defined name; its value is an ordinary schema.
        cleaned[key] = clean(value, false);
        continue;
      }
      if (strippedKeywords.has(key)) {
        continue;
      }
      cleaned[key] = clean(value, nameMapKeywords.has(key));
    }
    return cleaned;
  };

  return clean(schema, false);
}
// Module-load side effect: each schema is converted once, in this order, and
// the results are discarded (nothing in view reads them). Kept so that load-time
// behavior is unchanged.
for (const eagerSchema of [
  SectionArraySchema, // section extraction
  DiscoveryResponseSchema, // discovery response
  ContextGenerationSchema // context generation
]) {
  zodToGeminiSchema(eagerSchema);
}
1596
+
1597
+ // src/services/gemini.service.ts
1456
1598
  var GeminiService = class {
1457
1599
  genAI;
1458
1600
  fileManager;
@@ -1731,6 +1873,111 @@ ${userContent}` }]
1731
1873
  throw error;
1732
1874
  }
1733
1875
  }
1876
+ /**
1877
+ * Generate structured data from text prompt
1878
+ */
1879
+ async generateStructured(prompt, schema, options) {
1880
+ return this.executeStructuredRetry(
1881
+ [{ role: "user", parts: [{ text: prompt }] }],
1882
+ schema,
1883
+ options
1884
+ );
1885
+ }
1886
+ /**
1887
+ * Generate structured data from PDF
1888
+ */
1889
+ async generateStructuredWithPdf(pdfUri, prompt, schema, options) {
1890
+ return this.executeStructuredRetry(
1891
+ [
1892
+ {
1893
+ role: "user",
1894
+ parts: [
1895
+ { fileData: { mimeType: "application/pdf", fileUri: pdfUri } },
1896
+ { text: prompt }
1897
+ ]
1898
+ }
1899
+ ],
1900
+ schema,
1901
+ options
1902
+ );
1903
+ }
1904
+ /**
1905
+ * Execute structured generation with retry logic
1906
+ */
1907
+ async executeStructuredRetry(contents, schema, options) {
1908
+ const maxRetries = options?.maxRetries ?? 2;
1909
+ let attempt = 0;
1910
+ let lastError;
1911
+ const currentContents = [...contents];
1912
+ while (attempt <= maxRetries) {
1913
+ attempt++;
1914
+ await this.rateLimiter.acquire();
1915
+ try {
1916
+ const result = await this.model.generateContent({
1917
+ contents: currentContents,
1918
+ generationConfig: {
1919
+ responseMimeType: "application/json",
1920
+ // Cast to any because the new schema format might have slight type mismatch
1921
+ // but is valid for the API
1922
+ responseSchema: zodToGeminiSchema(schema),
1923
+ temperature: options?.temperature ?? 0.2,
1924
+ maxOutputTokens: options?.maxOutputTokens
1925
+ }
1926
+ });
1927
+ const response = result.response;
1928
+ const text = response.text();
1929
+ const usage = response.usageMetadata;
1930
+ this.rateLimiter.reportSuccess();
1931
+ try {
1932
+ const parsed = JSON.parse(text);
1933
+ const data = schema.parse(parsed);
1934
+ return {
1935
+ data,
1936
+ tokenUsage: {
1937
+ input: usage?.promptTokenCount ?? 0,
1938
+ output: usage?.candidatesTokenCount ?? 0,
1939
+ total: usage?.totalTokenCount ?? 0
1940
+ }
1941
+ };
1942
+ } catch (e) {
1943
+ const errorMessage = e instanceof Error ? e.message : String(e);
1944
+ const snippet = text.length > 500 ? text.substring(0, 200) + "...[truncated]..." + text.substring(text.length - 200) : text;
1945
+ this.logger.warn(`Structured validation failed (attempt ${attempt}/${maxRetries + 1})`, {
1946
+ error: errorMessage,
1947
+ snippet: text.substring(0, 100)
1948
+ });
1949
+ lastError = new Error(`Structured output validation failed: ${errorMessage}. Raw response snippet: ${snippet}`);
1950
+ if (attempt <= maxRetries) {
1951
+ currentContents.push({
1952
+ role: "model",
1953
+ parts: [{ text }]
1954
+ });
1955
+ currentContents.push({
1956
+ role: "user",
1957
+ parts: [{ text: `JSON Validation Error: ${errorMessage}
1958
+
1959
+ Please fix the JSON output to match the schema exactly.` }]
1960
+ });
1961
+ continue;
1962
+ }
1963
+ throw lastError;
1964
+ }
1965
+ } catch (error) {
1966
+ try {
1967
+ this.handleError(error);
1968
+ } catch (handledError) {
1969
+ throw handledError;
1970
+ }
1971
+ lastError = error;
1972
+ if (attempt <= maxRetries) {
1973
+ this.logger.warn(`Gemini API error (attempt ${attempt}/${maxRetries + 1}), retrying...`, { error: error.message });
1974
+ continue;
1975
+ }
1976
+ throw error;
1977
+ }
1978
+ }
1979
+ throw lastError;
1980
+ }
1734
1981
  /**
1735
1982
  * Handle API errors
1736
1983
  */
@@ -1864,11 +2111,22 @@ var AnthropicHandler = class {
1864
2111
  gemini;
1865
2112
  limit;
1866
2113
  skipTypes;
1867
- constructor(config, gemini) {
2114
+ constructor(config, mainGemini, resolvedConfig) {
1868
2115
  this.config = config;
1869
- this.gemini = gemini;
1870
2116
  this.limit = pLimit__default.default(config.concurrencyLimit ?? DEFAULT_ANTHROPIC_CONFIG.concurrencyLimit);
1871
2117
  this.skipTypes = new Set(config.skipChunkTypes ?? DEFAULT_ANTHROPIC_CONFIG.skipChunkTypes);
2118
+ if (config.model && config.model !== resolvedConfig.model) {
2119
+ console.log(`[AnthropicHandler] Using separate model for enhancement: ${config.model}`);
2120
+ const enhancementConfig = {
2121
+ ...resolvedConfig,
2122
+ model: config.model
2123
+ };
2124
+ const rateLimiter = new RateLimiter(resolvedConfig.rateLimitConfig);
2125
+ const logger = createLogger(resolvedConfig.logging);
2126
+ this.gemini = new GeminiService(enhancementConfig, rateLimiter, logger);
2127
+ } else {
2128
+ this.gemini = mainGemini;
2129
+ }
1872
2130
  }
1873
2131
  shouldSkip(chunkType) {
1874
2132
  return this.skipTypes.has(chunkType);
@@ -1916,15 +2174,18 @@ ${doc.fullDocumentText.slice(0, 15e3)}
1916
2174
  ${chunk.content}
1917
2175
  </chunk_to_contextualize>
1918
2176
 
1919
- Bu chunk'\u0131n belgede nerede oldu\u011Funu ve ne hakk\u0131nda oldu\u011Funu 1-2 c\xFCmle ile T\xFCrk\xE7e a\xE7\u0131kla:`;
2177
+ Bu i\xE7eri\u011Fin belgenin genel ak\u0131\u015F\u0131 i\xE7indeki yerini, ba\u011Fl\u0131 oldu\u011Fu ana ba\u015Fl\u0131klar\u0131 ve ele ald\u0131\u011F\u0131 konuyu detayl\u0131 bir \u015Fekilde \xF6zetle. \u0130\xE7eri\u011Fin ne oldu\u011Funu de\u011Fil, ba\u011Flam\u0131n\u0131 anlat:`;
1920
2178
  try {
1921
2179
  if (doc.fileUri) {
1922
- const chunkPrompt = `Bu chunk'\u0131n belgede nerede oldu\u011Funu ve ne hakk\u0131nda oldu\u011Funu 1-2 c\xFCmle ile T\xFCrk\xE7e a\xE7\u0131kla:
2180
+ const chunkPrompt = `Bu i\xE7eri\u011Fin belgenin genel ak\u0131\u015F\u0131 i\xE7indeki yerini, ba\u011Fl\u0131 oldu\u011Fu ana ba\u015Fl\u0131klar\u0131 ve ele ald\u0131\u011F\u0131 konuyu detayl\u0131 bir \u015Fekilde \xF6zetle. \u0130\xE7eri\u011Fin ne oldu\u011Funu de\u011Fil, ba\u011Flam\u0131n\u0131 anlat:
1923
2181
 
1924
2182
  <chunk>
1925
2183
  ${chunk.content}
1926
2184
  </chunk>`;
1927
- const result2 = await this.gemini.generateWithPdfUri(doc.fileUri, chunkPrompt);
2185
+ const result2 = await this.gemini.generateWithPdfUri(doc.fileUri, chunkPrompt, {
2186
+ maxOutputTokens: 2048,
2187
+ temperature: 0.3
2188
+ });
1928
2189
  return result2.text;
1929
2190
  }
1930
2191
  const result = await this.gemini.generateSimple(fullPrompt);
@@ -1943,7 +2204,7 @@ function createEnhancementHandler(config, _resolvedConfig, gemini) {
1943
2204
  }
1944
2205
  switch (config.approach) {
1945
2206
  case "anthropic_contextual":
1946
- return new AnthropicHandler(config, gemini);
2207
+ return new AnthropicHandler(config, gemini, _resolvedConfig);
1947
2208
  case "google_grounding":
1948
2209
  throw new Error("Google Grounding is not yet implemented");
1949
2210
  case "custom":
@@ -2168,24 +2429,50 @@ var IngestionEngine = class {
2168
2429
  try {
2169
2430
  const result = await withRetry(
2170
2431
  async () => {
2171
- const prompt = buildExtractionPrompt(
2172
- documentInstructions,
2173
- exampleFormats,
2174
- batch.pageStart,
2175
- batch.pageEnd
2176
- );
2177
- const fullPrompt = `${prompt}
2432
+ const useStructured = this.config.useStructuredOutput;
2433
+ const getPrompt = (structured) => {
2434
+ const basePrompt = buildExtractionPrompt(
2435
+ documentInstructions,
2436
+ exampleFormats,
2437
+ batch.pageStart,
2438
+ batch.pageEnd,
2439
+ structured
2440
+ );
2441
+ return `${basePrompt}
2178
2442
 
2179
2443
  IMPORTANT: You have the FULL document. Restrict your extraction STRICTLY to pages ${batch.pageStart} to ${batch.pageEnd}. Do not extract content from other pages.`;
2180
- const response = await this.gemini.generateWithPdfUri(
2444
+ };
2445
+ if (useStructured) {
2446
+ try {
2447
+ const structuredResponse = await this.gemini.generateStructuredWithPdf(
2448
+ fileUri,
2449
+ getPrompt(true),
2450
+ SectionArraySchema,
2451
+ {
2452
+ temperature: this.config.generationConfig?.temperature,
2453
+ maxOutputTokens: this.config.generationConfig?.maxOutputTokens
2454
+ }
2455
+ );
2456
+ this.logger.debug("Structured extraction success", {
2457
+ batchId: batch.id,
2458
+ chunkCount: structuredResponse.data.length
2459
+ });
2460
+ return structuredResponse;
2461
+ } catch (structuredError) {
2462
+ this.logger.warn("Structured extraction failed, falling back to legacy parsing", {
2463
+ batchId: batch.id,
2464
+ error: structuredError.message
2465
+ });
2466
+ }
2467
+ }
2468
+ return await this.gemini.generateWithPdfUri(
2181
2469
  fileUri,
2182
- fullPrompt,
2470
+ getPrompt(false),
2183
2471
  {
2184
2472
  temperature: this.config.generationConfig?.temperature,
2185
2473
  maxOutputTokens: this.config.generationConfig?.maxOutputTokens
2186
2474
  }
2187
2475
  );
2188
- return response;
2189
2476
  },
2190
2477
  {
2191
2478
  ...retryOptions,
@@ -2206,13 +2493,40 @@ var IngestionEngine = class {
2206
2493
  }
2207
2494
  }
2208
2495
  );
2209
- const chunks = this.parseContentToChunks(
2210
- result.text,
2211
- promptConfigId,
2212
- documentId,
2213
- batch.pageStart,
2214
- batch.pageEnd
2215
- );
2496
+ let chunks;
2497
+ if ("data" in result && Array.isArray(result.data)) {
2498
+ const sections = result.data;
2499
+ chunks = sections.map((section, index) => ({
2500
+ promptConfigId,
2501
+ documentId,
2502
+ chunkIndex: index,
2503
+ chunkType: section.type,
2504
+ searchContent: cleanForSearch(section.content),
2505
+ displayContent: section.content,
2506
+ sourcePageStart: section.page,
2507
+ sourcePageEnd: section.page,
2508
+ confidenceScore: section.confidence,
2509
+ metadata: {
2510
+ type: section.type,
2511
+ pageRange: { start: section.page, end: section.page },
2512
+ confidence: {
2513
+ score: section.confidence,
2514
+ category: section.confidence >= 0.8 ? "HIGH" : section.confidence >= 0.5 ? "MEDIUM" : "LOW"
2515
+ },
2516
+ parsedWithStructuredMarkers: true,
2517
+ parsingMethod: "gemini_response_schema"
2518
+ }
2519
+ }));
2520
+ } else {
2521
+ const textResponse = result;
2522
+ chunks = this.parseContentToChunks(
2523
+ textResponse.text,
2524
+ promptConfigId,
2525
+ documentId,
2526
+ batch.pageStart,
2527
+ batch.pageEnd
2528
+ );
2529
+ }
2216
2530
  const docContext = {
2217
2531
  documentType: void 0,
2218
2532
  // Inferred from processing
@@ -2525,34 +2839,53 @@ var DiscoveryEngine = class {
2525
2839
  const { buffer, metadata } = await this.pdfProcessor.load(options.file);
2526
2840
  const fileUri = await this.gemini.uploadPdfBuffer(buffer, metadata.filename);
2527
2841
  const prompt = buildDiscoveryPrompt(options.documentTypeHint);
2528
- const response = await this.gemini.generateWithPdfUri(fileUri, prompt);
2529
2842
  let analysisResult;
2530
2843
  try {
2531
- let jsonStr = response.text;
2532
- const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
2533
- if (jsonMatch?.[1]) {
2534
- jsonStr = jsonMatch[1];
2535
- }
2536
- analysisResult = JSON.parse(jsonStr);
2537
- if (!analysisResult.documentType) {
2538
- throw new Error("Missing documentType in response");
2539
- }
2540
- if (!Array.isArray(analysisResult.specialInstructions)) {
2541
- analysisResult.specialInstructions = this.getDefaultInstructions();
2542
- }
2543
- } catch (parseError) {
2544
- this.logger.warn("Failed to parse discovery response as JSON, using defaults", {
2545
- error: parseError.message
2546
- });
2844
+ const response = await this.gemini.generateStructuredWithPdf(
2845
+ fileUri,
2846
+ prompt,
2847
+ DiscoveryResponseSchema
2848
+ );
2547
2849
  analysisResult = {
2548
- documentType: options.documentTypeHint ?? "General",
2549
- documentTypeName: options.documentTypeHint ?? "General Document",
2550
- detectedElements: [],
2551
- specialInstructions: this.getDefaultInstructions(),
2552
- chunkStrategy: DEFAULT_CHUNK_STRATEGY,
2553
- confidence: 0.5,
2554
- reasoning: "Failed to parse AI response, using default configuration"
2850
+ ...response.data,
2851
+ detectedElements: response.data.detectedElements ?? []
2555
2852
  };
2853
+ this.logger.debug("Structured discovery response received", {
2854
+ documentType: analysisResult.documentType,
2855
+ confidence: analysisResult.confidence
2856
+ });
2857
+ } catch (structuredError) {
2858
+ this.logger.warn("Structured output failed, trying legacy parsing", {
2859
+ error: structuredError.message
2860
+ });
2861
+ try {
2862
+ const response = await this.gemini.generateWithPdfUri(fileUri, prompt);
2863
+ let jsonStr = response.text;
2864
+ const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
2865
+ if (jsonMatch?.[1]) {
2866
+ jsonStr = jsonMatch[1];
2867
+ }
2868
+ const parsed = JSON.parse(jsonStr);
2869
+ analysisResult = DiscoveryResponseSchema.parse(parsed);
2870
+ } catch (legacyError) {
2871
+ this.logger.warn("All parsing methods failed, using defaults", {
2872
+ error: legacyError.message
2873
+ });
2874
+ analysisResult = {
2875
+ documentType: options.documentTypeHint ?? "General",
2876
+ documentTypeName: options.documentTypeHint ?? "General Document",
2877
+ detectedElements: [],
2878
+ specialInstructions: this.getDefaultInstructions(),
2879
+ chunkStrategy: {
2880
+ maxTokens: DEFAULT_CHUNK_STRATEGY.maxTokens,
2881
+ splitBy: DEFAULT_CHUNK_STRATEGY.splitBy,
2882
+ preserveTables: DEFAULT_CHUNK_STRATEGY.preserveTables,
2883
+ preserveLists: DEFAULT_CHUNK_STRATEGY.preserveLists
2884
+ },
2885
+ confidence: 0.5,
2886
+ reasoning: "Failed to parse AI response, using default configuration"
2887
+ };
2888
+ }
2556
2889
  }
2557
2890
  const discoveryResult = {
2558
2891
  id: correlationId,
@@ -2693,7 +3026,9 @@ var ContextRAG = class {
2693
3026
  logging: {
2694
3027
  ...DEFAULT_LOG_CONFIG,
2695
3028
  ...userConfig.logging
2696
- }
3029
+ },
3030
+ ragEnhancement: userConfig.ragEnhancement,
3031
+ useStructuredOutput: userConfig.useStructuredOutput ?? true
2697
3032
  };
2698
3033
  }
2699
3034
  /**