@msbayindir/context-rag 1.0.0-beta.3 → 1.0.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +455 -120
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +12 -1
- package/dist/index.d.ts +12 -1
- package/dist/index.js +455 -120
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.cjs
CHANGED
|
@@ -4,6 +4,7 @@ var zod = require('zod');
|
|
|
4
4
|
var crypto = require('crypto');
|
|
5
5
|
var generativeAi = require('@google/generative-ai');
|
|
6
6
|
var server = require('@google/generative-ai/server');
|
|
7
|
+
var zodToJsonSchema = require('zod-to-json-schema');
|
|
7
8
|
var fs = require('fs/promises');
|
|
8
9
|
var path = require('path');
|
|
9
10
|
var pdf = require('pdf-parse');
|
|
@@ -388,10 +389,10 @@ Analyze the document and return ONLY a JSON response with the following structur
|
|
|
388
389
|
"Specific instruction 3 for this document type"
|
|
389
390
|
],
|
|
390
391
|
|
|
391
|
-
"exampleFormats":
|
|
392
|
-
"
|
|
393
|
-
"
|
|
394
|
-
|
|
392
|
+
"exampleFormats": [
|
|
393
|
+
{ "element": "table", "format": "Markdown table with headers" },
|
|
394
|
+
{ "element": "code", "format": "Code block with language tag" }
|
|
395
|
+
],
|
|
395
396
|
|
|
396
397
|
"chunkStrategy": {
|
|
397
398
|
"maxTokens": 800,
|
|
@@ -413,92 +414,104 @@ IMPORTANT RULES:
|
|
|
413
414
|
|
|
414
415
|
{{DOCUMENT_TYPE_HINT}}
|
|
415
416
|
`;
|
|
416
|
-
var BASE_EXTRACTION_TEMPLATE = `You are a document processing AI. Extract content
|
|
417
|
+
var BASE_EXTRACTION_TEMPLATE = `You are a document processing AI. Extract content from the document with HIGH FIDELITY and DETAIL.
|
|
417
418
|
|
|
418
419
|
## OUTPUT FORMAT (MANDATORY - DO NOT MODIFY)
|
|
419
420
|
|
|
420
|
-
|
|
421
|
+
\u26A0\uFE0F CRITICAL: You MUST use EXACTLY this marker format. Any deviation will cause parsing errors:
|
|
421
422
|
|
|
423
|
+
\`\`\`
|
|
422
424
|
<!-- SECTION type="[TYPE]" page="[PAGE]" confidence="[0.0-1.0]" -->
|
|
423
425
|
[Content here in Markdown format]
|
|
424
426
|
<!-- /SECTION -->
|
|
427
|
+
\`\`\`
|
|
425
428
|
|
|
426
|
-
###
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
- CODE: Code blocks with language specification
|
|
432
|
-
- QUOTE: Quoted text or citations
|
|
433
|
-
- IMAGE_REF: Description of images, charts, figures
|
|
434
|
-
- QUESTION: Multiple choice questions with options (A, B, C, D, E)
|
|
435
|
-
|
|
436
|
-
### Format Rules:
|
|
437
|
-
1. **Tables**: Use Markdown table format
|
|
438
|
-
| Column1 | Column2 | Column3 |
|
|
439
|
-
|---------|---------|---------|
|
|
440
|
-
| data | data | data |
|
|
429
|
+
### EXAMPLE OUTPUT (FOLLOW THIS EXACTLY):
|
|
430
|
+
\`\`\`
|
|
431
|
+
<!-- SECTION type="HEADING" page="1" confidence="0.95" -->
|
|
432
|
+
# Introduction to Metabolism
|
|
433
|
+
<!-- /SECTION -->
|
|
441
434
|
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
OR
|
|
447
|
-
|
|
448
|
-
1. Numbered item
|
|
449
|
-
2. Another numbered
|
|
435
|
+
<!-- SECTION type="TEXT" page="1" confidence="0.92" -->
|
|
436
|
+
Metabolism refers to all chemical reactions in an organism. It creates energy...
|
|
437
|
+
(Extract full paragraphs, do not break them up unnecessarily)
|
|
438
|
+
<!-- /SECTION -->
|
|
450
439
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
440
|
+
<!-- SECTION type="LIST" page="2" confidence="0.90" -->
|
|
441
|
+
- First item in the list
|
|
442
|
+
- Second item in the list
|
|
443
|
+
- Third item in the list
|
|
444
|
+
<!-- /SECTION -->
|
|
455
445
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
446
|
+
<!-- SECTION type="TABLE" page="2" confidence="0.88" -->
|
|
447
|
+
| Column1 | Column2 |
|
|
448
|
+
|---------|---------|
|
|
449
|
+
| Data1 | Data2 |
|
|
450
|
+
<!-- /SECTION -->
|
|
451
|
+
\`\`\`
|
|
460
452
|
|
|
461
|
-
|
|
462
|
-
|
|
453
|
+
### Valid Types:
|
|
454
|
+
- TEXT: Regular paragraphs and prose. **PREFER THIS** for standard text.
|
|
455
|
+
- TABLE: **ONLY** for explicit data tables in the source.
|
|
456
|
+
- LIST: **ONLY** for explicit bulleted/numbered lists in source.
|
|
457
|
+
- HEADING: Section headers with # ## ### levels.
|
|
458
|
+
- CODE: Code blocks with language specification.
|
|
459
|
+
- QUOTE: Quoted text or citations.
|
|
460
|
+
- IMAGE_REF: Description of images, charts, figures.
|
|
461
|
+
- QUESTION: Multiple choice questions.
|
|
463
462
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
E) Option E text (if exists)
|
|
471
|
-
**Answer:** [Letter] (if answer is provided in document)
|
|
463
|
+
### Format Rules:
|
|
464
|
+
1. **Tables**: Use Markdown table format.
|
|
465
|
+
2. **Lists**: Use consistent format (bullets or numbers).
|
|
466
|
+
3. **Headings**: Use Markdown headers (#, ##, ###).
|
|
467
|
+
4. **Code**: Use fenced code blocks with language.
|
|
468
|
+
5. **Images**: Describe visual content clearly.
|
|
472
469
|
|
|
473
470
|
## DOCUMENT-SPECIFIC INSTRUCTIONS
|
|
474
471
|
{{DOCUMENT_INSTRUCTIONS}}
|
|
475
472
|
|
|
476
473
|
## CRITICAL EXTRACTION RULES (DO NOT VIOLATE)
|
|
477
|
-
\u26A0\uFE0F These rules are MANDATORY for legal, medical, and financial document accuracy:
|
|
478
474
|
|
|
479
475
|
1. **NO SUMMARIZATION**: Extract content EXACTLY as written. Do not summarize, paraphrase, or condense.
|
|
480
|
-
2. **
|
|
481
|
-
3. **
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
4. **VERBATIM EXTRACTION**: Copy text word-for-word from the document.
|
|
487
|
-
5. **NO OMISSIONS**: Include all content, even if it seems redundant or repetitive.
|
|
488
|
-
6. **UNCLEAR CONTENT**: If text is unclear or illegible, extract as-is and mark: [UNCLEAR: partial text visible]
|
|
489
|
-
7. **FOREIGN TERMS**: Keep foreign language terms, Latin phrases, and abbreviations exactly as written.
|
|
476
|
+
2. **PRESERVE FLOW**: **DO NOT** break continuous text into lists unless it is explicitly a list in the source. Keep paragraphs together.
|
|
477
|
+
3. **AVOID OVER-SEGMENTATION**: Combine related sentences into single TEXT blocks. Do not create a new section for every sentence.
|
|
478
|
+
4. **PRESERVE ORIGINAL WORDING**: Keep exact terminology, especially for technical, medical, or legal terms.
|
|
479
|
+
5. **NO INTERPRETATION**: Do not interpret or explain the content. Just extract it.
|
|
480
|
+
6. **UNCLEAR CONTENT**: If text is unclear, mark: [UNCLEAR: partial text].
|
|
481
|
+
7. **FOREIGN TERMS**: Keep foreign language terms exactly as written.
|
|
490
482
|
|
|
491
483
|
## PROCESSING RULES
|
|
492
|
-
- Extract ALL content completely
|
|
493
|
-
- Preserve original document structure and hierarchy
|
|
494
|
-
- Include page references for each section
|
|
495
|
-
-
|
|
496
|
-
- Use appropriate confidence scores based on extraction quality
|
|
497
|
-
- If content spans multiple pages, use the starting page number
|
|
484
|
+
- Extract ALL content completely.
|
|
485
|
+
- Preserve original document structure and hierarchy.
|
|
486
|
+
- Include page references for each section.
|
|
487
|
+
- If content spans multiple pages, use the starting page number.
|
|
498
488
|
|
|
499
489
|
## PAGE RANGE
|
|
500
490
|
{{PAGE_RANGE}}
|
|
501
491
|
`;
|
|
492
|
+
var STRUCTURED_EXTRACTION_TEMPLATE = `You are a document processing AI. Extract content from the provided document pages.
|
|
493
|
+
|
|
494
|
+
Your goal is to extract content accurately, preserving the logical structure and semantics.
|
|
495
|
+
|
|
496
|
+
## INSTRUCTIONS
|
|
497
|
+
{{DOCUMENT_INSTRUCTIONS}}
|
|
498
|
+
|
|
499
|
+
## CRITICAL RULES (DO NOT VIOLATE)
|
|
500
|
+
1. **NO SUMMARIZATION**: Extract content EXACTLY as written. Do not summarize, paraphrase, or condense.
|
|
501
|
+
2. **PRESERVE FLOW**: **DO NOT** break continuous text into lists unless it is explicitly a list in the source. Keep paragraphs together.
|
|
502
|
+
3. **AVOID OVER-SEGMENTATION**: Combine related sentences into single TEXT blocks. Do not create a new section for every sentence.
|
|
503
|
+
4. **PRESERVE ORIGINAL WORDING**: Keep exact terminology, especially for technical, medical, or legal terms.
|
|
504
|
+
5. **NO INTERPRETATION**: Do not interpret or explain the content. Just extract it.
|
|
505
|
+
|
|
506
|
+
## PAGE RANGE
|
|
507
|
+
{{PAGE_RANGE}}
|
|
508
|
+
|
|
509
|
+
IMPORTANT:
|
|
510
|
+
1. Extract content strictly from the specified page range.
|
|
511
|
+
2. Maintain the order of elements as they appear in the document.
|
|
512
|
+
3. Don't summarize code blocks or tables; extract them fully.
|
|
513
|
+
4. Follow the specific document instructions provided above.
|
|
514
|
+
`;
|
|
502
515
|
var DEFAULT_DOCUMENT_INSTRUCTIONS = `
|
|
503
516
|
- Extract all text content preserving structure
|
|
504
517
|
- Convert tables to Markdown table format
|
|
@@ -507,12 +520,21 @@ var DEFAULT_DOCUMENT_INSTRUCTIONS = `
|
|
|
507
520
|
- Note any images with descriptive text
|
|
508
521
|
- Maintain the logical flow of content
|
|
509
522
|
`;
|
|
510
|
-
function buildExtractionPrompt(documentInstructions, exampleFormats, pageStart, pageEnd) {
|
|
523
|
+
function buildExtractionPrompt(documentInstructions, exampleFormats, pageStart, pageEnd, useStructuredOutput = false) {
|
|
511
524
|
let instructionsBlock = documentInstructions.map((instruction) => `- ${instruction}`).join("\n");
|
|
512
|
-
|
|
525
|
+
let formats = [];
|
|
526
|
+
if (Array.isArray(exampleFormats)) {
|
|
527
|
+
formats = exampleFormats;
|
|
528
|
+
} else if (exampleFormats) {
|
|
529
|
+
formats = Object.entries(exampleFormats).map(([key, value]) => ({
|
|
530
|
+
element: key,
|
|
531
|
+
format: value
|
|
532
|
+
}));
|
|
533
|
+
}
|
|
534
|
+
if (formats.length > 0) {
|
|
513
535
|
instructionsBlock += "\n\n### Example Formats:\n";
|
|
514
|
-
for (const
|
|
515
|
-
instructionsBlock += `- **${
|
|
536
|
+
for (const example of formats) {
|
|
537
|
+
instructionsBlock += `- **${example.element}**: \`${example.format}\`
|
|
516
538
|
`;
|
|
517
539
|
}
|
|
518
540
|
}
|
|
@@ -524,7 +546,8 @@ function buildExtractionPrompt(documentInstructions, exampleFormats, pageStart,
|
|
|
524
546
|
pageRange = `Process pages ${pageStart}-${pageEnd} of this document.`;
|
|
525
547
|
}
|
|
526
548
|
}
|
|
527
|
-
|
|
549
|
+
const template = useStructuredOutput ? STRUCTURED_EXTRACTION_TEMPLATE : BASE_EXTRACTION_TEMPLATE;
|
|
550
|
+
return template.replace("{{DOCUMENT_INSTRUCTIONS}}", instructionsBlock || DEFAULT_DOCUMENT_INSTRUCTIONS).replace("{{PAGE_RANGE}}", pageRange);
|
|
528
551
|
}
|
|
529
552
|
function buildDiscoveryPrompt(documentTypeHint) {
|
|
530
553
|
let hint = "";
|
|
@@ -534,7 +557,7 @@ Hint: The user expects this to be a "${documentTypeHint}" document. Consider thi
|
|
|
534
557
|
}
|
|
535
558
|
return DISCOVERY_TEMPLATE.replace("{{DOCUMENT_TYPE_HINT}}", hint);
|
|
536
559
|
}
|
|
537
|
-
var SECTION_PATTERN = /<!-- SECTION type="(\w+)" page="(\d+)" confidence="([\d.]+)" -->\n?([\s\S]*?)\n?<!-- \/SECTION -->/g;
|
|
560
|
+
var SECTION_PATTERN = /<!-- SECTION (?:type=")?(\w+)"? page="(\d+)" confidence="([\d.]+)" -->\n?([\s\S]*?)\n?<!-- \/SECTION -->/g;
|
|
538
561
|
|
|
539
562
|
// src/types/enums.ts
|
|
540
563
|
var ChunkTypeEnum = {
|
|
@@ -1196,7 +1219,7 @@ var ChunkRepository = class {
|
|
|
1196
1219
|
const result = await this.prisma.$queryRaw`
|
|
1197
1220
|
INSERT INTO context_rag_chunks (
|
|
1198
1221
|
id, prompt_config_id, document_id, chunk_index, chunk_type,
|
|
1199
|
-
search_content, search_vector, display_content,
|
|
1222
|
+
search_content, enriched_content, context_text, search_vector, display_content,
|
|
1200
1223
|
source_page_start, source_page_end, confidence_score, metadata, created_at
|
|
1201
1224
|
) VALUES (
|
|
1202
1225
|
gen_random_uuid(),
|
|
@@ -1205,6 +1228,8 @@ var ChunkRepository = class {
|
|
|
1205
1228
|
${input.chunkIndex},
|
|
1206
1229
|
${input.chunkType},
|
|
1207
1230
|
${input.searchContent},
|
|
1231
|
+
${input.enrichedContent ?? null},
|
|
1232
|
+
${input.contextText ?? null},
|
|
1208
1233
|
${embedding}::vector,
|
|
1209
1234
|
${input.displayContent},
|
|
1210
1235
|
${input.sourcePageStart},
|
|
@@ -1235,7 +1260,7 @@ var ChunkRepository = class {
|
|
|
1235
1260
|
const result = await tx.$queryRaw`
|
|
1236
1261
|
INSERT INTO context_rag_chunks (
|
|
1237
1262
|
id, prompt_config_id, document_id, chunk_index, chunk_type,
|
|
1238
|
-
search_content, search_vector, display_content,
|
|
1263
|
+
search_content, enriched_content, context_text, search_vector, display_content,
|
|
1239
1264
|
source_page_start, source_page_end, confidence_score, metadata, created_at
|
|
1240
1265
|
) VALUES (
|
|
1241
1266
|
gen_random_uuid(),
|
|
@@ -1244,6 +1269,8 @@ var ChunkRepository = class {
|
|
|
1244
1269
|
${input.chunkIndex},
|
|
1245
1270
|
${input.chunkType},
|
|
1246
1271
|
${input.searchContent},
|
|
1272
|
+
${input.enrichedContent ?? null},
|
|
1273
|
+
${input.contextText ?? null},
|
|
1247
1274
|
${embedding}::vector,
|
|
1248
1275
|
${input.displayContent},
|
|
1249
1276
|
${input.sourcePageStart},
|
|
@@ -1453,6 +1480,121 @@ async function getDatabaseStats(prisma) {
|
|
|
1453
1480
|
});
|
|
1454
1481
|
}
|
|
1455
1482
|
}
|
|
1483
|
+
// Closed set of content types a section/chunk may carry.
// The description string is attached to the schema itself; presumably it is
// surfaced to the model as guidance when the schema is sent as a response schema.
var ChunkTypeSchema = zod.z
  .enum(["TEXT", "TABLE", "LIST", "HEADING", "CODE", "QUOTE", "IMAGE_REF", "QUESTION", "MIXED"])
  .describe("The type of content in this section. PREFER 'TEXT' for regular paragraphs. Only use 'LIST' for actual bulleted/numbered lists. Only use 'TABLE' for actual tables.");
|
|
1494
|
+
// Shape of a single extracted section: its content type, the page it starts on,
// an extraction confidence in [0, 1], and the non-empty Markdown content.
var SectionSchema = zod.z.object({
  // Content type (one of ChunkTypeSchema's values).
  type: ChunkTypeSchema,
  // Source page number; integer, 1 or greater.
  page: zod.z
    .number()
    .int()
    .min(1)
    .describe("The page number where this content starts."),
  // Extraction confidence score between 0 and 1 inclusive.
  confidence: zod.z
    .number()
    .min(0)
    .max(1)
    .describe("Confidence score (0.0-1.0)"),
  // Extracted content as Markdown; must not be empty.
  content: zod.z
    .string()
    .min(1)
    .describe("The full extracted content in Markdown. DO NOT summarize. DO NOT break paragraphs into lists. Maintain the original flow.")
});
// A whole extraction result: an ordered list of sections.
var SectionArraySchema = zod.z.array(SectionSchema);
|
|
1505
|
+
// Kinds of structural elements that discovery can report for a document.
var DetectedElementTypeSchema = zod.z.enum(["table", "list", "code", "image", "chart", "form", "heading"]);
// One detected element kind together with how often it appears.
var DetectedElementSchema = zod.z.object({
  // Which kind of element was detected.
  type: DetectedElementTypeSchema,
  // Approximate number of occurrences; non-negative integer.
  count: zod.z
    .number()
    .int()
    .min(0),
  // Optional example locations, given as page numbers.
  examples: zod.z
    .array(zod.z.number())
    .optional()
});
|
|
1522
|
+
// Chunking recommendation. Every field has a default, so an empty object
// parses to { maxTokens: 500, splitBy: "semantic", preserveTables: true, preserveLists: true }.
var ChunkStrategySchema = zod.z.object({
  // Maximum tokens per chunk: integer in [100, 2000], default 500.
  maxTokens: zod.z
    .number()
    .int()
    .min(100)
    .max(2000)
    .default(500),
  // How the document is split into chunks.
  splitBy: zod.z
    .enum(["semantic", "page", "paragraph", "section"])
    .default("semantic"),
  // Keep each table in a single chunk.
  preserveTables: zod.z
    .boolean()
    .default(true),
  // Keep each list in a single chunk.
  preserveLists: zod.z
    .boolean()
    .default(true)
});
|
|
1532
|
+
// Expected shape of the discovery (document analysis) response.
// Required: documentType, documentTypeName, specialInstructions, confidence, reasoning.
// detectedElements falls back to an empty array; the remaining fields are optional.
var DiscoveryResponseSchema = zod.z.object({
  // Detected document type (e.g. 'Medical', 'Legal'); non-empty.
  documentType: zod.z
    .string()
    .min(1),
  // Human-readable name for the document type; non-empty.
  documentTypeName: zod.z
    .string()
    .min(1),
  // Document language (e.g. 'tr', 'en').
  language: zod.z
    .string()
    .optional(),
  // Complexity assessment of the document.
  complexity: zod.z
    .enum(["low", "medium", "high"])
    .optional(),
  // Elements detected in the document; defaults to [] when absent.
  detectedElements: zod.z
    .array(DetectedElementSchema)
    .default([]),
  // Document-specific extraction instructions.
  specialInstructions: zod.z.array(zod.z.string()),
  // Example output formats, one { element, format } pair per element type.
  exampleFormats: zod.z
    .array(
      zod.z.object({
        element: zod.z.string(),
        format: zod.z.string()
      })
    )
    .optional(),
  // Recommended chunking strategy.
  chunkStrategy: ChunkStrategySchema.optional(),
  // Detection confidence between 0 and 1 inclusive.
  confidence: zod.z
    .number()
    .min(0)
    .max(1),
  // Free-text reasoning behind the analysis.
  reasoning: zod.z.string()
});
|
|
1557
|
+
// Shape of a context-generation result: the generated context text and an
// optional confidence between 0 and 1 inclusive.
var ContextGenerationSchema = zod.z.object({
  // Generated context text.
  context: zod.z.string(),
  // Confidence in the generated context.
  confidence: zod.z
    .number()
    .min(0)
    .max(1)
    .optional()
});
|
|
1563
|
+
/**
 * Convert a Zod schema into a plain JSON-schema object suitable for use as a
 * Gemini `responseSchema`.
 *
 * The Zod schema is first converted with zod-to-json-schema (draft-07 target,
 * no `$ref` generation) and the result is then passed through
 * `cleanSchemaForGemini` to drop keywords that are filtered out there.
 *
 * @param {object} zodSchema - The Zod schema to convert.
 * @returns {object} The cleaned JSON-schema object.
 */
function zodToGeminiSchema(zodSchema) {
  const conversionOptions = {
    target: "jsonSchema7",
    // Critical: Gemini doesn't support $ref in responseSchema, so inline everything.
    $refStrategy: "none",
    definitionPath: "$defs"
  };
  const rawJsonSchema = zodToJsonSchema.zodToJsonSchema(zodSchema, conversionOptions);
  return cleanSchemaForGemini(rawJsonSchema);
}
|
|
1572
|
+
/**
 * Recursively remove JSON-Schema keywords that must not be sent to Gemini
 * (`additionalProperties`, `$schema`, `title`, `default`) from a schema tree.
 *
 * The keys of a `properties` map are user-defined field names, not schema
 * keywords, so they are always preserved: a field that happens to be called
 * `title` or `default` stays in the schema (and stays consistent with
 * `required`), while the sub-schema behind it is still cleaned.
 *
 * @param {*} schema - A JSON-schema node: object, array, or primitive.
 * @returns {*} A cleaned copy; primitives and `null` are returned unchanged.
 */
function cleanSchemaForGemini(schema) {
  if (typeof schema !== "object" || schema === null) {
    return schema;
  }
  if (Array.isArray(schema)) {
    return schema.map((item) => cleanSchemaForGemini(item));
  }
  const strippedKeywords = new Set(["additionalProperties", "$schema", "title", "default"]);
  const cleaned = {};
  for (const [key, value] of Object.entries(schema)) {
    if (strippedKeywords.has(key)) {
      continue;
    }
    const isPropertyMap =
      key === "properties" && typeof value === "object" && value !== null && !Array.isArray(value);
    if (isPropertyMap) {
      // Keys here are field names: keep every one, clean only the sub-schemas.
      cleaned[key] = Object.fromEntries(
        Object.entries(value).map(([fieldName, fieldSchema]) => [fieldName, cleanSchemaForGemini(fieldSchema)])
      );
      continue;
    }
    cleaned[key] = cleanSchemaForGemini(value);
  }
  return cleaned;
}
|
|
1588
|
+
({
|
|
1589
|
+
/** Schema for section extraction */
|
|
1590
|
+
sectionArray: zodToGeminiSchema(SectionArraySchema),
|
|
1591
|
+
/** Schema for discovery response */
|
|
1592
|
+
discovery: zodToGeminiSchema(DiscoveryResponseSchema),
|
|
1593
|
+
/** Schema for context generation */
|
|
1594
|
+
contextGeneration: zodToGeminiSchema(ContextGenerationSchema)
|
|
1595
|
+
});
|
|
1596
|
+
|
|
1597
|
+
// src/services/gemini.service.ts
|
|
1456
1598
|
var GeminiService = class {
|
|
1457
1599
|
genAI;
|
|
1458
1600
|
fileManager;
|
|
@@ -1731,6 +1873,111 @@ ${userContent}` }]
|
|
|
1731
1873
|
throw error;
|
|
1732
1874
|
}
|
|
1733
1875
|
}
|
|
1876
|
+
/**
|
|
1877
|
+
* Generate structured data from text prompt
|
|
1878
|
+
*/
|
|
1879
|
+
async generateStructured(prompt, schema, options) {
|
|
1880
|
+
return this.executeStructuredRetry(
|
|
1881
|
+
[{ role: "user", parts: [{ text: prompt }] }],
|
|
1882
|
+
schema,
|
|
1883
|
+
options
|
|
1884
|
+
);
|
|
1885
|
+
}
|
|
1886
|
+
/**
|
|
1887
|
+
* Generate structured data from PDF
|
|
1888
|
+
*/
|
|
1889
|
+
async generateStructuredWithPdf(pdfUri, prompt, schema, options) {
|
|
1890
|
+
return this.executeStructuredRetry(
|
|
1891
|
+
[
|
|
1892
|
+
{
|
|
1893
|
+
role: "user",
|
|
1894
|
+
parts: [
|
|
1895
|
+
{ fileData: { mimeType: "application/pdf", fileUri: pdfUri } },
|
|
1896
|
+
{ text: prompt }
|
|
1897
|
+
]
|
|
1898
|
+
}
|
|
1899
|
+
],
|
|
1900
|
+
schema,
|
|
1901
|
+
options
|
|
1902
|
+
);
|
|
1903
|
+
}
|
|
1904
|
+
/**
|
|
1905
|
+
* Execute structured generation with retry logic
|
|
1906
|
+
*/
|
|
1907
|
+
async executeStructuredRetry(contents, schema, options) {
|
|
1908
|
+
const maxRetries = options?.maxRetries ?? 2;
|
|
1909
|
+
let attempt = 0;
|
|
1910
|
+
let lastError;
|
|
1911
|
+
const currentContents = [...contents];
|
|
1912
|
+
while (attempt <= maxRetries) {
|
|
1913
|
+
attempt++;
|
|
1914
|
+
await this.rateLimiter.acquire();
|
|
1915
|
+
try {
|
|
1916
|
+
const result = await this.model.generateContent({
|
|
1917
|
+
contents: currentContents,
|
|
1918
|
+
generationConfig: {
|
|
1919
|
+
responseMimeType: "application/json",
|
|
1920
|
+
// Cast to any because the new schema format might have slight type mismatch
|
|
1921
|
+
// but is valid for the API
|
|
1922
|
+
responseSchema: zodToGeminiSchema(schema),
|
|
1923
|
+
temperature: options?.temperature ?? 0.2,
|
|
1924
|
+
maxOutputTokens: options?.maxOutputTokens
|
|
1925
|
+
}
|
|
1926
|
+
});
|
|
1927
|
+
const response = result.response;
|
|
1928
|
+
const text = response.text();
|
|
1929
|
+
const usage = response.usageMetadata;
|
|
1930
|
+
this.rateLimiter.reportSuccess();
|
|
1931
|
+
try {
|
|
1932
|
+
const parsed = JSON.parse(text);
|
|
1933
|
+
const data = schema.parse(parsed);
|
|
1934
|
+
return {
|
|
1935
|
+
data,
|
|
1936
|
+
tokenUsage: {
|
|
1937
|
+
input: usage?.promptTokenCount ?? 0,
|
|
1938
|
+
output: usage?.candidatesTokenCount ?? 0,
|
|
1939
|
+
total: usage?.totalTokenCount ?? 0
|
|
1940
|
+
}
|
|
1941
|
+
};
|
|
1942
|
+
} catch (e) {
|
|
1943
|
+
const errorMessage = e instanceof Error ? e.message : String(e);
|
|
1944
|
+
const snippet = text.length > 500 ? text.substring(0, 200) + "...[truncated]..." + text.substring(text.length - 200) : text;
|
|
1945
|
+
this.logger.warn(`Structured validation failed (attempt ${attempt}/${maxRetries + 1})`, {
|
|
1946
|
+
error: errorMessage,
|
|
1947
|
+
snippet: text.substring(0, 100)
|
|
1948
|
+
});
|
|
1949
|
+
lastError = new Error(`Structured output validation failed: ${errorMessage}. Raw response snippet: ${snippet}`);
|
|
1950
|
+
if (attempt <= maxRetries) {
|
|
1951
|
+
currentContents.push({
|
|
1952
|
+
role: "model",
|
|
1953
|
+
parts: [{ text }]
|
|
1954
|
+
});
|
|
1955
|
+
currentContents.push({
|
|
1956
|
+
role: "user",
|
|
1957
|
+
parts: [{ text: `JSON Validation Error: ${errorMessage}
|
|
1958
|
+
|
|
1959
|
+
Please fix the JSON output to match the schema exactly.` }]
|
|
1960
|
+
});
|
|
1961
|
+
continue;
|
|
1962
|
+
}
|
|
1963
|
+
throw lastError;
|
|
1964
|
+
}
|
|
1965
|
+
} catch (error) {
|
|
1966
|
+
try {
|
|
1967
|
+
this.handleError(error);
|
|
1968
|
+
} catch (handledError) {
|
|
1969
|
+
throw handledError;
|
|
1970
|
+
}
|
|
1971
|
+
lastError = error;
|
|
1972
|
+
if (attempt <= maxRetries) {
|
|
1973
|
+
this.logger.warn(`Gemini API error (attempt ${attempt}/${maxRetries + 1}), retrying...`, { error: error.message });
|
|
1974
|
+
continue;
|
|
1975
|
+
}
|
|
1976
|
+
throw error;
|
|
1977
|
+
}
|
|
1978
|
+
}
|
|
1979
|
+
throw lastError;
|
|
1980
|
+
}
|
|
1734
1981
|
/**
|
|
1735
1982
|
* Handle API errors
|
|
1736
1983
|
*/
|
|
@@ -1864,11 +2111,22 @@ var AnthropicHandler = class {
|
|
|
1864
2111
|
gemini;
|
|
1865
2112
|
limit;
|
|
1866
2113
|
skipTypes;
|
|
1867
|
-
constructor(config,
|
|
2114
|
+
constructor(config, mainGemini, resolvedConfig) {
|
|
1868
2115
|
this.config = config;
|
|
1869
|
-
this.gemini = gemini;
|
|
1870
2116
|
this.limit = pLimit__default.default(config.concurrencyLimit ?? DEFAULT_ANTHROPIC_CONFIG.concurrencyLimit);
|
|
1871
2117
|
this.skipTypes = new Set(config.skipChunkTypes ?? DEFAULT_ANTHROPIC_CONFIG.skipChunkTypes);
|
|
2118
|
+
if (config.model && config.model !== resolvedConfig.model) {
|
|
2119
|
+
console.log(`[AnthropicHandler] Using separate model for enhancement: ${config.model}`);
|
|
2120
|
+
const enhancementConfig = {
|
|
2121
|
+
...resolvedConfig,
|
|
2122
|
+
model: config.model
|
|
2123
|
+
};
|
|
2124
|
+
const rateLimiter = new RateLimiter(resolvedConfig.rateLimitConfig);
|
|
2125
|
+
const logger = createLogger(resolvedConfig.logging);
|
|
2126
|
+
this.gemini = new GeminiService(enhancementConfig, rateLimiter, logger);
|
|
2127
|
+
} else {
|
|
2128
|
+
this.gemini = mainGemini;
|
|
2129
|
+
}
|
|
1872
2130
|
}
|
|
1873
2131
|
shouldSkip(chunkType) {
|
|
1874
2132
|
return this.skipTypes.has(chunkType);
|
|
@@ -1916,15 +2174,18 @@ ${doc.fullDocumentText.slice(0, 15e3)}
|
|
|
1916
2174
|
${chunk.content}
|
|
1917
2175
|
</chunk_to_contextualize>
|
|
1918
2176
|
|
|
1919
|
-
Bu
|
|
2177
|
+
Bu i\xE7eri\u011Fin belgenin genel ak\u0131\u015F\u0131 i\xE7indeki yerini, ba\u011Fl\u0131 oldu\u011Fu ana ba\u015Fl\u0131klar\u0131 ve ele ald\u0131\u011F\u0131 konuyu detayl\u0131 bir \u015Fekilde \xF6zetle. \u0130\xE7eri\u011Fin ne oldu\u011Funu de\u011Fil, ba\u011Flam\u0131n\u0131 anlat:`;
|
|
1920
2178
|
try {
|
|
1921
2179
|
if (doc.fileUri) {
|
|
1922
|
-
const chunkPrompt = `Bu
|
|
2180
|
+
const chunkPrompt = `Bu i\xE7eri\u011Fin belgenin genel ak\u0131\u015F\u0131 i\xE7indeki yerini, ba\u011Fl\u0131 oldu\u011Fu ana ba\u015Fl\u0131klar\u0131 ve ele ald\u0131\u011F\u0131 konuyu detayl\u0131 bir \u015Fekilde \xF6zetle. \u0130\xE7eri\u011Fin ne oldu\u011Funu de\u011Fil, ba\u011Flam\u0131n\u0131 anlat:
|
|
1923
2181
|
|
|
1924
2182
|
<chunk>
|
|
1925
2183
|
${chunk.content}
|
|
1926
2184
|
</chunk>`;
|
|
1927
|
-
const result2 = await this.gemini.generateWithPdfUri(doc.fileUri, chunkPrompt
|
|
2185
|
+
const result2 = await this.gemini.generateWithPdfUri(doc.fileUri, chunkPrompt, {
|
|
2186
|
+
maxOutputTokens: 2048,
|
|
2187
|
+
temperature: 0.3
|
|
2188
|
+
});
|
|
1928
2189
|
return result2.text;
|
|
1929
2190
|
}
|
|
1930
2191
|
const result = await this.gemini.generateSimple(fullPrompt);
|
|
@@ -1943,7 +2204,7 @@ function createEnhancementHandler(config, _resolvedConfig, gemini) {
|
|
|
1943
2204
|
}
|
|
1944
2205
|
switch (config.approach) {
|
|
1945
2206
|
case "anthropic_contextual":
|
|
1946
|
-
return new AnthropicHandler(config, gemini);
|
|
2207
|
+
return new AnthropicHandler(config, gemini, _resolvedConfig);
|
|
1947
2208
|
case "google_grounding":
|
|
1948
2209
|
throw new Error("Google Grounding is not yet implemented");
|
|
1949
2210
|
case "custom":
|
|
@@ -2168,24 +2429,50 @@ var IngestionEngine = class {
|
|
|
2168
2429
|
try {
|
|
2169
2430
|
const result = await withRetry(
|
|
2170
2431
|
async () => {
|
|
2171
|
-
const
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2432
|
+
const useStructured = this.config.useStructuredOutput;
|
|
2433
|
+
const getPrompt = (structured) => {
|
|
2434
|
+
const basePrompt = buildExtractionPrompt(
|
|
2435
|
+
documentInstructions,
|
|
2436
|
+
exampleFormats,
|
|
2437
|
+
batch.pageStart,
|
|
2438
|
+
batch.pageEnd,
|
|
2439
|
+
structured
|
|
2440
|
+
);
|
|
2441
|
+
return `${basePrompt}
|
|
2178
2442
|
|
|
2179
2443
|
IMPORTANT: You have the FULL document. Restrict your extraction STRICTLY to pages ${batch.pageStart} to ${batch.pageEnd}. Do not extract content from other pages.`;
|
|
2180
|
-
|
|
2444
|
+
};
|
|
2445
|
+
if (useStructured) {
|
|
2446
|
+
try {
|
|
2447
|
+
const structuredResponse = await this.gemini.generateStructuredWithPdf(
|
|
2448
|
+
fileUri,
|
|
2449
|
+
getPrompt(true),
|
|
2450
|
+
SectionArraySchema,
|
|
2451
|
+
{
|
|
2452
|
+
temperature: this.config.generationConfig?.temperature,
|
|
2453
|
+
maxOutputTokens: this.config.generationConfig?.maxOutputTokens
|
|
2454
|
+
}
|
|
2455
|
+
);
|
|
2456
|
+
this.logger.debug("Structured extraction success", {
|
|
2457
|
+
batchId: batch.id,
|
|
2458
|
+
chunkCount: structuredResponse.data.length
|
|
2459
|
+
});
|
|
2460
|
+
return structuredResponse;
|
|
2461
|
+
} catch (structuredError) {
|
|
2462
|
+
this.logger.warn("Structured extraction failed, falling back to legacy parsing", {
|
|
2463
|
+
batchId: batch.id,
|
|
2464
|
+
error: structuredError.message
|
|
2465
|
+
});
|
|
2466
|
+
}
|
|
2467
|
+
}
|
|
2468
|
+
return await this.gemini.generateWithPdfUri(
|
|
2181
2469
|
fileUri,
|
|
2182
|
-
|
|
2470
|
+
getPrompt(false),
|
|
2183
2471
|
{
|
|
2184
2472
|
temperature: this.config.generationConfig?.temperature,
|
|
2185
2473
|
maxOutputTokens: this.config.generationConfig?.maxOutputTokens
|
|
2186
2474
|
}
|
|
2187
2475
|
);
|
|
2188
|
-
return response;
|
|
2189
2476
|
},
|
|
2190
2477
|
{
|
|
2191
2478
|
...retryOptions,
|
|
@@ -2206,13 +2493,40 @@ var IngestionEngine = class {
|
|
|
2206
2493
|
}
|
|
2207
2494
|
}
|
|
2208
2495
|
);
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
|
|
2213
|
-
|
|
2214
|
-
|
|
2215
|
-
|
|
2496
|
+
let chunks;
|
|
2497
|
+
if ("data" in result && Array.isArray(result.data)) {
|
|
2498
|
+
const sections = result.data;
|
|
2499
|
+
chunks = sections.map((section, index) => ({
|
|
2500
|
+
promptConfigId,
|
|
2501
|
+
documentId,
|
|
2502
|
+
chunkIndex: index,
|
|
2503
|
+
chunkType: section.type,
|
|
2504
|
+
searchContent: cleanForSearch(section.content),
|
|
2505
|
+
displayContent: section.content,
|
|
2506
|
+
sourcePageStart: section.page,
|
|
2507
|
+
sourcePageEnd: section.page,
|
|
2508
|
+
confidenceScore: section.confidence,
|
|
2509
|
+
metadata: {
|
|
2510
|
+
type: section.type,
|
|
2511
|
+
pageRange: { start: section.page, end: section.page },
|
|
2512
|
+
confidence: {
|
|
2513
|
+
score: section.confidence,
|
|
2514
|
+
category: section.confidence >= 0.8 ? "HIGH" : section.confidence >= 0.5 ? "MEDIUM" : "LOW"
|
|
2515
|
+
},
|
|
2516
|
+
parsedWithStructuredMarkers: true,
|
|
2517
|
+
parsingMethod: "gemini_response_schema"
|
|
2518
|
+
}
|
|
2519
|
+
}));
|
|
2520
|
+
} else {
|
|
2521
|
+
const textResponse = result;
|
|
2522
|
+
chunks = this.parseContentToChunks(
|
|
2523
|
+
textResponse.text,
|
|
2524
|
+
promptConfigId,
|
|
2525
|
+
documentId,
|
|
2526
|
+
batch.pageStart,
|
|
2527
|
+
batch.pageEnd
|
|
2528
|
+
);
|
|
2529
|
+
}
|
|
2216
2530
|
const docContext = {
|
|
2217
2531
|
documentType: void 0,
|
|
2218
2532
|
// Inferred from processing
|
|
@@ -2525,34 +2839,53 @@ var DiscoveryEngine = class {
|
|
|
2525
2839
|
const { buffer, metadata } = await this.pdfProcessor.load(options.file);
|
|
2526
2840
|
const fileUri = await this.gemini.uploadPdfBuffer(buffer, metadata.filename);
|
|
2527
2841
|
const prompt = buildDiscoveryPrompt(options.documentTypeHint);
|
|
2528
|
-
const response = await this.gemini.generateWithPdfUri(fileUri, prompt);
|
|
2529
2842
|
let analysisResult;
|
|
2530
2843
|
try {
|
|
2531
|
-
|
|
2532
|
-
|
|
2533
|
-
|
|
2534
|
-
|
|
2535
|
-
|
|
2536
|
-
analysisResult = JSON.parse(jsonStr);
|
|
2537
|
-
if (!analysisResult.documentType) {
|
|
2538
|
-
throw new Error("Missing documentType in response");
|
|
2539
|
-
}
|
|
2540
|
-
if (!Array.isArray(analysisResult.specialInstructions)) {
|
|
2541
|
-
analysisResult.specialInstructions = this.getDefaultInstructions();
|
|
2542
|
-
}
|
|
2543
|
-
} catch (parseError) {
|
|
2544
|
-
this.logger.warn("Failed to parse discovery response as JSON, using defaults", {
|
|
2545
|
-
error: parseError.message
|
|
2546
|
-
});
|
|
2844
|
+
const response = await this.gemini.generateStructuredWithPdf(
|
|
2845
|
+
fileUri,
|
|
2846
|
+
prompt,
|
|
2847
|
+
DiscoveryResponseSchema
|
|
2848
|
+
);
|
|
2547
2849
|
analysisResult = {
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
detectedElements: [],
|
|
2551
|
-
specialInstructions: this.getDefaultInstructions(),
|
|
2552
|
-
chunkStrategy: DEFAULT_CHUNK_STRATEGY,
|
|
2553
|
-
confidence: 0.5,
|
|
2554
|
-
reasoning: "Failed to parse AI response, using default configuration"
|
|
2850
|
+
...response.data,
|
|
2851
|
+
detectedElements: response.data.detectedElements ?? []
|
|
2555
2852
|
};
|
|
2853
|
+
this.logger.debug("Structured discovery response received", {
|
|
2854
|
+
documentType: analysisResult.documentType,
|
|
2855
|
+
confidence: analysisResult.confidence
|
|
2856
|
+
});
|
|
2857
|
+
} catch (structuredError) {
|
|
2858
|
+
this.logger.warn("Structured output failed, trying legacy parsing", {
|
|
2859
|
+
error: structuredError.message
|
|
2860
|
+
});
|
|
2861
|
+
try {
|
|
2862
|
+
const response = await this.gemini.generateWithPdfUri(fileUri, prompt);
|
|
2863
|
+
let jsonStr = response.text;
|
|
2864
|
+
const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
|
|
2865
|
+
if (jsonMatch?.[1]) {
|
|
2866
|
+
jsonStr = jsonMatch[1];
|
|
2867
|
+
}
|
|
2868
|
+
const parsed = JSON.parse(jsonStr);
|
|
2869
|
+
analysisResult = DiscoveryResponseSchema.parse(parsed);
|
|
2870
|
+
} catch (legacyError) {
|
|
2871
|
+
this.logger.warn("All parsing methods failed, using defaults", {
|
|
2872
|
+
error: legacyError.message
|
|
2873
|
+
});
|
|
2874
|
+
analysisResult = {
|
|
2875
|
+
documentType: options.documentTypeHint ?? "General",
|
|
2876
|
+
documentTypeName: options.documentTypeHint ?? "General Document",
|
|
2877
|
+
detectedElements: [],
|
|
2878
|
+
specialInstructions: this.getDefaultInstructions(),
|
|
2879
|
+
chunkStrategy: {
|
|
2880
|
+
maxTokens: DEFAULT_CHUNK_STRATEGY.maxTokens,
|
|
2881
|
+
splitBy: DEFAULT_CHUNK_STRATEGY.splitBy,
|
|
2882
|
+
preserveTables: DEFAULT_CHUNK_STRATEGY.preserveTables,
|
|
2883
|
+
preserveLists: DEFAULT_CHUNK_STRATEGY.preserveLists
|
|
2884
|
+
},
|
|
2885
|
+
confidence: 0.5,
|
|
2886
|
+
reasoning: "Failed to parse AI response, using default configuration"
|
|
2887
|
+
};
|
|
2888
|
+
}
|
|
2556
2889
|
}
|
|
2557
2890
|
const discoveryResult = {
|
|
2558
2891
|
id: correlationId,
|
|
@@ -2693,7 +3026,9 @@ var ContextRAG = class {
|
|
|
2693
3026
|
logging: {
|
|
2694
3027
|
...DEFAULT_LOG_CONFIG,
|
|
2695
3028
|
...userConfig.logging
|
|
2696
|
-
}
|
|
3029
|
+
},
|
|
3030
|
+
ragEnhancement: userConfig.ragEnhancement,
|
|
3031
|
+
useStructuredOutput: userConfig.useStructuredOutput ?? true
|
|
2697
3032
|
};
|
|
2698
3033
|
}
|
|
2699
3034
|
/**
|