@absolutejs/absolute 0.19.0-beta.617 → 0.19.0-beta.619
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/client/index.js +100 -23
- package/dist/ai/client/index.js.map +4 -4
- package/dist/ai/client/ui.js +96 -22
- package/dist/ai/client/ui.js.map +3 -3
- package/dist/ai/index.js +502 -84
- package/dist/ai/index.js.map +9 -9
- package/dist/ai/rag/quality.js +5 -2
- package/dist/ai/rag/quality.js.map +3 -3
- package/dist/ai/rag/ui.js +96 -22
- package/dist/ai/rag/ui.js.map +3 -3
- package/dist/ai-client/angular/ai/index.js +95 -21
- package/dist/ai-client/react/ai/index.js +95 -21
- package/dist/ai-client/vue/ai/index.js +95 -21
- package/dist/angular/ai/index.js +96 -22
- package/dist/angular/ai/index.js.map +3 -3
- package/dist/react/ai/index.js +100 -23
- package/dist/react/ai/index.js.map +4 -4
- package/dist/src/ai/rag/quality.d.ts +2 -1
- package/dist/src/vue/ai/useRAG.d.ts +4 -4
- package/dist/src/vue/ai/useRAGChunkPreview.d.ts +2 -2
- package/dist/src/vue/ai/useRAGSearch.d.ts +2 -2
- package/dist/svelte/ai/index.js +100 -23
- package/dist/svelte/ai/index.js.map +4 -4
- package/dist/types/ai.d.ts +3 -2
- package/dist/vue/ai/index.js +100 -23
- package/dist/vue/ai/index.js.map +4 -4
- package/package.json +7 -7
package/dist/ai/index.js
CHANGED
|
@@ -2054,6 +2054,7 @@ var createRAGFileRetrievalLaneHandoffDecisionStore = (path) => ({
|
|
|
2054
2054
|
});
|
|
2055
2055
|
var createRAGFileRetrievalReleaseIncidentStore = (path) => ({
|
|
2056
2056
|
listIncidents: async ({
|
|
2057
|
+
corpusGroupKey,
|
|
2057
2058
|
groupKey,
|
|
2058
2059
|
limit,
|
|
2059
2060
|
severity,
|
|
@@ -2070,7 +2071,7 @@ var createRAGFileRetrievalReleaseIncidentStore = (path) => ({
|
|
|
2070
2071
|
throw error;
|
|
2071
2072
|
}
|
|
2072
2073
|
}
|
|
2073
|
-
const filtered = parsed.filter((entry) => (!groupKey || entry.groupKey === groupKey) && (!targetRolloutLabel || entry.targetRolloutLabel === targetRolloutLabel) && (!severity || entry.severity === severity) && (!status || entry.status === status));
|
|
2074
|
+
const filtered = parsed.filter((entry) => (!corpusGroupKey || entry.corpusGroupKey === corpusGroupKey) && (!groupKey || entry.groupKey === groupKey) && (!targetRolloutLabel || entry.targetRolloutLabel === targetRolloutLabel) && (!severity || entry.severity === severity) && (!status || entry.status === status));
|
|
2074
2075
|
const sorted = normalizeRetrievalReleaseIncidentRecords(filtered);
|
|
2075
2076
|
return typeof limit === "number" ? sorted.slice(0, limit) : sorted;
|
|
2076
2077
|
},
|
|
@@ -3053,12 +3054,14 @@ var loadRAGRetrievalLaneHandoffDecisions = async ({
|
|
|
3053
3054
|
})));
|
|
3054
3055
|
var loadRAGRetrievalReleaseIncidents = async ({
|
|
3055
3056
|
store,
|
|
3057
|
+
corpusGroupKey,
|
|
3056
3058
|
groupKey,
|
|
3057
3059
|
limit,
|
|
3058
3060
|
targetRolloutLabel,
|
|
3059
3061
|
status,
|
|
3060
3062
|
severity
|
|
3061
3063
|
}) => normalizeRetrievalReleaseIncidentRecords(await Promise.resolve(store.listIncidents({
|
|
3064
|
+
corpusGroupKey,
|
|
3062
3065
|
groupKey,
|
|
3063
3066
|
limit,
|
|
3064
3067
|
severity,
|
|
@@ -4234,6 +4237,25 @@ var buildContextLabel2 = (metadata) => {
|
|
|
4234
4237
|
if (!metadata) {
|
|
4235
4238
|
return;
|
|
4236
4239
|
}
|
|
4240
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4241
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4242
|
+
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4243
|
+
const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
|
|
4244
|
+
if (pdfTextKind === "table_like" && sectionTitle) {
|
|
4245
|
+
return `PDF table block ${sectionTitle}`;
|
|
4246
|
+
}
|
|
4247
|
+
if (pdfTextKind === "paragraph" && sectionTitle) {
|
|
4248
|
+
return `PDF text block ${sectionTitle}`;
|
|
4249
|
+
}
|
|
4250
|
+
if (officeBlockKind === "table" && sectionTitle) {
|
|
4251
|
+
return `Office table block ${sectionTitle}`;
|
|
4252
|
+
}
|
|
4253
|
+
if (officeBlockKind === "list" && sectionTitle) {
|
|
4254
|
+
return `Office list block ${sectionTitle}`;
|
|
4255
|
+
}
|
|
4256
|
+
if (officeBlockKind === "paragraph" && sectionTitle) {
|
|
4257
|
+
return `Office paragraph block ${sectionTitle}`;
|
|
4258
|
+
}
|
|
4237
4259
|
const emailKind = getContextString2(metadata.emailKind);
|
|
4238
4260
|
if (emailKind === "attachment") {
|
|
4239
4261
|
return "Attachment evidence";
|
|
@@ -4270,8 +4292,6 @@ var buildContextLabel2 = (metadata) => {
|
|
|
4270
4292
|
if (speaker) {
|
|
4271
4293
|
return `Speaker ${speaker}`;
|
|
4272
4294
|
}
|
|
4273
|
-
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4274
|
-
const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
|
|
4275
4295
|
if (sectionTitle) {
|
|
4276
4296
|
return `Section ${sectionTitle}`;
|
|
4277
4297
|
}
|
|
@@ -4281,11 +4301,21 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4281
4301
|
if (!metadata) {
|
|
4282
4302
|
return;
|
|
4283
4303
|
}
|
|
4304
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4305
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4306
|
+
const pdfBlockNumber = getContextNumber2(metadata.pdfBlockNumber);
|
|
4307
|
+
const officeBlockNumber = getContextNumber2(metadata.officeBlockNumber);
|
|
4284
4308
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
4285
4309
|
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
4286
4310
|
if (page && region) {
|
|
4287
4311
|
return `Page ${page} \xB7 Region ${region}`;
|
|
4288
4312
|
}
|
|
4313
|
+
if (page && pdfBlockNumber && pdfTextKind === "table_like") {
|
|
4314
|
+
return `Page ${page} \xB7 Table Block ${pdfBlockNumber}`;
|
|
4315
|
+
}
|
|
4316
|
+
if (page && pdfBlockNumber) {
|
|
4317
|
+
return `Page ${page} \xB7 Text Block ${pdfBlockNumber}`;
|
|
4318
|
+
}
|
|
4289
4319
|
if (page) {
|
|
4290
4320
|
return `Page ${page}`;
|
|
4291
4321
|
}
|
|
@@ -4314,6 +4344,15 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4314
4344
|
if (mediaStart) {
|
|
4315
4345
|
return `Timestamp ${mediaStart}`;
|
|
4316
4346
|
}
|
|
4347
|
+
if (officeBlockNumber && officeBlockKind === "table") {
|
|
4348
|
+
return `Office table block ${officeBlockNumber}`;
|
|
4349
|
+
}
|
|
4350
|
+
if (officeBlockNumber && officeBlockKind === "list") {
|
|
4351
|
+
return `Office list block ${officeBlockNumber}`;
|
|
4352
|
+
}
|
|
4353
|
+
if (officeBlockNumber && officeBlockKind === "paragraph") {
|
|
4354
|
+
return `Office paragraph block ${officeBlockNumber}`;
|
|
4355
|
+
}
|
|
4317
4356
|
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4318
4357
|
if (sectionPath.length > 0) {
|
|
4319
4358
|
return `Section ${sectionPath.join(" > ")}`;
|
|
@@ -4331,12 +4370,16 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4331
4370
|
const mediaKind = getContextString2(metadata.mediaKind);
|
|
4332
4371
|
const transcriptSource = getContextString2(metadata.transcriptSource);
|
|
4333
4372
|
const pdfTextMode = getContextString2(metadata.pdfTextMode);
|
|
4373
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4374
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4334
4375
|
const ocrEngine = getContextString2(metadata.ocrEngine);
|
|
4335
4376
|
const extractorRegistryMatch = getContextString2(metadata.extractorRegistryMatch);
|
|
4336
4377
|
const chunkingProfile = getContextString2(metadata.chunkingProfile);
|
|
4337
4378
|
const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
|
|
4338
4379
|
const labels = [
|
|
4339
4380
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
4381
|
+
pdfTextKind === "table_like" ? "PDF table block" : pdfTextKind === "paragraph" ? "PDF text block" : "",
|
|
4382
|
+
officeBlockKind ? `Office ${officeBlockKind}` : "",
|
|
4340
4383
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
4341
4384
|
extractorRegistryMatch ? `Extractor ${extractorRegistryMatch}` : "",
|
|
4342
4385
|
chunkingProfile ? `Chunking ${chunkingProfile}` : "",
|
|
@@ -4372,7 +4415,7 @@ var buildRAGChunkStructure = (metadata) => {
|
|
|
4372
4415
|
return;
|
|
4373
4416
|
}
|
|
4374
4417
|
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : undefined;
|
|
4375
|
-
const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
|
|
4418
|
+
const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "office_block" || metadata.sectionKind === "pdf_block" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
|
|
4376
4419
|
const section = {
|
|
4377
4420
|
depth: getContextNumber2(metadata.sectionDepth),
|
|
4378
4421
|
kind: sectionKind,
|
|
@@ -4692,7 +4735,7 @@ var buildRAGSourceSummaries = (sources) => {
|
|
|
4692
4735
|
const citationReferenceMap = buildRAGCitationReferenceMap(citations);
|
|
4693
4736
|
return sourceGroups.map((group) => {
|
|
4694
4737
|
const groupCitations = citations.filter((citation) => group.chunks.some((chunk) => chunk.chunkId === citation.chunkId));
|
|
4695
|
-
const leadChunk = group.chunks
|
|
4738
|
+
const leadChunk = getPreferredSourceLeadChunk(group.chunks);
|
|
4696
4739
|
const excerpts = leadChunk ? buildRAGChunkExcerpts(group.chunks, leadChunk.chunkId) : undefined;
|
|
4697
4740
|
const structure = leadChunk?.structure ?? buildRAGChunkStructure(leadChunk?.metadata);
|
|
4698
4741
|
const excerptSelection = buildRAGExcerptSelection(excerpts, structure);
|
|
@@ -4720,13 +4763,45 @@ var getSectionPathFromSource = (source) => {
|
|
|
4720
4763
|
const path = source.structure?.section?.path ?? (Array.isArray(source.metadata?.sectionPath) ? source.metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : []);
|
|
4721
4764
|
return path.length > 0 ? path : undefined;
|
|
4722
4765
|
};
|
|
4766
|
+
var isBlockAwareContextLabel = (value) => typeof value === "string" && (value.startsWith("PDF ") || value.startsWith("Office "));
|
|
4767
|
+
var getStructuredSectionScoreWeight = (metadata) => {
|
|
4768
|
+
if (!metadata) {
|
|
4769
|
+
return 1;
|
|
4770
|
+
}
|
|
4771
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4772
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4773
|
+
const sectionKind = getContextString2(metadata.sectionKind);
|
|
4774
|
+
if (pdfTextKind === "table_like") {
|
|
4775
|
+
return 1.28;
|
|
4776
|
+
}
|
|
4777
|
+
if (officeBlockKind === "table" || officeBlockKind === "list") {
|
|
4778
|
+
return 1.24;
|
|
4779
|
+
}
|
|
4780
|
+
if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
|
|
4781
|
+
return 1.12;
|
|
4782
|
+
}
|
|
4783
|
+
return 1;
|
|
4784
|
+
};
|
|
4785
|
+
var getStructuredSourceLeadScore = (source) => source.score * getStructuredSectionScoreWeight(source.metadata);
|
|
4786
|
+
var getPreferredSourceLeadChunk = (chunks) => chunks.slice().sort((left, right) => {
|
|
4787
|
+
const leftWeightedScore = getStructuredSourceLeadScore(left);
|
|
4788
|
+
const rightWeightedScore = getStructuredSourceLeadScore(right);
|
|
4789
|
+
if (rightWeightedScore !== leftWeightedScore) {
|
|
4790
|
+
return rightWeightedScore - leftWeightedScore;
|
|
4791
|
+
}
|
|
4792
|
+
if (right.score !== left.score) {
|
|
4793
|
+
return right.score - left.score;
|
|
4794
|
+
}
|
|
4795
|
+
return left.chunkId.localeCompare(right.chunkId);
|
|
4796
|
+
})[0];
|
|
4723
4797
|
var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
4724
|
-
const totalScore = sources.reduce((sum, source) => sum + source.score, 0);
|
|
4798
|
+
const totalScore = sources.reduce((sum, source) => sum + source.score * getStructuredSectionScoreWeight(source.metadata), 0);
|
|
4725
4799
|
if (sources.length === 0 || totalScore <= 0) {
|
|
4726
4800
|
return [];
|
|
4727
4801
|
}
|
|
4728
4802
|
const sections = new Map;
|
|
4729
4803
|
for (const source of sources) {
|
|
4804
|
+
const structuredScore = source.score * getStructuredSectionScoreWeight(source.metadata);
|
|
4730
4805
|
const path = getSectionPathFromSource(source);
|
|
4731
4806
|
if (!path) {
|
|
4732
4807
|
continue;
|
|
@@ -4758,7 +4833,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4758
4833
|
sourceSet: new Set(source.source ? [source.source] : []),
|
|
4759
4834
|
topChunkId: source.chunkId,
|
|
4760
4835
|
topSource: source.source,
|
|
4761
|
-
totalScore:
|
|
4836
|
+
totalScore: structuredScore,
|
|
4762
4837
|
transformedHits,
|
|
4763
4838
|
variantHits,
|
|
4764
4839
|
vectorHits
|
|
@@ -4766,7 +4841,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4766
4841
|
continue;
|
|
4767
4842
|
}
|
|
4768
4843
|
existing.count += 1;
|
|
4769
|
-
existing.totalScore +=
|
|
4844
|
+
existing.totalScore += structuredScore;
|
|
4770
4845
|
if (source.source) {
|
|
4771
4846
|
existing.sourceSet.add(source.source);
|
|
4772
4847
|
}
|
|
@@ -4794,6 +4869,8 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4794
4869
|
const parentTotal = siblingPool.reduce((sum, entry) => sum + entry.totalScore, 0);
|
|
4795
4870
|
const scoreShare = section.totalScore / totalScore;
|
|
4796
4871
|
const parentShare = parentTotal > 0 ? section.totalScore / parentTotal : undefined;
|
|
4872
|
+
const topChunk = sources.find((source) => source.chunkId === section.topChunkId);
|
|
4873
|
+
const topContextLabel = topChunk?.labels?.contextLabel ?? buildContextLabel2(topChunk?.metadata);
|
|
4797
4874
|
const parentDistribution = parentTotal > 0 ? siblingPool.map((entry) => ({
|
|
4798
4875
|
count: entry.count,
|
|
4799
4876
|
isActive: entry.key === section.key,
|
|
@@ -4919,6 +4996,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4919
4996
|
reasons.push("concentrated_evidence");
|
|
4920
4997
|
}
|
|
4921
4998
|
const summaryParts = [
|
|
4999
|
+
isBlockAwareContextLabel(topContextLabel) ? topContextLabel : "",
|
|
4922
5000
|
`${section.count} hit${section.count === 1 ? "" : "s"}`,
|
|
4923
5001
|
`${(scoreShare * 100).toFixed(0)}% score share`,
|
|
4924
5002
|
`vector ${section.vectorHits} \xB7 lexical ${section.lexicalHits} \xB7 hybrid ${section.hybridHits}`,
|
|
@@ -5130,22 +5208,21 @@ var updateSourceGroup = (groups, source) => {
|
|
|
5130
5208
|
groups.set(key, buildSourceGroup(source, key));
|
|
5131
5209
|
return;
|
|
5132
5210
|
}
|
|
5133
|
-
|
|
5134
|
-
existing.bestScore = source.score;
|
|
5135
|
-
existing.label = buildSourceLabel2(source);
|
|
5136
|
-
existing.labels = source.labels ?? buildRAGSourceLabels({
|
|
5137
|
-
metadata: source.metadata,
|
|
5138
|
-
source: source.source,
|
|
5139
|
-
title: source.title
|
|
5140
|
-
});
|
|
5141
|
-
existing.structure = source.structure ?? buildRAGChunkStructure(source.metadata);
|
|
5142
|
-
existing.source = source.source;
|
|
5143
|
-
existing.title = source.title;
|
|
5144
|
-
} else {
|
|
5145
|
-
existing.bestScore = Math.max(existing.bestScore, source.score);
|
|
5146
|
-
}
|
|
5211
|
+
existing.bestScore = Math.max(existing.bestScore, source.score);
|
|
5147
5212
|
existing.count += 1;
|
|
5148
5213
|
existing.chunks.push(source);
|
|
5214
|
+
const leadChunk = getPreferredSourceLeadChunk(existing.chunks);
|
|
5215
|
+
if (leadChunk) {
|
|
5216
|
+
existing.label = buildSourceLabel2(leadChunk);
|
|
5217
|
+
existing.labels = leadChunk.labels ?? buildRAGSourceLabels({
|
|
5218
|
+
metadata: leadChunk.metadata,
|
|
5219
|
+
source: leadChunk.source,
|
|
5220
|
+
title: leadChunk.title
|
|
5221
|
+
});
|
|
5222
|
+
existing.structure = leadChunk.structure ?? buildRAGChunkStructure(leadChunk.metadata);
|
|
5223
|
+
existing.source = leadChunk.source;
|
|
5224
|
+
existing.title = leadChunk.title;
|
|
5225
|
+
}
|
|
5149
5226
|
};
|
|
5150
5227
|
var getLatestAssistantMessage = (messages) => {
|
|
5151
5228
|
for (let index = messages.length - 1;index >= 0; index -= 1) {
|
|
@@ -8482,6 +8559,55 @@ var scoreLoosePhraseMatch2 = (query, text) => {
|
|
|
8482
8559
|
}
|
|
8483
8560
|
return 0;
|
|
8484
8561
|
};
|
|
8562
|
+
var queryHasAnyToken = (queryTokens, candidates) => candidates.some((candidate) => queryTokens.includes(candidate));
|
|
8563
|
+
var scoreStructuredEvidenceMatch = (queryTokens, result) => {
|
|
8564
|
+
const metadata = result.metadata ?? {};
|
|
8565
|
+
const pdfTextKind = typeof metadata.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
|
|
8566
|
+
const officeBlockKind = typeof metadata.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
|
|
8567
|
+
const hasBlockMetadata = typeof metadata.pdfBlockNumber === "number" || typeof metadata.officeBlockNumber === "number";
|
|
8568
|
+
let score = 0;
|
|
8569
|
+
if (hasBlockMetadata) {
|
|
8570
|
+
score += 0.12;
|
|
8571
|
+
}
|
|
8572
|
+
if (pdfTextKind === "table_like" && queryHasAnyToken(queryTokens, [
|
|
8573
|
+
"table",
|
|
8574
|
+
"row",
|
|
8575
|
+
"rows",
|
|
8576
|
+
"column",
|
|
8577
|
+
"columns",
|
|
8578
|
+
"spreadsheet",
|
|
8579
|
+
"sheet",
|
|
8580
|
+
"workbook"
|
|
8581
|
+
])) {
|
|
8582
|
+
score += 0.65;
|
|
8583
|
+
}
|
|
8584
|
+
if (officeBlockKind === "table" && queryHasAnyToken(queryTokens, [
|
|
8585
|
+
"table",
|
|
8586
|
+
"row",
|
|
8587
|
+
"rows",
|
|
8588
|
+
"column",
|
|
8589
|
+
"columns",
|
|
8590
|
+
"matrix",
|
|
8591
|
+
"grid"
|
|
8592
|
+
])) {
|
|
8593
|
+
score += 0.55;
|
|
8594
|
+
}
|
|
8595
|
+
if (officeBlockKind === "list" && queryHasAnyToken(queryTokens, [
|
|
8596
|
+
"list",
|
|
8597
|
+
"checklist",
|
|
8598
|
+
"bullet",
|
|
8599
|
+
"bullets",
|
|
8600
|
+
"step",
|
|
8601
|
+
"steps",
|
|
8602
|
+
"task",
|
|
8603
|
+
"tasks",
|
|
8604
|
+
"item",
|
|
8605
|
+
"items"
|
|
8606
|
+
])) {
|
|
8607
|
+
score += 0.55;
|
|
8608
|
+
}
|
|
8609
|
+
return score;
|
|
8610
|
+
};
|
|
8485
8611
|
var scoreHeuristicMatch = ({
|
|
8486
8612
|
query,
|
|
8487
8613
|
queryTokens,
|
|
@@ -8498,7 +8624,8 @@ var scoreHeuristicMatch = ({
|
|
|
8498
8624
|
const exactPhraseBoost = Math.max(normalizeText([result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")).includes(queryTokens.join(" ")) ? 1 : 0, scoreLoosePhraseMatch2(query, [result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")));
|
|
8499
8625
|
const sourcePathBoost = typeof result.source === "string" && queryTokens.some((token) => result.source?.toLowerCase().includes(token)) ? 0.5 : 0;
|
|
8500
8626
|
const metadataBoost = metadataValues.length > 0 ? queryTokens.filter((token) => metadataValues.some((value) => value.toLowerCase().includes(token))).length / queryTokens.length : 0;
|
|
8501
|
-
|
|
8627
|
+
const structuredEvidenceBoost = scoreStructuredEvidenceMatch(queryTokens, result);
|
|
8628
|
+
return result.score + overlapBoost + exactPhraseBoost + sourcePathBoost + metadataBoost + structuredEvidenceBoost;
|
|
8502
8629
|
};
|
|
8503
8630
|
var normalizeText = (value) => tokenize3(value).join(" ");
|
|
8504
8631
|
var applyRAGReranking = async ({
|
|
@@ -8733,32 +8860,59 @@ var stripHtmlTags = (value) => {
|
|
|
8733
8860
|
`).replace(/<li\b[^>]*>/gi, "- ").replace(/<[^>]+>/g, " ");
|
|
8734
8861
|
return decodeHtmlEntities(withoutTags);
|
|
8735
8862
|
};
|
|
8863
|
+
var stripHtmlNoiseBlocks = (value) => value.replace(/<!--[\s\S]*?-->/g, " ").replace(/<(script|style|template|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(hidden|aria-hidden=(['"])true\3)[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<(nav|footer|header|aside|form|dialog)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(nav|menu|footer|header|sidebar|promo|banner|cookie|breadcrumb|share|social|subscribe|newsletter|modal)[^'"]*\2[^>]*>[\s\S]*?<\/\1>/gi, " ");
|
|
8864
|
+
var collectHtmlContentCandidates = (value) => {
|
|
8865
|
+
const patterns = [
|
|
8866
|
+
{
|
|
8867
|
+
contentGroup: 1,
|
|
8868
|
+
pattern: /<main\b[^>]*>([\s\S]*?)<\/main>/gi
|
|
8869
|
+
},
|
|
8870
|
+
{
|
|
8871
|
+
contentGroup: 1,
|
|
8872
|
+
pattern: /<article\b[^>]*>([\s\S]*?)<\/article>/gi
|
|
8873
|
+
},
|
|
8874
|
+
{
|
|
8875
|
+
contentGroup: 3,
|
|
8876
|
+
pattern: /<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/gi
|
|
8877
|
+
},
|
|
8878
|
+
{
|
|
8879
|
+
contentGroup: 4,
|
|
8880
|
+
pattern: /<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(content|article|main|post|body)[^'"]*\2[^>]*>([\s\S]*?)<\/\1>/gi
|
|
8881
|
+
}
|
|
8882
|
+
];
|
|
8883
|
+
const candidates = [];
|
|
8884
|
+
for (const entry of patterns) {
|
|
8885
|
+
for (const match of value.matchAll(entry.pattern)) {
|
|
8886
|
+
const rawCandidate = match[entry.contentGroup];
|
|
8887
|
+
const candidate = typeof rawCandidate === "string" ? rawCandidate : "";
|
|
8888
|
+
if (candidate.trim()) {
|
|
8889
|
+
candidates.push(candidate.trim());
|
|
8890
|
+
}
|
|
8891
|
+
}
|
|
8892
|
+
}
|
|
8893
|
+
return candidates;
|
|
8894
|
+
};
|
|
8736
8895
|
var extractMainHtmlContent = (value) => {
|
|
8737
8896
|
const trimmed = value.trim();
|
|
8738
8897
|
if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
|
|
8739
8898
|
return value;
|
|
8740
8899
|
}
|
|
8741
|
-
const
|
|
8742
|
-
const
|
|
8743
|
-
if (
|
|
8744
|
-
|
|
8745
|
-
|
|
8746
|
-
|
|
8747
|
-
|
|
8748
|
-
|
|
8749
|
-
|
|
8750
|
-
|
|
8751
|
-
`);
|
|
8752
|
-
}
|
|
8753
|
-
const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
|
|
8754
|
-
if (roleMainMatch?.[3]) {
|
|
8755
|
-
return roleMainMatch[3];
|
|
8900
|
+
const stripped = stripHtmlNoiseBlocks(trimmed);
|
|
8901
|
+
const candidates = collectHtmlContentCandidates(stripped);
|
|
8902
|
+
if (candidates.length > 0) {
|
|
8903
|
+
const bestCandidate = candidates.map((candidate) => ({
|
|
8904
|
+
candidate,
|
|
8905
|
+
score: stripHtmlTags(candidate).replace(/\s+/g, " ").trim().length
|
|
8906
|
+
})).sort((left, right) => right.score - left.score)[0]?.candidate;
|
|
8907
|
+
if (bestCandidate) {
|
|
8908
|
+
return bestCandidate;
|
|
8909
|
+
}
|
|
8756
8910
|
}
|
|
8757
|
-
const bodyMatch =
|
|
8911
|
+
const bodyMatch = stripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
|
|
8758
8912
|
if (bodyMatch?.[1]) {
|
|
8759
8913
|
return bodyMatch[1];
|
|
8760
8914
|
}
|
|
8761
|
-
return
|
|
8915
|
+
return stripped;
|
|
8762
8916
|
};
|
|
8763
8917
|
var stripHtml = (value) => {
|
|
8764
8918
|
const focused = extractMainHtmlContent(value);
|
|
@@ -8776,6 +8930,93 @@ var stripMarkdown = (value) => {
|
|
|
8776
8930
|
`);
|
|
8777
8931
|
return normalizeWhitespace(stripped);
|
|
8778
8932
|
};
|
|
8933
|
+
var pdfNativeStructureUnits = (metadata) => {
|
|
8934
|
+
const blocks = Array.isArray(metadata?.pdfTextBlocks) ? metadata.pdfTextBlocks : [];
|
|
8935
|
+
const units = [];
|
|
8936
|
+
for (const block of blocks) {
|
|
8937
|
+
if (!block || typeof block !== "object") {
|
|
8938
|
+
continue;
|
|
8939
|
+
}
|
|
8940
|
+
const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
|
|
8941
|
+
if (!text) {
|
|
8942
|
+
continue;
|
|
8943
|
+
}
|
|
8944
|
+
const pageNumber = typeof block.pageNumber === "number" && Number.isFinite(block.pageNumber) ? block.pageNumber : undefined;
|
|
8945
|
+
const pdfBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
|
|
8946
|
+
const pdfTextKind = block.textKind === "table_like" ? "table_like" : "paragraph";
|
|
8947
|
+
const sectionTitle = pageNumber ? pdfTextKind === "table_like" ? `Page ${pageNumber} Table Block` : `Page ${pageNumber} Text Block` : pdfTextKind === "table_like" ? "Table Block" : "Text Block";
|
|
8948
|
+
units.push({
|
|
8949
|
+
pageNumber,
|
|
8950
|
+
pdfBlockNumber,
|
|
8951
|
+
pdfTextKind,
|
|
8952
|
+
preferredChunkUnits: pdfTextKind === "table_like" ? text.split(`
|
|
8953
|
+
`).filter(Boolean) : undefined,
|
|
8954
|
+
sectionDepth: 1,
|
|
8955
|
+
sectionKind: "pdf_block",
|
|
8956
|
+
sectionPath: [sectionTitle],
|
|
8957
|
+
sectionTitle,
|
|
8958
|
+
text
|
|
8959
|
+
});
|
|
8960
|
+
}
|
|
8961
|
+
return units;
|
|
8962
|
+
};
|
|
8963
|
+
var officeNativeStructureUnits = (metadata) => {
|
|
8964
|
+
const blocks = Array.isArray(metadata?.officeBlocks) ? metadata.officeBlocks : [];
|
|
8965
|
+
const units = [];
|
|
8966
|
+
const headingStack = [];
|
|
8967
|
+
const decorateOfficeSectionText = (text, sectionTitle) => {
|
|
8968
|
+
if (!sectionTitle || text.includes(sectionTitle)) {
|
|
8969
|
+
return text;
|
|
8970
|
+
}
|
|
8971
|
+
return normalizeWhitespace(`${sectionTitle}
|
|
8972
|
+
${text}`);
|
|
8973
|
+
};
|
|
8974
|
+
for (const [index, block] of blocks.entries()) {
|
|
8975
|
+
if (!block || typeof block !== "object") {
|
|
8976
|
+
continue;
|
|
8977
|
+
}
|
|
8978
|
+
const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
|
|
8979
|
+
if (!text) {
|
|
8980
|
+
continue;
|
|
8981
|
+
}
|
|
8982
|
+
const officeBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
|
|
8983
|
+
const officeBlockKind = block.blockKind === "title" || block.blockKind === "heading" || block.blockKind === "list" || block.blockKind === "table" ? block.blockKind : "paragraph";
|
|
8984
|
+
const headingLevel = typeof block.headingLevel === "number" && Number.isFinite(block.headingLevel) ? block.headingLevel : undefined;
|
|
8985
|
+
if (officeBlockKind === "title" || officeBlockKind === "heading") {
|
|
8986
|
+
const level = officeBlockKind === "title" ? 1 : headingLevel ?? 1;
|
|
8987
|
+
headingStack[level - 1] = text;
|
|
8988
|
+
headingStack.length = level;
|
|
8989
|
+
const nextBlock = blocks[index + 1];
|
|
8990
|
+
const nextKind = nextBlock && typeof nextBlock === "object" ? nextBlock.blockKind : undefined;
|
|
8991
|
+
if (nextKind === "title" || nextKind === "heading" || nextKind === "list" || nextKind === "table" || !nextBlock) {
|
|
8992
|
+
units.push({
|
|
8993
|
+
officeBlockKind,
|
|
8994
|
+
officeBlockNumber,
|
|
8995
|
+
sectionDepth: headingStack.length,
|
|
8996
|
+
sectionKind: "office_heading",
|
|
8997
|
+
sectionPath: [...headingStack],
|
|
8998
|
+
sectionTitle: text,
|
|
8999
|
+
text
|
|
9000
|
+
});
|
|
9001
|
+
}
|
|
9002
|
+
continue;
|
|
9003
|
+
}
|
|
9004
|
+
const sectionPath = headingStack.length > 0 ? [...headingStack] : undefined;
|
|
9005
|
+
const sectionTitle = sectionPath?.at(-1);
|
|
9006
|
+
units.push({
|
|
9007
|
+
officeBlockKind,
|
|
9008
|
+
officeBlockNumber,
|
|
9009
|
+
preferredChunkUnits: officeBlockKind === "table" ? text.split(`
|
|
9010
|
+
`).filter(Boolean) : undefined,
|
|
9011
|
+
sectionDepth: sectionPath?.length,
|
|
9012
|
+
sectionKind: officeBlockKind === "paragraph" ? "office_heading" : "office_block",
|
|
9013
|
+
sectionPath,
|
|
9014
|
+
sectionTitle,
|
|
9015
|
+
text: officeBlockKind === "paragraph" ? decorateOfficeSectionText(text, sectionTitle) : text
|
|
9016
|
+
});
|
|
9017
|
+
}
|
|
9018
|
+
return units;
|
|
9019
|
+
};
|
|
8779
9020
|
var markdownStructureUnits = (value) => {
|
|
8780
9021
|
const lines = value.replace(/\r\n?/g, `
|
|
8781
9022
|
`).split(`
|
|
@@ -9119,6 +9360,7 @@ var appendPdfLineBreak = (parts) => {
|
|
|
9119
9360
|
parts.push(`
|
|
9120
9361
|
`);
|
|
9121
9362
|
};
|
|
9363
|
+
var PDF_CHROME_LINE_MAX_LENGTH = 80;
|
|
9122
9364
|
var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
|
|
9123
9365
|
var extractTextFromPDFTextObject = (value) => {
|
|
9124
9366
|
const parts = [];
|
|
@@ -9147,19 +9389,84 @@ var extractTextFromPDFTextObject = (value) => {
|
|
|
9147
9389
|
}
|
|
9148
9390
|
return parts.join("");
|
|
9149
9391
|
};
|
|
9150
|
-
var
|
|
9151
|
-
const
|
|
9152
|
-
|
|
9153
|
-
|
|
9154
|
-
|
|
9155
|
-
|
|
9156
|
-
`);
|
|
9157
|
-
|
|
9392
|
+
var buildPDFNativeTextBlock = (text, blockNumber, pageNumber) => {
|
|
9393
|
+
const normalized = normalizeWhitespace(text);
|
|
9394
|
+
if (!normalized) {
|
|
9395
|
+
return;
|
|
9396
|
+
}
|
|
9397
|
+
const lineCount = normalized.split(`
|
|
9398
|
+
`).filter(Boolean).length;
|
|
9399
|
+
const textKind = normalized.includes(" | ") ? "table_like" : "paragraph";
|
|
9400
|
+
return {
|
|
9401
|
+
blockNumber,
|
|
9402
|
+
lineCount,
|
|
9403
|
+
pageNumber,
|
|
9404
|
+
text: normalized,
|
|
9405
|
+
textKind
|
|
9406
|
+
};
|
|
9407
|
+
};
|
|
9408
|
+
var isLikelyPDFPageLabel = (value) => /^page\s+\d+(?:\s+of\s+\d+)?$/i.test(value.trim());
|
|
9409
|
+
var suppressRepeatedPDFChrome = (blocks) => {
|
|
9410
|
+
const linePages = new Map;
|
|
9411
|
+
for (const block of blocks) {
|
|
9412
|
+
for (const line of block.text.split(`
|
|
9413
|
+
`)) {
|
|
9414
|
+
const normalized = normalizeWhitespace(line);
|
|
9415
|
+
if (!normalized || normalized.length > PDF_CHROME_LINE_MAX_LENGTH) {
|
|
9416
|
+
continue;
|
|
9417
|
+
}
|
|
9418
|
+
const pages = linePages.get(normalized) ?? new Set;
|
|
9419
|
+
pages.add(block.pageNumber);
|
|
9420
|
+
linePages.set(normalized, pages);
|
|
9421
|
+
}
|
|
9422
|
+
}
|
|
9423
|
+
return blocks.map((block) => {
|
|
9424
|
+
const keptLines = block.text.split(`
|
|
9425
|
+
`).map((line) => normalizeWhitespace(line)).filter((line) => {
|
|
9426
|
+
if (!line) {
|
|
9427
|
+
return false;
|
|
9428
|
+
}
|
|
9429
|
+
if (isLikelyPDFPageLabel(line)) {
|
|
9430
|
+
return false;
|
|
9431
|
+
}
|
|
9432
|
+
const repeatedPages = linePages.get(line);
|
|
9433
|
+
if (line.length <= PDF_CHROME_LINE_MAX_LENGTH && repeatedPages && repeatedPages.size > 1) {
|
|
9434
|
+
return false;
|
|
9435
|
+
}
|
|
9436
|
+
return true;
|
|
9437
|
+
});
|
|
9438
|
+
const text = normalizeWhitespace(keptLines.join(`
|
|
9439
|
+
`));
|
|
9440
|
+
if (!text) {
|
|
9441
|
+
return;
|
|
9442
|
+
}
|
|
9443
|
+
return buildPDFNativeTextBlock(text, block.blockNumber, block.pageNumber);
|
|
9444
|
+
}).filter((value) => Boolean(value));
|
|
9158
9445
|
};
|
|
9159
|
-
var
|
|
9446
|
+
var extractNativePDFText = (data) => {
|
|
9160
9447
|
const raw = Buffer.from(data).toString("latin1");
|
|
9161
9448
|
const count = [...raw.matchAll(/\/Type\s*\/Page\b/g)].length;
|
|
9162
|
-
|
|
9449
|
+
const pageCount = count > 0 ? count : 1;
|
|
9450
|
+
const pageMarkers = [...raw.matchAll(/\/Type\s*\/Page\b/g)].map((match) => match.index ?? raw.length);
|
|
9451
|
+
const blocks = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match, index) => {
|
|
9452
|
+
const blockText = extractTextFromPDFTextObject(match[1] ?? "");
|
|
9453
|
+
const objectEnd = (match.index ?? 0) + (match[0]?.length ?? 0);
|
|
9454
|
+
const pageIndex = pageMarkers.findIndex((marker) => marker >= objectEnd);
|
|
9455
|
+
const pageNumber = pageIndex >= 0 ? pageIndex + 1 : pageCount;
|
|
9456
|
+
return buildPDFNativeTextBlock(blockText, index + 1, pageNumber);
|
|
9457
|
+
}).filter((value) => Boolean(value));
|
|
9458
|
+
const visibleBlocks = suppressRepeatedPDFChrome(blocks);
|
|
9459
|
+
const fallbackText = [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
|
|
9460
|
+
`);
|
|
9461
|
+
const text = visibleBlocks.length > 0 ? normalizeWhitespace(visibleBlocks.map((block) => block.text).join(`
|
|
9462
|
+
|
|
9463
|
+
`)) : normalizeWhitespace(fallbackText);
|
|
9464
|
+
return {
|
|
9465
|
+
pageCount,
|
|
9466
|
+
text,
|
|
9467
|
+
textBlockCount: visibleBlocks.length,
|
|
9468
|
+
textBlocks: visibleBlocks
|
|
9469
|
+
};
|
|
9163
9470
|
};
|
|
9164
9471
|
var readUInt16LE = (data, offset) => data[offset] | data[offset + 1] << 8;
|
|
9165
9472
|
var readUInt32LE = (data, offset) => (data[offset] | data[offset + 1] << 8 | data[offset + 2] << 16 | data[offset + 3] << 24) >>> 0;
|
|
@@ -9246,35 +9553,64 @@ var decodeGzipEntries = (data, input) => {
|
|
|
9246
9553
|
var extractXmlText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ")));
|
|
9247
9554
|
var extractOfficeParagraphText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<w:tab\b[^>]*\/>/gi, "\t").replace(/<w:br\b[^>]*\/>/gi, `
|
|
9248
9555
|
`).replace(/<[^>]+>/g, " ")));
|
|
9249
|
-
var
|
|
9556
|
+
var officeDocumentBlocks = (entries) => {
|
|
9250
9557
|
const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
|
|
9251
9558
|
if (!documentEntry) {
|
|
9252
9559
|
return [];
|
|
9253
9560
|
}
|
|
9254
9561
|
const xml = decodeUtf8(documentEntry.data);
|
|
9255
|
-
const
|
|
9256
|
-
|
|
9257
|
-
|
|
9258
|
-
|
|
9562
|
+
const bodyMatch = xml.match(/<w:body\b[^>]*>([\s\S]*?)<\/w:body>/i);
|
|
9563
|
+
const body = bodyMatch?.[1] ?? xml;
|
|
9564
|
+
const blocks = [];
|
|
9565
|
+
const blockPattern = /<(w:p|w:tbl)\b[\s\S]*?<\/\1>/g;
|
|
9566
|
+
for (const match of body.matchAll(blockPattern)) {
|
|
9567
|
+
const blockXml = match[0] ?? "";
|
|
9568
|
+
if (blockXml.startsWith("<w:tbl")) {
|
|
9569
|
+
const rows = [...blockXml.matchAll(/<w:tr\b[\s\S]*?<\/w:tr>/g)].map((rowMatch, rowIndex) => {
|
|
9570
|
+
const cells = [
|
|
9571
|
+
...(rowMatch[0] ?? "").matchAll(/<w:tc\b[\s\S]*?<\/w:tc>/g)
|
|
9572
|
+
].map((cellMatch) => extractOfficeParagraphText(cellMatch[0] ?? "")).filter(Boolean);
|
|
9573
|
+
if (cells.length === 0) {
|
|
9574
|
+
return "";
|
|
9575
|
+
}
|
|
9576
|
+
return `Row ${rowIndex + 1}. ${cells.map((cell, cellIndex) => `${String.fromCharCode(65 + cellIndex)}: ${cell}`).join(" | ")}`;
|
|
9577
|
+
}).filter(Boolean);
|
|
9578
|
+
const text2 = normalizeWhitespace(rows.join(`
|
|
9579
|
+
`));
|
|
9580
|
+
if (!text2) {
|
|
9581
|
+
continue;
|
|
9582
|
+
}
|
|
9583
|
+
blocks.push({
|
|
9584
|
+
blockKind: "table",
|
|
9585
|
+
blockNumber: blocks.length + 1,
|
|
9586
|
+
text: text2
|
|
9587
|
+
});
|
|
9588
|
+
continue;
|
|
9589
|
+
}
|
|
9590
|
+
const text = extractOfficeParagraphText(blockXml);
|
|
9259
9591
|
if (!text) {
|
|
9260
|
-
|
|
9592
|
+
continue;
|
|
9261
9593
|
}
|
|
9262
|
-
const styleMatch =
|
|
9594
|
+
const styleMatch = blockXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
|
|
9263
9595
|
const style = (styleMatch?.[1] ?? "").toLowerCase();
|
|
9264
|
-
if (style === "title") {
|
|
9265
|
-
return text;
|
|
9266
|
-
}
|
|
9267
9596
|
const headingMatch = style.match(/^heading([1-6])$/);
|
|
9268
|
-
|
|
9269
|
-
|
|
9270
|
-
}
|
|
9271
|
-
|
|
9272
|
-
|
|
9597
|
+
const isListParagraph = /<w:numPr\b/i.test(blockXml) || style.includes("list") || style.includes("bullet");
|
|
9598
|
+
const blockKind = style === "title" ? "title" : headingMatch ? "heading" : isListParagraph ? "list" : "paragraph";
|
|
9599
|
+
const decoratedText = blockKind === "list" && !/^[-*]\s/.test(text) ? `- ${text}` : text;
|
|
9600
|
+
blocks.push({
|
|
9601
|
+
blockKind,
|
|
9602
|
+
blockNumber: blocks.length + 1,
|
|
9603
|
+
headingLevel: headingMatch ? Number.parseInt(headingMatch[1] ?? "1", 10) : undefined,
|
|
9604
|
+
style: style || undefined,
|
|
9605
|
+
text: decoratedText
|
|
9606
|
+
});
|
|
9607
|
+
}
|
|
9608
|
+
return blocks;
|
|
9273
9609
|
};
|
|
9274
9610
|
var officeDocumentText = (entries) => {
|
|
9275
|
-
const
|
|
9276
|
-
if (
|
|
9277
|
-
return normalizeWhitespace(
|
|
9611
|
+
const blocks = officeDocumentBlocks(entries);
|
|
9612
|
+
if (blocks.length > 0) {
|
|
9613
|
+
return normalizeWhitespace(blocks.map((block) => block.text).join(`
|
|
9278
9614
|
|
|
9279
9615
|
`));
|
|
9280
9616
|
}
|
|
@@ -9285,11 +9621,7 @@ var officeDocumentText = (entries) => {
|
|
|
9285
9621
|
return extractXmlText(decodeUtf8(documentEntry.data));
|
|
9286
9622
|
};
|
|
9287
9623
|
var officeDocumentSectionCount = (entries) => {
|
|
9288
|
-
const
|
|
9289
|
-
if (!documentEntry) {
|
|
9290
|
-
return;
|
|
9291
|
-
}
|
|
9292
|
-
const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
|
|
9624
|
+
const count = officeDocumentBlocks(entries).length;
|
|
9293
9625
|
return count > 0 ? count : undefined;
|
|
9294
9626
|
};
|
|
9295
9627
|
var spreadsheetSharedStrings = (entries) => entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
|
|
@@ -9819,8 +10151,10 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
9819
10151
|
let officeMetadata = {};
|
|
9820
10152
|
let structuredDocuments = [];
|
|
9821
10153
|
if (extension === ".docx" || extension === ".odt") {
|
|
10154
|
+
const officeBlocks = officeDocumentBlocks(entries);
|
|
9822
10155
|
text = officeDocumentText(entries);
|
|
9823
10156
|
officeMetadata = {
|
|
10157
|
+
officeBlocks,
|
|
9824
10158
|
sectionCount: officeDocumentSectionCount(entries)
|
|
9825
10159
|
};
|
|
9826
10160
|
} else if (extension === ".xlsx" || extension === ".ods") {
|
|
@@ -10010,8 +10344,8 @@ var createPDFFileExtractor = () => ({
|
|
|
10010
10344
|
name: "absolute_pdf",
|
|
10011
10345
|
supports: pdfExtractorSupports,
|
|
10012
10346
|
extract: (input) => {
|
|
10013
|
-
const
|
|
10014
|
-
if (!text) {
|
|
10347
|
+
const extracted = extractNativePDFText(input.data);
|
|
10348
|
+
if (!extracted.text) {
|
|
10015
10349
|
throw new Error("AbsoluteJS could not extract readable text from this PDF. Supply a custom extractor for scanned or image-only PDFs.");
|
|
10016
10350
|
}
|
|
10017
10351
|
return {
|
|
@@ -10021,10 +10355,12 @@ var createPDFFileExtractor = () => ({
|
|
|
10021
10355
|
metadata: {
|
|
10022
10356
|
...input.metadata ?? {},
|
|
10023
10357
|
fileKind: "pdf",
|
|
10024
|
-
pageCount:
|
|
10358
|
+
pageCount: extracted.pageCount,
|
|
10359
|
+
pdfTextBlockCount: extracted.textBlockCount,
|
|
10360
|
+
pdfTextBlocks: extracted.textBlocks
|
|
10025
10361
|
},
|
|
10026
10362
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
10027
|
-
text,
|
|
10363
|
+
text: extracted.text,
|
|
10028
10364
|
title: input.title
|
|
10029
10365
|
};
|
|
10030
10366
|
}
|
|
@@ -10049,7 +10385,8 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
10049
10385
|
name: `absolute_pdf_ocr:${options.provider.name}`,
|
|
10050
10386
|
supports: pdfExtractorSupports,
|
|
10051
10387
|
extract: async (input) => {
|
|
10052
|
-
const
|
|
10388
|
+
const extracted = extractNativePDFText(input.data);
|
|
10389
|
+
const nativeText = extracted.text;
|
|
10053
10390
|
const minLength = options.minExtractedTextLength ?? 80;
|
|
10054
10391
|
const shouldUseNativeText = !options.alwaysOCR && nativeText.length >= minLength;
|
|
10055
10392
|
if (shouldUseNativeText) {
|
|
@@ -10060,7 +10397,9 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
10060
10397
|
metadata: {
|
|
10061
10398
|
...input.metadata ?? {},
|
|
10062
10399
|
fileKind: "pdf",
|
|
10063
|
-
pageCount:
|
|
10400
|
+
pageCount: extracted.pageCount,
|
|
10401
|
+
pdfTextBlockCount: extracted.textBlockCount,
|
|
10402
|
+
pdfTextBlocks: extracted.textBlocks,
|
|
10064
10403
|
pdfTextMode: "native"
|
|
10065
10404
|
},
|
|
10066
10405
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
@@ -10075,7 +10414,7 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
10075
10414
|
const baseMetadata = {
|
|
10076
10415
|
...ocrMetadata(ocr),
|
|
10077
10416
|
fileKind: "pdf",
|
|
10078
|
-
pageCount:
|
|
10417
|
+
pageCount: extracted.pageCount,
|
|
10079
10418
|
pdfTextMode: "ocr"
|
|
10080
10419
|
};
|
|
10081
10420
|
const summaryDocument = {
|
|
@@ -10248,6 +10587,18 @@ var sourceAwareUnits = (document, format, normalizedText) => {
|
|
|
10248
10587
|
}
|
|
10249
10588
|
case "text":
|
|
10250
10589
|
default:
|
|
10590
|
+
if (document.metadata?.fileKind === "office") {
|
|
10591
|
+
const sections = officeNativeStructureUnits(document.metadata);
|
|
10592
|
+
if (sections.length > 0) {
|
|
10593
|
+
return sections;
|
|
10594
|
+
}
|
|
10595
|
+
}
|
|
10596
|
+
if (document.metadata?.fileKind === "pdf") {
|
|
10597
|
+
const sections = pdfNativeStructureUnits(document.metadata);
|
|
10598
|
+
if (sections.length > 0) {
|
|
10599
|
+
return sections;
|
|
10600
|
+
}
|
|
10601
|
+
}
|
|
10251
10602
|
if (document.metadata?.sourceNativeKind === "spreadsheet_sheet") {
|
|
10252
10603
|
return spreadsheetStructureUnits(normalizedText, document.metadata);
|
|
10253
10604
|
}
|
|
@@ -10571,6 +10922,11 @@ var prepareRAGDocument = (document, defaultChunking, chunkingRegistry) => {
|
|
|
10571
10922
|
...sectionTitle ? { sectionTitle } : {},
|
|
10572
10923
|
...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
|
|
10573
10924
|
...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
|
|
10925
|
+
...typeof entry.pageNumber === "number" ? { pageNumber: entry.pageNumber } : {},
|
|
10926
|
+
...typeof entry.officeBlockNumber === "number" ? { officeBlockNumber: entry.officeBlockNumber } : {},
|
|
10927
|
+
...entry.officeBlockKind ? { officeBlockKind: entry.officeBlockKind } : {},
|
|
10928
|
+
...typeof entry.pdfBlockNumber === "number" ? { pdfBlockNumber: entry.pdfBlockNumber } : {},
|
|
10929
|
+
...entry.pdfTextKind ? { pdfTextKind: entry.pdfTextKind } : {},
|
|
10574
10930
|
...entry.sectionKind ? { sectionKind: entry.sectionKind } : {},
|
|
10575
10931
|
...sectionChunkId ? { sectionChunkId } : {},
|
|
10576
10932
|
...sectionChunkId && sectionChunkIndex >= 0 ? {
|
|
@@ -10959,9 +11315,25 @@ var annotateRetrievalChannels = (input) => {
|
|
|
10959
11315
|
};
|
|
10960
11316
|
});
|
|
10961
11317
|
};
|
|
11318
|
+
var getStructuredSectionScoreWeight2 = (metadata) => {
|
|
11319
|
+
const pdfTextKind = typeof metadata?.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
|
|
11320
|
+
const officeBlockKind = typeof metadata?.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
|
|
11321
|
+
const sectionKind = typeof metadata?.sectionKind === "string" ? metadata.sectionKind : undefined;
|
|
11322
|
+
if (pdfTextKind === "table_like") {
|
|
11323
|
+
return 1.28;
|
|
11324
|
+
}
|
|
11325
|
+
if (officeBlockKind === "table" || officeBlockKind === "list") {
|
|
11326
|
+
return 1.24;
|
|
11327
|
+
}
|
|
11328
|
+
if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
|
|
11329
|
+
return 1.12;
|
|
11330
|
+
}
|
|
11331
|
+
return 1;
|
|
11332
|
+
};
|
|
10962
11333
|
var buildTraceSectionCounts = (results) => {
|
|
10963
11334
|
const sections = new Map;
|
|
10964
11335
|
for (const result of results) {
|
|
11336
|
+
const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
|
|
10965
11337
|
const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
|
|
10966
11338
|
if (path.length === 0) {
|
|
10967
11339
|
continue;
|
|
@@ -10988,6 +11360,7 @@ var buildTraceSectionCounts = (results) => {
|
|
|
10988
11360
|
var buildTraceSectionScores = (results) => {
|
|
10989
11361
|
const sections = new Map;
|
|
10990
11362
|
for (const result of results) {
|
|
11363
|
+
const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
|
|
10991
11364
|
const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
|
|
10992
11365
|
if (path.length === 0) {
|
|
10993
11366
|
continue;
|
|
@@ -10995,13 +11368,13 @@ var buildTraceSectionScores = (results) => {
|
|
|
10995
11368
|
const key = path.join(" > ");
|
|
10996
11369
|
const existing = sections.get(key);
|
|
10997
11370
|
if (existing) {
|
|
10998
|
-
existing.totalScore +=
|
|
11371
|
+
existing.totalScore += weightedScore;
|
|
10999
11372
|
continue;
|
|
11000
11373
|
}
|
|
11001
11374
|
sections.set(key, {
|
|
11002
11375
|
key,
|
|
11003
11376
|
label: path.at(-1) ?? key,
|
|
11004
|
-
totalScore:
|
|
11377
|
+
totalScore: weightedScore
|
|
11005
11378
|
});
|
|
11006
11379
|
}
|
|
11007
11380
|
return [...sections.values()].sort((left, right) => {
|
|
@@ -11440,11 +11813,32 @@ var renderSourceLabels = (input) => {
|
|
|
11440
11813
|
].filter((row) => row.length > 0);
|
|
11441
11814
|
return rows.length > 0 ? `<ul class="rag-source-labels">${rows.join("")}</ul>` : "";
|
|
11442
11815
|
};
|
|
11816
|
+
var formatStructureKindLabel = (kind) => {
|
|
11817
|
+
switch (kind) {
|
|
11818
|
+
case "markdown_heading":
|
|
11819
|
+
return "Markdown heading";
|
|
11820
|
+
case "html_heading":
|
|
11821
|
+
return "HTML heading";
|
|
11822
|
+
case "office_heading":
|
|
11823
|
+
return "Office heading";
|
|
11824
|
+
case "office_block":
|
|
11825
|
+
return "Office block";
|
|
11826
|
+
case "pdf_block":
|
|
11827
|
+
return "PDF block";
|
|
11828
|
+
case "spreadsheet_rows":
|
|
11829
|
+
return "Spreadsheet rows";
|
|
11830
|
+
case "presentation_slide":
|
|
11831
|
+
return "Presentation slide";
|
|
11832
|
+
default:
|
|
11833
|
+
return;
|
|
11834
|
+
}
|
|
11835
|
+
};
|
|
11443
11836
|
var renderChunkStructure = (structure) => {
|
|
11444
11837
|
if (!structure) {
|
|
11445
11838
|
return "";
|
|
11446
11839
|
}
|
|
11447
11840
|
const rows = [
|
|
11841
|
+
structure.section?.kind ? `<li><strong>Kind</strong> ${escapeHtml2(formatStructureKindLabel(structure.section.kind) ?? structure.section.kind)}</li>` : "",
|
|
11448
11842
|
structure.section?.title ? `<li><strong>Section</strong> ${escapeHtml2(structure.section.title)}</li>` : "",
|
|
11449
11843
|
structure.section?.path && structure.section.path.length > 1 ? `<li><strong>Section path</strong> ${escapeHtml2(structure.section.path.join(" > "))}</li>` : "",
|
|
11450
11844
|
typeof structure.sequence?.sectionChunkIndex === "number" && typeof structure.sequence?.sectionChunkCount === "number" ? `<li><strong>Section chunk</strong> ${structure.sequence.sectionChunkIndex + 1} of ${structure.sequence.sectionChunkCount}</li>` : "",
|
|
@@ -13688,6 +14082,24 @@ var ragChat = (config) => {
|
|
|
13688
14082
|
limit: 100,
|
|
13689
14083
|
store: config.retrievalReleaseIncidentStore
|
|
13690
14084
|
});
|
|
14085
|
+
const baselineCorpusGroups = config.retrievalBaselineStore ? await loadRAGRetrievalBaselines({
|
|
14086
|
+
limit: 200,
|
|
14087
|
+
store: config.retrievalBaselineStore
|
|
14088
|
+
}) : [];
|
|
14089
|
+
const comparisonRunCorpusGroups = config.retrievalComparisonHistoryStore ? await loadRAGRetrievalComparisonHistory({
|
|
14090
|
+
limit: 200,
|
|
14091
|
+
store: config.retrievalComparisonHistoryStore
|
|
14092
|
+
}) : [];
|
|
14093
|
+
const releaseDecisionCorpusGroups = config.retrievalReleaseDecisionStore ? await loadRAGRetrievalReleaseDecisions({
|
|
14094
|
+
limit: 200,
|
|
14095
|
+
store: config.retrievalReleaseDecisionStore
|
|
14096
|
+
}) : [];
|
|
14097
|
+
const resolveIncidentCorpusGroupKey = (groupKey) => {
|
|
14098
|
+
if (!groupKey) {
|
|
14099
|
+
return;
|
|
14100
|
+
}
|
|
14101
|
+
return existing.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey ?? comparisonRunCorpusGroups.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey ?? baselineCorpusGroups.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey ?? releaseDecisionCorpusGroups.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey;
|
|
14102
|
+
};
|
|
13691
14103
|
const nextByKey = new Map;
|
|
13692
14104
|
for (const candidate of input.promotionCandidates) {
|
|
13693
14105
|
if (!candidate.groupKey || !candidate.targetRolloutLabel) {
|
|
@@ -13701,6 +14113,7 @@ var ragChat = (config) => {
|
|
|
13701
14113
|
nextByKey.set(key, {
|
|
13702
14114
|
baselineRetrievalId: candidate.baselineRetrievalId,
|
|
13703
14115
|
candidateRetrievalId: candidate.candidateRetrievalId,
|
|
14116
|
+
corpusGroupKey: resolveIncidentCorpusGroupKey(candidate.groupKey),
|
|
13704
14117
|
groupKey: candidate.groupKey,
|
|
13705
14118
|
id: key,
|
|
13706
14119
|
kind,
|
|
@@ -13726,6 +14139,7 @@ var ragChat = (config) => {
|
|
|
13726
14139
|
nextByKey.set(key, {
|
|
13727
14140
|
baselineRetrievalId: handoff.targetBaselineRetrievalId,
|
|
13728
14141
|
candidateRetrievalId: handoff.candidateRetrievalId,
|
|
14142
|
+
corpusGroupKey: handoff.corpusGroupKey,
|
|
13729
14143
|
groupKey: handoff.groupKey,
|
|
13730
14144
|
id: key,
|
|
13731
14145
|
kind: "handoff_stale",
|
|
@@ -13738,7 +14152,7 @@ var ragChat = (config) => {
|
|
|
13738
14152
|
});
|
|
13739
14153
|
}
|
|
13740
14154
|
for (const incident of nextByKey.values()) {
|
|
13741
|
-
const matchingIncidents = existing.filter((entry) => entry.groupKey === incident.groupKey && entry.kind === incident.kind && (entry.targetRolloutLabel ?? undefined) === (incident.targetRolloutLabel ?? undefined)).sort((left, right) => right.triggeredAt - left.triggeredAt);
|
|
14155
|
+
const matchingIncidents = existing.filter((entry) => entry.corpusGroupKey === incident.corpusGroupKey && entry.groupKey === incident.groupKey && entry.kind === incident.kind && (entry.targetRolloutLabel ?? undefined) === (incident.targetRolloutLabel ?? undefined)).sort((left, right) => right.triggeredAt - left.triggeredAt);
|
|
13742
14156
|
const openIncident = matchingIncidents.find((entry) => entry.status === "open");
|
|
13743
14157
|
const latestMatchingIncident = matchingIncidents[0];
|
|
13744
14158
|
if (!openIncident) {
|
|
@@ -15151,6 +15565,7 @@ var ragChat = (config) => {
|
|
|
15151
15565
|
const acknowledged = getStringProperty(queryInput, "acknowledged");
|
|
15152
15566
|
const targetRolloutLabel = getStringProperty(queryInput, "targetRolloutLabel");
|
|
15153
15567
|
const incidents = await loadRAGRetrievalReleaseIncidents({
|
|
15568
|
+
corpusGroupKey: getStringProperty(queryInput, "corpusGroupKey"),
|
|
15154
15569
|
groupKey: getStringProperty(queryInput, "groupKey"),
|
|
15155
15570
|
limit: getIntegerLikeProperty(queryInput, "limit"),
|
|
15156
15571
|
severity: severity === "warning" || severity === "critical" ? severity : undefined,
|
|
@@ -15801,6 +16216,7 @@ var ragChat = (config) => {
|
|
|
15801
16216
|
});
|
|
15802
16217
|
return {
|
|
15803
16218
|
incidents: await loadRAGRetrievalReleaseIncidents({
|
|
16219
|
+
corpusGroupKey: incident.corpusGroupKey,
|
|
15804
16220
|
groupKey: incident.groupKey,
|
|
15805
16221
|
limit: 20,
|
|
15806
16222
|
store: config.retrievalReleaseIncidentStore
|
|
@@ -15844,6 +16260,7 @@ var ragChat = (config) => {
|
|
|
15844
16260
|
});
|
|
15845
16261
|
return {
|
|
15846
16262
|
incidents: await loadRAGRetrievalReleaseIncidents({
|
|
16263
|
+
corpusGroupKey: incident.corpusGroupKey,
|
|
15847
16264
|
groupKey: incident.groupKey,
|
|
15848
16265
|
limit: 20,
|
|
15849
16266
|
store: config.retrievalReleaseIncidentStore
|
|
@@ -15887,6 +16304,7 @@ var ragChat = (config) => {
|
|
|
15887
16304
|
});
|
|
15888
16305
|
return {
|
|
15889
16306
|
incidents: await loadRAGRetrievalReleaseIncidents({
|
|
16307
|
+
corpusGroupKey: incident.corpusGroupKey,
|
|
15890
16308
|
groupKey: incident.groupKey,
|
|
15891
16309
|
limit: 20,
|
|
15892
16310
|
store: config.retrievalReleaseIncidentStore
|
|
@@ -23860,5 +24278,5 @@ export {
|
|
|
23860
24278
|
aiChat
|
|
23861
24279
|
};
|
|
23862
24280
|
|
|
23863
|
-
//# debugId=
|
|
24281
|
+
//# debugId=23520EDE705830A964756E2164756E21
|
|
23864
24282
|
//# sourceMappingURL=index.js.map
|