@absolutejs/absolute 0.19.0-beta.618 → 0.19.0-beta.619
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/client/index.js +96 -22
- package/dist/ai/client/index.js.map +3 -3
- package/dist/ai/client/ui.js +96 -22
- package/dist/ai/client/ui.js.map +3 -3
- package/dist/ai/index.js +473 -82
- package/dist/ai/index.js.map +7 -7
- package/dist/ai/rag/ui.js +96 -22
- package/dist/ai/rag/ui.js.map +3 -3
- package/dist/ai-client/angular/ai/index.js +95 -21
- package/dist/ai-client/react/ai/index.js +95 -21
- package/dist/ai-client/vue/ai/index.js +95 -21
- package/dist/angular/ai/index.js +96 -22
- package/dist/angular/ai/index.js.map +3 -3
- package/dist/angular/index.js +2 -2
- package/dist/angular/index.js.map +1 -1
- package/dist/angular/server.js +2 -2
- package/dist/angular/server.js.map +1 -1
- package/dist/build.js +2 -2
- package/dist/build.js.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/dist/react/ai/index.js +96 -22
- package/dist/react/ai/index.js.map +3 -3
- package/dist/src/vue/ai/useRAG.d.ts +4 -4
- package/dist/src/vue/ai/useRAGChunkPreview.d.ts +2 -2
- package/dist/src/vue/ai/useRAGSearch.d.ts +2 -2
- package/dist/svelte/ai/index.js +96 -22
- package/dist/svelte/ai/index.js.map +3 -3
- package/dist/types/ai.d.ts +2 -2
- package/dist/vue/ai/index.js +96 -22
- package/dist/vue/ai/index.js.map +3 -3
- package/package.json +7 -7
package/dist/ai/index.js
CHANGED
|
@@ -4237,6 +4237,25 @@ var buildContextLabel2 = (metadata) => {
|
|
|
4237
4237
|
if (!metadata) {
|
|
4238
4238
|
return;
|
|
4239
4239
|
}
|
|
4240
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4241
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4242
|
+
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4243
|
+
const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
|
|
4244
|
+
if (pdfTextKind === "table_like" && sectionTitle) {
|
|
4245
|
+
return `PDF table block ${sectionTitle}`;
|
|
4246
|
+
}
|
|
4247
|
+
if (pdfTextKind === "paragraph" && sectionTitle) {
|
|
4248
|
+
return `PDF text block ${sectionTitle}`;
|
|
4249
|
+
}
|
|
4250
|
+
if (officeBlockKind === "table" && sectionTitle) {
|
|
4251
|
+
return `Office table block ${sectionTitle}`;
|
|
4252
|
+
}
|
|
4253
|
+
if (officeBlockKind === "list" && sectionTitle) {
|
|
4254
|
+
return `Office list block ${sectionTitle}`;
|
|
4255
|
+
}
|
|
4256
|
+
if (officeBlockKind === "paragraph" && sectionTitle) {
|
|
4257
|
+
return `Office paragraph block ${sectionTitle}`;
|
|
4258
|
+
}
|
|
4240
4259
|
const emailKind = getContextString2(metadata.emailKind);
|
|
4241
4260
|
if (emailKind === "attachment") {
|
|
4242
4261
|
return "Attachment evidence";
|
|
@@ -4273,8 +4292,6 @@ var buildContextLabel2 = (metadata) => {
|
|
|
4273
4292
|
if (speaker) {
|
|
4274
4293
|
return `Speaker ${speaker}`;
|
|
4275
4294
|
}
|
|
4276
|
-
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4277
|
-
const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
|
|
4278
4295
|
if (sectionTitle) {
|
|
4279
4296
|
return `Section ${sectionTitle}`;
|
|
4280
4297
|
}
|
|
@@ -4284,11 +4301,21 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4284
4301
|
if (!metadata) {
|
|
4285
4302
|
return;
|
|
4286
4303
|
}
|
|
4304
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4305
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4306
|
+
const pdfBlockNumber = getContextNumber2(metadata.pdfBlockNumber);
|
|
4307
|
+
const officeBlockNumber = getContextNumber2(metadata.officeBlockNumber);
|
|
4287
4308
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
4288
4309
|
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
4289
4310
|
if (page && region) {
|
|
4290
4311
|
return `Page ${page} \xB7 Region ${region}`;
|
|
4291
4312
|
}
|
|
4313
|
+
if (page && pdfBlockNumber && pdfTextKind === "table_like") {
|
|
4314
|
+
return `Page ${page} \xB7 Table Block ${pdfBlockNumber}`;
|
|
4315
|
+
}
|
|
4316
|
+
if (page && pdfBlockNumber) {
|
|
4317
|
+
return `Page ${page} \xB7 Text Block ${pdfBlockNumber}`;
|
|
4318
|
+
}
|
|
4292
4319
|
if (page) {
|
|
4293
4320
|
return `Page ${page}`;
|
|
4294
4321
|
}
|
|
@@ -4317,6 +4344,15 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4317
4344
|
if (mediaStart) {
|
|
4318
4345
|
return `Timestamp ${mediaStart}`;
|
|
4319
4346
|
}
|
|
4347
|
+
if (officeBlockNumber && officeBlockKind === "table") {
|
|
4348
|
+
return `Office table block ${officeBlockNumber}`;
|
|
4349
|
+
}
|
|
4350
|
+
if (officeBlockNumber && officeBlockKind === "list") {
|
|
4351
|
+
return `Office list block ${officeBlockNumber}`;
|
|
4352
|
+
}
|
|
4353
|
+
if (officeBlockNumber && officeBlockKind === "paragraph") {
|
|
4354
|
+
return `Office paragraph block ${officeBlockNumber}`;
|
|
4355
|
+
}
|
|
4320
4356
|
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4321
4357
|
if (sectionPath.length > 0) {
|
|
4322
4358
|
return `Section ${sectionPath.join(" > ")}`;
|
|
@@ -4334,12 +4370,16 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4334
4370
|
const mediaKind = getContextString2(metadata.mediaKind);
|
|
4335
4371
|
const transcriptSource = getContextString2(metadata.transcriptSource);
|
|
4336
4372
|
const pdfTextMode = getContextString2(metadata.pdfTextMode);
|
|
4373
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4374
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4337
4375
|
const ocrEngine = getContextString2(metadata.ocrEngine);
|
|
4338
4376
|
const extractorRegistryMatch = getContextString2(metadata.extractorRegistryMatch);
|
|
4339
4377
|
const chunkingProfile = getContextString2(metadata.chunkingProfile);
|
|
4340
4378
|
const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
|
|
4341
4379
|
const labels = [
|
|
4342
4380
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
4381
|
+
pdfTextKind === "table_like" ? "PDF table block" : pdfTextKind === "paragraph" ? "PDF text block" : "",
|
|
4382
|
+
officeBlockKind ? `Office ${officeBlockKind}` : "",
|
|
4343
4383
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
4344
4384
|
extractorRegistryMatch ? `Extractor ${extractorRegistryMatch}` : "",
|
|
4345
4385
|
chunkingProfile ? `Chunking ${chunkingProfile}` : "",
|
|
@@ -4375,7 +4415,7 @@ var buildRAGChunkStructure = (metadata) => {
|
|
|
4375
4415
|
return;
|
|
4376
4416
|
}
|
|
4377
4417
|
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : undefined;
|
|
4378
|
-
const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
|
|
4418
|
+
const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "office_block" || metadata.sectionKind === "pdf_block" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
|
|
4379
4419
|
const section = {
|
|
4380
4420
|
depth: getContextNumber2(metadata.sectionDepth),
|
|
4381
4421
|
kind: sectionKind,
|
|
@@ -4695,7 +4735,7 @@ var buildRAGSourceSummaries = (sources) => {
|
|
|
4695
4735
|
const citationReferenceMap = buildRAGCitationReferenceMap(citations);
|
|
4696
4736
|
return sourceGroups.map((group) => {
|
|
4697
4737
|
const groupCitations = citations.filter((citation) => group.chunks.some((chunk) => chunk.chunkId === citation.chunkId));
|
|
4698
|
-
const leadChunk = group.chunks
|
|
4738
|
+
const leadChunk = getPreferredSourceLeadChunk(group.chunks);
|
|
4699
4739
|
const excerpts = leadChunk ? buildRAGChunkExcerpts(group.chunks, leadChunk.chunkId) : undefined;
|
|
4700
4740
|
const structure = leadChunk?.structure ?? buildRAGChunkStructure(leadChunk?.metadata);
|
|
4701
4741
|
const excerptSelection = buildRAGExcerptSelection(excerpts, structure);
|
|
@@ -4723,13 +4763,45 @@ var getSectionPathFromSource = (source) => {
|
|
|
4723
4763
|
const path = source.structure?.section?.path ?? (Array.isArray(source.metadata?.sectionPath) ? source.metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : []);
|
|
4724
4764
|
return path.length > 0 ? path : undefined;
|
|
4725
4765
|
};
|
|
4766
|
+
var isBlockAwareContextLabel = (value) => typeof value === "string" && (value.startsWith("PDF ") || value.startsWith("Office "));
|
|
4767
|
+
var getStructuredSectionScoreWeight = (metadata) => {
|
|
4768
|
+
if (!metadata) {
|
|
4769
|
+
return 1;
|
|
4770
|
+
}
|
|
4771
|
+
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4772
|
+
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4773
|
+
const sectionKind = getContextString2(metadata.sectionKind);
|
|
4774
|
+
if (pdfTextKind === "table_like") {
|
|
4775
|
+
return 1.28;
|
|
4776
|
+
}
|
|
4777
|
+
if (officeBlockKind === "table" || officeBlockKind === "list") {
|
|
4778
|
+
return 1.24;
|
|
4779
|
+
}
|
|
4780
|
+
if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
|
|
4781
|
+
return 1.12;
|
|
4782
|
+
}
|
|
4783
|
+
return 1;
|
|
4784
|
+
};
|
|
4785
|
+
var getStructuredSourceLeadScore = (source) => source.score * getStructuredSectionScoreWeight(source.metadata);
|
|
4786
|
+
var getPreferredSourceLeadChunk = (chunks) => chunks.slice().sort((left, right) => {
|
|
4787
|
+
const leftWeightedScore = getStructuredSourceLeadScore(left);
|
|
4788
|
+
const rightWeightedScore = getStructuredSourceLeadScore(right);
|
|
4789
|
+
if (rightWeightedScore !== leftWeightedScore) {
|
|
4790
|
+
return rightWeightedScore - leftWeightedScore;
|
|
4791
|
+
}
|
|
4792
|
+
if (right.score !== left.score) {
|
|
4793
|
+
return right.score - left.score;
|
|
4794
|
+
}
|
|
4795
|
+
return left.chunkId.localeCompare(right.chunkId);
|
|
4796
|
+
})[0];
|
|
4726
4797
|
var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
4727
|
-
const totalScore = sources.reduce((sum, source) => sum + source.score, 0);
|
|
4798
|
+
const totalScore = sources.reduce((sum, source) => sum + source.score * getStructuredSectionScoreWeight(source.metadata), 0);
|
|
4728
4799
|
if (sources.length === 0 || totalScore <= 0) {
|
|
4729
4800
|
return [];
|
|
4730
4801
|
}
|
|
4731
4802
|
const sections = new Map;
|
|
4732
4803
|
for (const source of sources) {
|
|
4804
|
+
const structuredScore = source.score * getStructuredSectionScoreWeight(source.metadata);
|
|
4733
4805
|
const path = getSectionPathFromSource(source);
|
|
4734
4806
|
if (!path) {
|
|
4735
4807
|
continue;
|
|
@@ -4761,7 +4833,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4761
4833
|
sourceSet: new Set(source.source ? [source.source] : []),
|
|
4762
4834
|
topChunkId: source.chunkId,
|
|
4763
4835
|
topSource: source.source,
|
|
4764
|
-
totalScore:
|
|
4836
|
+
totalScore: structuredScore,
|
|
4765
4837
|
transformedHits,
|
|
4766
4838
|
variantHits,
|
|
4767
4839
|
vectorHits
|
|
@@ -4769,7 +4841,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4769
4841
|
continue;
|
|
4770
4842
|
}
|
|
4771
4843
|
existing.count += 1;
|
|
4772
|
-
existing.totalScore +=
|
|
4844
|
+
existing.totalScore += structuredScore;
|
|
4773
4845
|
if (source.source) {
|
|
4774
4846
|
existing.sourceSet.add(source.source);
|
|
4775
4847
|
}
|
|
@@ -4797,6 +4869,8 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4797
4869
|
const parentTotal = siblingPool.reduce((sum, entry) => sum + entry.totalScore, 0);
|
|
4798
4870
|
const scoreShare = section.totalScore / totalScore;
|
|
4799
4871
|
const parentShare = parentTotal > 0 ? section.totalScore / parentTotal : undefined;
|
|
4872
|
+
const topChunk = sources.find((source) => source.chunkId === section.topChunkId);
|
|
4873
|
+
const topContextLabel = topChunk?.labels?.contextLabel ?? buildContextLabel2(topChunk?.metadata);
|
|
4800
4874
|
const parentDistribution = parentTotal > 0 ? siblingPool.map((entry) => ({
|
|
4801
4875
|
count: entry.count,
|
|
4802
4876
|
isActive: entry.key === section.key,
|
|
@@ -4922,6 +4996,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
|
|
|
4922
4996
|
reasons.push("concentrated_evidence");
|
|
4923
4997
|
}
|
|
4924
4998
|
const summaryParts = [
|
|
4999
|
+
isBlockAwareContextLabel(topContextLabel) ? topContextLabel : "",
|
|
4925
5000
|
`${section.count} hit${section.count === 1 ? "" : "s"}`,
|
|
4926
5001
|
`${(scoreShare * 100).toFixed(0)}% score share`,
|
|
4927
5002
|
`vector ${section.vectorHits} \xB7 lexical ${section.lexicalHits} \xB7 hybrid ${section.hybridHits}`,
|
|
@@ -5133,22 +5208,21 @@ var updateSourceGroup = (groups, source) => {
|
|
|
5133
5208
|
groups.set(key, buildSourceGroup(source, key));
|
|
5134
5209
|
return;
|
|
5135
5210
|
}
|
|
5136
|
-
|
|
5137
|
-
existing.bestScore = source.score;
|
|
5138
|
-
existing.label = buildSourceLabel2(source);
|
|
5139
|
-
existing.labels = source.labels ?? buildRAGSourceLabels({
|
|
5140
|
-
metadata: source.metadata,
|
|
5141
|
-
source: source.source,
|
|
5142
|
-
title: source.title
|
|
5143
|
-
});
|
|
5144
|
-
existing.structure = source.structure ?? buildRAGChunkStructure(source.metadata);
|
|
5145
|
-
existing.source = source.source;
|
|
5146
|
-
existing.title = source.title;
|
|
5147
|
-
} else {
|
|
5148
|
-
existing.bestScore = Math.max(existing.bestScore, source.score);
|
|
5149
|
-
}
|
|
5211
|
+
existing.bestScore = Math.max(existing.bestScore, source.score);
|
|
5150
5212
|
existing.count += 1;
|
|
5151
5213
|
existing.chunks.push(source);
|
|
5214
|
+
const leadChunk = getPreferredSourceLeadChunk(existing.chunks);
|
|
5215
|
+
if (leadChunk) {
|
|
5216
|
+
existing.label = buildSourceLabel2(leadChunk);
|
|
5217
|
+
existing.labels = leadChunk.labels ?? buildRAGSourceLabels({
|
|
5218
|
+
metadata: leadChunk.metadata,
|
|
5219
|
+
source: leadChunk.source,
|
|
5220
|
+
title: leadChunk.title
|
|
5221
|
+
});
|
|
5222
|
+
existing.structure = leadChunk.structure ?? buildRAGChunkStructure(leadChunk.metadata);
|
|
5223
|
+
existing.source = leadChunk.source;
|
|
5224
|
+
existing.title = leadChunk.title;
|
|
5225
|
+
}
|
|
5152
5226
|
};
|
|
5153
5227
|
var getLatestAssistantMessage = (messages) => {
|
|
5154
5228
|
for (let index = messages.length - 1;index >= 0; index -= 1) {
|
|
@@ -8485,6 +8559,55 @@ var scoreLoosePhraseMatch2 = (query, text) => {
|
|
|
8485
8559
|
}
|
|
8486
8560
|
return 0;
|
|
8487
8561
|
};
|
|
8562
|
+
var queryHasAnyToken = (queryTokens, candidates) => candidates.some((candidate) => queryTokens.includes(candidate));
|
|
8563
|
+
var scoreStructuredEvidenceMatch = (queryTokens, result) => {
|
|
8564
|
+
const metadata = result.metadata ?? {};
|
|
8565
|
+
const pdfTextKind = typeof metadata.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
|
|
8566
|
+
const officeBlockKind = typeof metadata.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
|
|
8567
|
+
const hasBlockMetadata = typeof metadata.pdfBlockNumber === "number" || typeof metadata.officeBlockNumber === "number";
|
|
8568
|
+
let score = 0;
|
|
8569
|
+
if (hasBlockMetadata) {
|
|
8570
|
+
score += 0.12;
|
|
8571
|
+
}
|
|
8572
|
+
if (pdfTextKind === "table_like" && queryHasAnyToken(queryTokens, [
|
|
8573
|
+
"table",
|
|
8574
|
+
"row",
|
|
8575
|
+
"rows",
|
|
8576
|
+
"column",
|
|
8577
|
+
"columns",
|
|
8578
|
+
"spreadsheet",
|
|
8579
|
+
"sheet",
|
|
8580
|
+
"workbook"
|
|
8581
|
+
])) {
|
|
8582
|
+
score += 0.65;
|
|
8583
|
+
}
|
|
8584
|
+
if (officeBlockKind === "table" && queryHasAnyToken(queryTokens, [
|
|
8585
|
+
"table",
|
|
8586
|
+
"row",
|
|
8587
|
+
"rows",
|
|
8588
|
+
"column",
|
|
8589
|
+
"columns",
|
|
8590
|
+
"matrix",
|
|
8591
|
+
"grid"
|
|
8592
|
+
])) {
|
|
8593
|
+
score += 0.55;
|
|
8594
|
+
}
|
|
8595
|
+
if (officeBlockKind === "list" && queryHasAnyToken(queryTokens, [
|
|
8596
|
+
"list",
|
|
8597
|
+
"checklist",
|
|
8598
|
+
"bullet",
|
|
8599
|
+
"bullets",
|
|
8600
|
+
"step",
|
|
8601
|
+
"steps",
|
|
8602
|
+
"task",
|
|
8603
|
+
"tasks",
|
|
8604
|
+
"item",
|
|
8605
|
+
"items"
|
|
8606
|
+
])) {
|
|
8607
|
+
score += 0.55;
|
|
8608
|
+
}
|
|
8609
|
+
return score;
|
|
8610
|
+
};
|
|
8488
8611
|
var scoreHeuristicMatch = ({
|
|
8489
8612
|
query,
|
|
8490
8613
|
queryTokens,
|
|
@@ -8501,7 +8624,8 @@ var scoreHeuristicMatch = ({
|
|
|
8501
8624
|
const exactPhraseBoost = Math.max(normalizeText([result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")).includes(queryTokens.join(" ")) ? 1 : 0, scoreLoosePhraseMatch2(query, [result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")));
|
|
8502
8625
|
const sourcePathBoost = typeof result.source === "string" && queryTokens.some((token) => result.source?.toLowerCase().includes(token)) ? 0.5 : 0;
|
|
8503
8626
|
const metadataBoost = metadataValues.length > 0 ? queryTokens.filter((token) => metadataValues.some((value) => value.toLowerCase().includes(token))).length / queryTokens.length : 0;
|
|
8504
|
-
|
|
8627
|
+
const structuredEvidenceBoost = scoreStructuredEvidenceMatch(queryTokens, result);
|
|
8628
|
+
return result.score + overlapBoost + exactPhraseBoost + sourcePathBoost + metadataBoost + structuredEvidenceBoost;
|
|
8505
8629
|
};
|
|
8506
8630
|
var normalizeText = (value) => tokenize3(value).join(" ");
|
|
8507
8631
|
var applyRAGReranking = async ({
|
|
@@ -8736,32 +8860,59 @@ var stripHtmlTags = (value) => {
|
|
|
8736
8860
|
`).replace(/<li\b[^>]*>/gi, "- ").replace(/<[^>]+>/g, " ");
|
|
8737
8861
|
return decodeHtmlEntities(withoutTags);
|
|
8738
8862
|
};
|
|
8863
|
+
var stripHtmlNoiseBlocks = (value) => value.replace(/<!--[\s\S]*?-->/g, " ").replace(/<(script|style|template|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(hidden|aria-hidden=(['"])true\3)[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<(nav|footer|header|aside|form|dialog)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(nav|menu|footer|header|sidebar|promo|banner|cookie|breadcrumb|share|social|subscribe|newsletter|modal)[^'"]*\2[^>]*>[\s\S]*?<\/\1>/gi, " ");
|
|
8864
|
+
var collectHtmlContentCandidates = (value) => {
|
|
8865
|
+
const patterns = [
|
|
8866
|
+
{
|
|
8867
|
+
contentGroup: 1,
|
|
8868
|
+
pattern: /<main\b[^>]*>([\s\S]*?)<\/main>/gi
|
|
8869
|
+
},
|
|
8870
|
+
{
|
|
8871
|
+
contentGroup: 1,
|
|
8872
|
+
pattern: /<article\b[^>]*>([\s\S]*?)<\/article>/gi
|
|
8873
|
+
},
|
|
8874
|
+
{
|
|
8875
|
+
contentGroup: 3,
|
|
8876
|
+
pattern: /<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/gi
|
|
8877
|
+
},
|
|
8878
|
+
{
|
|
8879
|
+
contentGroup: 4,
|
|
8880
|
+
pattern: /<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(content|article|main|post|body)[^'"]*\2[^>]*>([\s\S]*?)<\/\1>/gi
|
|
8881
|
+
}
|
|
8882
|
+
];
|
|
8883
|
+
const candidates = [];
|
|
8884
|
+
for (const entry of patterns) {
|
|
8885
|
+
for (const match of value.matchAll(entry.pattern)) {
|
|
8886
|
+
const rawCandidate = match[entry.contentGroup];
|
|
8887
|
+
const candidate = typeof rawCandidate === "string" ? rawCandidate : "";
|
|
8888
|
+
if (candidate.trim()) {
|
|
8889
|
+
candidates.push(candidate.trim());
|
|
8890
|
+
}
|
|
8891
|
+
}
|
|
8892
|
+
}
|
|
8893
|
+
return candidates;
|
|
8894
|
+
};
|
|
8739
8895
|
var extractMainHtmlContent = (value) => {
|
|
8740
8896
|
const trimmed = value.trim();
|
|
8741
8897
|
if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
|
|
8742
8898
|
return value;
|
|
8743
8899
|
}
|
|
8744
|
-
const
|
|
8745
|
-
const
|
|
8746
|
-
if (
|
|
8747
|
-
|
|
8748
|
-
|
|
8749
|
-
|
|
8750
|
-
|
|
8751
|
-
|
|
8752
|
-
|
|
8753
|
-
|
|
8754
|
-
`);
|
|
8755
|
-
}
|
|
8756
|
-
const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
|
|
8757
|
-
if (roleMainMatch?.[3]) {
|
|
8758
|
-
return roleMainMatch[3];
|
|
8900
|
+
const stripped = stripHtmlNoiseBlocks(trimmed);
|
|
8901
|
+
const candidates = collectHtmlContentCandidates(stripped);
|
|
8902
|
+
if (candidates.length > 0) {
|
|
8903
|
+
const bestCandidate = candidates.map((candidate) => ({
|
|
8904
|
+
candidate,
|
|
8905
|
+
score: stripHtmlTags(candidate).replace(/\s+/g, " ").trim().length
|
|
8906
|
+
})).sort((left, right) => right.score - left.score)[0]?.candidate;
|
|
8907
|
+
if (bestCandidate) {
|
|
8908
|
+
return bestCandidate;
|
|
8909
|
+
}
|
|
8759
8910
|
}
|
|
8760
|
-
const bodyMatch =
|
|
8911
|
+
const bodyMatch = stripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
|
|
8761
8912
|
if (bodyMatch?.[1]) {
|
|
8762
8913
|
return bodyMatch[1];
|
|
8763
8914
|
}
|
|
8764
|
-
return
|
|
8915
|
+
return stripped;
|
|
8765
8916
|
};
|
|
8766
8917
|
var stripHtml = (value) => {
|
|
8767
8918
|
const focused = extractMainHtmlContent(value);
|
|
@@ -8779,6 +8930,93 @@ var stripMarkdown = (value) => {
|
|
|
8779
8930
|
`);
|
|
8780
8931
|
return normalizeWhitespace(stripped);
|
|
8781
8932
|
};
|
|
8933
|
+
var pdfNativeStructureUnits = (metadata) => {
|
|
8934
|
+
const blocks = Array.isArray(metadata?.pdfTextBlocks) ? metadata.pdfTextBlocks : [];
|
|
8935
|
+
const units = [];
|
|
8936
|
+
for (const block of blocks) {
|
|
8937
|
+
if (!block || typeof block !== "object") {
|
|
8938
|
+
continue;
|
|
8939
|
+
}
|
|
8940
|
+
const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
|
|
8941
|
+
if (!text) {
|
|
8942
|
+
continue;
|
|
8943
|
+
}
|
|
8944
|
+
const pageNumber = typeof block.pageNumber === "number" && Number.isFinite(block.pageNumber) ? block.pageNumber : undefined;
|
|
8945
|
+
const pdfBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
|
|
8946
|
+
const pdfTextKind = block.textKind === "table_like" ? "table_like" : "paragraph";
|
|
8947
|
+
const sectionTitle = pageNumber ? pdfTextKind === "table_like" ? `Page ${pageNumber} Table Block` : `Page ${pageNumber} Text Block` : pdfTextKind === "table_like" ? "Table Block" : "Text Block";
|
|
8948
|
+
units.push({
|
|
8949
|
+
pageNumber,
|
|
8950
|
+
pdfBlockNumber,
|
|
8951
|
+
pdfTextKind,
|
|
8952
|
+
preferredChunkUnits: pdfTextKind === "table_like" ? text.split(`
|
|
8953
|
+
`).filter(Boolean) : undefined,
|
|
8954
|
+
sectionDepth: 1,
|
|
8955
|
+
sectionKind: "pdf_block",
|
|
8956
|
+
sectionPath: [sectionTitle],
|
|
8957
|
+
sectionTitle,
|
|
8958
|
+
text
|
|
8959
|
+
});
|
|
8960
|
+
}
|
|
8961
|
+
return units;
|
|
8962
|
+
};
|
|
8963
|
+
var officeNativeStructureUnits = (metadata) => {
|
|
8964
|
+
const blocks = Array.isArray(metadata?.officeBlocks) ? metadata.officeBlocks : [];
|
|
8965
|
+
const units = [];
|
|
8966
|
+
const headingStack = [];
|
|
8967
|
+
const decorateOfficeSectionText = (text, sectionTitle) => {
|
|
8968
|
+
if (!sectionTitle || text.includes(sectionTitle)) {
|
|
8969
|
+
return text;
|
|
8970
|
+
}
|
|
8971
|
+
return normalizeWhitespace(`${sectionTitle}
|
|
8972
|
+
${text}`);
|
|
8973
|
+
};
|
|
8974
|
+
for (const [index, block] of blocks.entries()) {
|
|
8975
|
+
if (!block || typeof block !== "object") {
|
|
8976
|
+
continue;
|
|
8977
|
+
}
|
|
8978
|
+
const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
|
|
8979
|
+
if (!text) {
|
|
8980
|
+
continue;
|
|
8981
|
+
}
|
|
8982
|
+
const officeBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
|
|
8983
|
+
const officeBlockKind = block.blockKind === "title" || block.blockKind === "heading" || block.blockKind === "list" || block.blockKind === "table" ? block.blockKind : "paragraph";
|
|
8984
|
+
const headingLevel = typeof block.headingLevel === "number" && Number.isFinite(block.headingLevel) ? block.headingLevel : undefined;
|
|
8985
|
+
if (officeBlockKind === "title" || officeBlockKind === "heading") {
|
|
8986
|
+
const level = officeBlockKind === "title" ? 1 : headingLevel ?? 1;
|
|
8987
|
+
headingStack[level - 1] = text;
|
|
8988
|
+
headingStack.length = level;
|
|
8989
|
+
const nextBlock = blocks[index + 1];
|
|
8990
|
+
const nextKind = nextBlock && typeof nextBlock === "object" ? nextBlock.blockKind : undefined;
|
|
8991
|
+
if (nextKind === "title" || nextKind === "heading" || nextKind === "list" || nextKind === "table" || !nextBlock) {
|
|
8992
|
+
units.push({
|
|
8993
|
+
officeBlockKind,
|
|
8994
|
+
officeBlockNumber,
|
|
8995
|
+
sectionDepth: headingStack.length,
|
|
8996
|
+
sectionKind: "office_heading",
|
|
8997
|
+
sectionPath: [...headingStack],
|
|
8998
|
+
sectionTitle: text,
|
|
8999
|
+
text
|
|
9000
|
+
});
|
|
9001
|
+
}
|
|
9002
|
+
continue;
|
|
9003
|
+
}
|
|
9004
|
+
const sectionPath = headingStack.length > 0 ? [...headingStack] : undefined;
|
|
9005
|
+
const sectionTitle = sectionPath?.at(-1);
|
|
9006
|
+
units.push({
|
|
9007
|
+
officeBlockKind,
|
|
9008
|
+
officeBlockNumber,
|
|
9009
|
+
preferredChunkUnits: officeBlockKind === "table" ? text.split(`
|
|
9010
|
+
`).filter(Boolean) : undefined,
|
|
9011
|
+
sectionDepth: sectionPath?.length,
|
|
9012
|
+
sectionKind: officeBlockKind === "paragraph" ? "office_heading" : "office_block",
|
|
9013
|
+
sectionPath,
|
|
9014
|
+
sectionTitle,
|
|
9015
|
+
text: officeBlockKind === "paragraph" ? decorateOfficeSectionText(text, sectionTitle) : text
|
|
9016
|
+
});
|
|
9017
|
+
}
|
|
9018
|
+
return units;
|
|
9019
|
+
};
|
|
8782
9020
|
var markdownStructureUnits = (value) => {
|
|
8783
9021
|
const lines = value.replace(/\r\n?/g, `
|
|
8784
9022
|
`).split(`
|
|
@@ -9122,6 +9360,7 @@ var appendPdfLineBreak = (parts) => {
|
|
|
9122
9360
|
parts.push(`
|
|
9123
9361
|
`);
|
|
9124
9362
|
};
|
|
9363
|
+
var PDF_CHROME_LINE_MAX_LENGTH = 80;
|
|
9125
9364
|
var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
|
|
9126
9365
|
var extractTextFromPDFTextObject = (value) => {
|
|
9127
9366
|
const parts = [];
|
|
@@ -9150,19 +9389,84 @@ var extractTextFromPDFTextObject = (value) => {
|
|
|
9150
9389
|
}
|
|
9151
9390
|
return parts.join("");
|
|
9152
9391
|
};
|
|
9153
|
-
var
|
|
9154
|
-
const
|
|
9155
|
-
|
|
9156
|
-
|
|
9157
|
-
|
|
9158
|
-
|
|
9159
|
-
`);
|
|
9160
|
-
|
|
9392
|
+
var buildPDFNativeTextBlock = (text, blockNumber, pageNumber) => {
|
|
9393
|
+
const normalized = normalizeWhitespace(text);
|
|
9394
|
+
if (!normalized) {
|
|
9395
|
+
return;
|
|
9396
|
+
}
|
|
9397
|
+
const lineCount = normalized.split(`
|
|
9398
|
+
`).filter(Boolean).length;
|
|
9399
|
+
const textKind = normalized.includes(" | ") ? "table_like" : "paragraph";
|
|
9400
|
+
return {
|
|
9401
|
+
blockNumber,
|
|
9402
|
+
lineCount,
|
|
9403
|
+
pageNumber,
|
|
9404
|
+
text: normalized,
|
|
9405
|
+
textKind
|
|
9406
|
+
};
|
|
9407
|
+
};
|
|
9408
|
+
var isLikelyPDFPageLabel = (value) => /^page\s+\d+(?:\s+of\s+\d+)?$/i.test(value.trim());
|
|
9409
|
+
var suppressRepeatedPDFChrome = (blocks) => {
|
|
9410
|
+
const linePages = new Map;
|
|
9411
|
+
for (const block of blocks) {
|
|
9412
|
+
for (const line of block.text.split(`
|
|
9413
|
+
`)) {
|
|
9414
|
+
const normalized = normalizeWhitespace(line);
|
|
9415
|
+
if (!normalized || normalized.length > PDF_CHROME_LINE_MAX_LENGTH) {
|
|
9416
|
+
continue;
|
|
9417
|
+
}
|
|
9418
|
+
const pages = linePages.get(normalized) ?? new Set;
|
|
9419
|
+
pages.add(block.pageNumber);
|
|
9420
|
+
linePages.set(normalized, pages);
|
|
9421
|
+
}
|
|
9422
|
+
}
|
|
9423
|
+
return blocks.map((block) => {
|
|
9424
|
+
const keptLines = block.text.split(`
|
|
9425
|
+
`).map((line) => normalizeWhitespace(line)).filter((line) => {
|
|
9426
|
+
if (!line) {
|
|
9427
|
+
return false;
|
|
9428
|
+
}
|
|
9429
|
+
if (isLikelyPDFPageLabel(line)) {
|
|
9430
|
+
return false;
|
|
9431
|
+
}
|
|
9432
|
+
const repeatedPages = linePages.get(line);
|
|
9433
|
+
if (line.length <= PDF_CHROME_LINE_MAX_LENGTH && repeatedPages && repeatedPages.size > 1) {
|
|
9434
|
+
return false;
|
|
9435
|
+
}
|
|
9436
|
+
return true;
|
|
9437
|
+
});
|
|
9438
|
+
const text = normalizeWhitespace(keptLines.join(`
|
|
9439
|
+
`));
|
|
9440
|
+
if (!text) {
|
|
9441
|
+
return;
|
|
9442
|
+
}
|
|
9443
|
+
return buildPDFNativeTextBlock(text, block.blockNumber, block.pageNumber);
|
|
9444
|
+
}).filter((value) => Boolean(value));
|
|
9161
9445
|
};
|
|
9162
|
-
var
|
|
9446
|
+
var extractNativePDFText = (data) => {
|
|
9163
9447
|
const raw = Buffer.from(data).toString("latin1");
|
|
9164
9448
|
const count = [...raw.matchAll(/\/Type\s*\/Page\b/g)].length;
|
|
9165
|
-
|
|
9449
|
+
const pageCount = count > 0 ? count : 1;
|
|
9450
|
+
const pageMarkers = [...raw.matchAll(/\/Type\s*\/Page\b/g)].map((match) => match.index ?? raw.length);
|
|
9451
|
+
const blocks = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match, index) => {
|
|
9452
|
+
const blockText = extractTextFromPDFTextObject(match[1] ?? "");
|
|
9453
|
+
const objectEnd = (match.index ?? 0) + (match[0]?.length ?? 0);
|
|
9454
|
+
const pageIndex = pageMarkers.findIndex((marker) => marker >= objectEnd);
|
|
9455
|
+
const pageNumber = pageIndex >= 0 ? pageIndex + 1 : pageCount;
|
|
9456
|
+
return buildPDFNativeTextBlock(blockText, index + 1, pageNumber);
|
|
9457
|
+
}).filter((value) => Boolean(value));
|
|
9458
|
+
const visibleBlocks = suppressRepeatedPDFChrome(blocks);
|
|
9459
|
+
const fallbackText = [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
|
|
9460
|
+
`);
|
|
9461
|
+
const text = visibleBlocks.length > 0 ? normalizeWhitespace(visibleBlocks.map((block) => block.text).join(`
|
|
9462
|
+
|
|
9463
|
+
`)) : normalizeWhitespace(fallbackText);
|
|
9464
|
+
return {
|
|
9465
|
+
pageCount,
|
|
9466
|
+
text,
|
|
9467
|
+
textBlockCount: visibleBlocks.length,
|
|
9468
|
+
textBlocks: visibleBlocks
|
|
9469
|
+
};
|
|
9166
9470
|
};
|
|
9167
9471
|
var readUInt16LE = (data, offset) => data[offset] | data[offset + 1] << 8;
|
|
9168
9472
|
var readUInt32LE = (data, offset) => (data[offset] | data[offset + 1] << 8 | data[offset + 2] << 16 | data[offset + 3] << 24) >>> 0;
|
|
@@ -9249,35 +9553,64 @@ var decodeGzipEntries = (data, input) => {
|
|
|
9249
9553
|
var extractXmlText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ")));
|
|
9250
9554
|
var extractOfficeParagraphText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<w:tab\b[^>]*\/>/gi, "\t").replace(/<w:br\b[^>]*\/>/gi, `
|
|
9251
9555
|
`).replace(/<[^>]+>/g, " ")));
|
|
9252
|
-
var
|
|
9556
|
+
var officeDocumentBlocks = (entries) => {
|
|
9253
9557
|
const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
|
|
9254
9558
|
if (!documentEntry) {
|
|
9255
9559
|
return [];
|
|
9256
9560
|
}
|
|
9257
9561
|
const xml = decodeUtf8(documentEntry.data);
|
|
9258
|
-
const
|
|
9259
|
-
|
|
9260
|
-
|
|
9261
|
-
|
|
9562
|
+
const bodyMatch = xml.match(/<w:body\b[^>]*>([\s\S]*?)<\/w:body>/i);
|
|
9563
|
+
const body = bodyMatch?.[1] ?? xml;
|
|
9564
|
+
const blocks = [];
|
|
9565
|
+
const blockPattern = /<(w:p|w:tbl)\b[\s\S]*?<\/\1>/g;
|
|
9566
|
+
for (const match of body.matchAll(blockPattern)) {
|
|
9567
|
+
const blockXml = match[0] ?? "";
|
|
9568
|
+
if (blockXml.startsWith("<w:tbl")) {
|
|
9569
|
+
const rows = [...blockXml.matchAll(/<w:tr\b[\s\S]*?<\/w:tr>/g)].map((rowMatch, rowIndex) => {
|
|
9570
|
+
const cells = [
|
|
9571
|
+
...(rowMatch[0] ?? "").matchAll(/<w:tc\b[\s\S]*?<\/w:tc>/g)
|
|
9572
|
+
].map((cellMatch) => extractOfficeParagraphText(cellMatch[0] ?? "")).filter(Boolean);
|
|
9573
|
+
if (cells.length === 0) {
|
|
9574
|
+
return "";
|
|
9575
|
+
}
|
|
9576
|
+
return `Row ${rowIndex + 1}. ${cells.map((cell, cellIndex) => `${String.fromCharCode(65 + cellIndex)}: ${cell}`).join(" | ")}`;
|
|
9577
|
+
}).filter(Boolean);
|
|
9578
|
+
const text2 = normalizeWhitespace(rows.join(`
|
|
9579
|
+
`));
|
|
9580
|
+
if (!text2) {
|
|
9581
|
+
continue;
|
|
9582
|
+
}
|
|
9583
|
+
blocks.push({
|
|
9584
|
+
blockKind: "table",
|
|
9585
|
+
blockNumber: blocks.length + 1,
|
|
9586
|
+
text: text2
|
|
9587
|
+
});
|
|
9588
|
+
continue;
|
|
9589
|
+
}
|
|
9590
|
+
const text = extractOfficeParagraphText(blockXml);
|
|
9262
9591
|
if (!text) {
|
|
9263
|
-
|
|
9592
|
+
continue;
|
|
9264
9593
|
}
|
|
9265
|
-
const styleMatch =
|
|
9594
|
+
const styleMatch = blockXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
|
|
9266
9595
|
const style = (styleMatch?.[1] ?? "").toLowerCase();
|
|
9267
|
-
if (style === "title") {
|
|
9268
|
-
return text;
|
|
9269
|
-
}
|
|
9270
9596
|
const headingMatch = style.match(/^heading([1-6])$/);
|
|
9271
|
-
|
|
9272
|
-
|
|
9273
|
-
}
|
|
9274
|
-
|
|
9275
|
-
|
|
9597
|
+
const isListParagraph = /<w:numPr\b/i.test(blockXml) || style.includes("list") || style.includes("bullet");
|
|
9598
|
+
const blockKind = style === "title" ? "title" : headingMatch ? "heading" : isListParagraph ? "list" : "paragraph";
|
|
9599
|
+
const decoratedText = blockKind === "list" && !/^[-*]\s/.test(text) ? `- ${text}` : text;
|
|
9600
|
+
blocks.push({
|
|
9601
|
+
blockKind,
|
|
9602
|
+
blockNumber: blocks.length + 1,
|
|
9603
|
+
headingLevel: headingMatch ? Number.parseInt(headingMatch[1] ?? "1", 10) : undefined,
|
|
9604
|
+
style: style || undefined,
|
|
9605
|
+
text: decoratedText
|
|
9606
|
+
});
|
|
9607
|
+
}
|
|
9608
|
+
return blocks;
|
|
9276
9609
|
};
|
|
9277
9610
|
var officeDocumentText = (entries) => {
|
|
9278
|
-
const
|
|
9279
|
-
if (
|
|
9280
|
-
return normalizeWhitespace(
|
|
9611
|
+
const blocks = officeDocumentBlocks(entries);
|
|
9612
|
+
if (blocks.length > 0) {
|
|
9613
|
+
return normalizeWhitespace(blocks.map((block) => block.text).join(`
|
|
9281
9614
|
|
|
9282
9615
|
`));
|
|
9283
9616
|
}
|
|
@@ -9288,11 +9621,7 @@ var officeDocumentText = (entries) => {
|
|
|
9288
9621
|
return extractXmlText(decodeUtf8(documentEntry.data));
|
|
9289
9622
|
};
|
|
9290
9623
|
var officeDocumentSectionCount = (entries) => {
|
|
9291
|
-
const
|
|
9292
|
-
if (!documentEntry) {
|
|
9293
|
-
return;
|
|
9294
|
-
}
|
|
9295
|
-
const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
|
|
9624
|
+
const count = officeDocumentBlocks(entries).length;
|
|
9296
9625
|
return count > 0 ? count : undefined;
|
|
9297
9626
|
};
|
|
9298
9627
|
var spreadsheetSharedStrings = (entries) => entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
|
|
@@ -9822,8 +10151,10 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
9822
10151
|
let officeMetadata = {};
|
|
9823
10152
|
let structuredDocuments = [];
|
|
9824
10153
|
if (extension === ".docx" || extension === ".odt") {
|
|
10154
|
+
const officeBlocks = officeDocumentBlocks(entries);
|
|
9825
10155
|
text = officeDocumentText(entries);
|
|
9826
10156
|
officeMetadata = {
|
|
10157
|
+
officeBlocks,
|
|
9827
10158
|
sectionCount: officeDocumentSectionCount(entries)
|
|
9828
10159
|
};
|
|
9829
10160
|
} else if (extension === ".xlsx" || extension === ".ods") {
|
|
@@ -10013,8 +10344,8 @@ var createPDFFileExtractor = () => ({
|
|
|
10013
10344
|
name: "absolute_pdf",
|
|
10014
10345
|
supports: pdfExtractorSupports,
|
|
10015
10346
|
extract: (input) => {
|
|
10016
|
-
const
|
|
10017
|
-
if (!text) {
|
|
10347
|
+
const extracted = extractNativePDFText(input.data);
|
|
10348
|
+
if (!extracted.text) {
|
|
10018
10349
|
throw new Error("AbsoluteJS could not extract readable text from this PDF. Supply a custom extractor for scanned or image-only PDFs.");
|
|
10019
10350
|
}
|
|
10020
10351
|
return {
|
|
@@ -10024,10 +10355,12 @@ var createPDFFileExtractor = () => ({
|
|
|
10024
10355
|
metadata: {
|
|
10025
10356
|
...input.metadata ?? {},
|
|
10026
10357
|
fileKind: "pdf",
|
|
10027
|
-
pageCount:
|
|
10358
|
+
pageCount: extracted.pageCount,
|
|
10359
|
+
pdfTextBlockCount: extracted.textBlockCount,
|
|
10360
|
+
pdfTextBlocks: extracted.textBlocks
|
|
10028
10361
|
},
|
|
10029
10362
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
10030
|
-
text,
|
|
10363
|
+
text: extracted.text,
|
|
10031
10364
|
title: input.title
|
|
10032
10365
|
};
|
|
10033
10366
|
}
|
|
@@ -10052,7 +10385,8 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
10052
10385
|
name: `absolute_pdf_ocr:${options.provider.name}`,
|
|
10053
10386
|
supports: pdfExtractorSupports,
|
|
10054
10387
|
extract: async (input) => {
|
|
10055
|
-
const
|
|
10388
|
+
const extracted = extractNativePDFText(input.data);
|
|
10389
|
+
const nativeText = extracted.text;
|
|
10056
10390
|
const minLength = options.minExtractedTextLength ?? 80;
|
|
10057
10391
|
const shouldUseNativeText = !options.alwaysOCR && nativeText.length >= minLength;
|
|
10058
10392
|
if (shouldUseNativeText) {
|
|
@@ -10063,7 +10397,9 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
10063
10397
|
metadata: {
|
|
10064
10398
|
...input.metadata ?? {},
|
|
10065
10399
|
fileKind: "pdf",
|
|
10066
|
-
pageCount:
|
|
10400
|
+
pageCount: extracted.pageCount,
|
|
10401
|
+
pdfTextBlockCount: extracted.textBlockCount,
|
|
10402
|
+
pdfTextBlocks: extracted.textBlocks,
|
|
10067
10403
|
pdfTextMode: "native"
|
|
10068
10404
|
},
|
|
10069
10405
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
@@ -10078,7 +10414,7 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
10078
10414
|
const baseMetadata = {
|
|
10079
10415
|
...ocrMetadata(ocr),
|
|
10080
10416
|
fileKind: "pdf",
|
|
10081
|
-
pageCount:
|
|
10417
|
+
pageCount: extracted.pageCount,
|
|
10082
10418
|
pdfTextMode: "ocr"
|
|
10083
10419
|
};
|
|
10084
10420
|
const summaryDocument = {
|
|
@@ -10251,6 +10587,18 @@ var sourceAwareUnits = (document, format, normalizedText) => {
|
|
|
10251
10587
|
}
|
|
10252
10588
|
case "text":
|
|
10253
10589
|
default:
|
|
10590
|
+
if (document.metadata?.fileKind === "office") {
|
|
10591
|
+
const sections = officeNativeStructureUnits(document.metadata);
|
|
10592
|
+
if (sections.length > 0) {
|
|
10593
|
+
return sections;
|
|
10594
|
+
}
|
|
10595
|
+
}
|
|
10596
|
+
if (document.metadata?.fileKind === "pdf") {
|
|
10597
|
+
const sections = pdfNativeStructureUnits(document.metadata);
|
|
10598
|
+
if (sections.length > 0) {
|
|
10599
|
+
return sections;
|
|
10600
|
+
}
|
|
10601
|
+
}
|
|
10254
10602
|
if (document.metadata?.sourceNativeKind === "spreadsheet_sheet") {
|
|
10255
10603
|
return spreadsheetStructureUnits(normalizedText, document.metadata);
|
|
10256
10604
|
}
|
|
@@ -10574,6 +10922,11 @@ var prepareRAGDocument = (document, defaultChunking, chunkingRegistry) => {
|
|
|
10574
10922
|
...sectionTitle ? { sectionTitle } : {},
|
|
10575
10923
|
...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
|
|
10576
10924
|
...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
|
|
10925
|
+
...typeof entry.pageNumber === "number" ? { pageNumber: entry.pageNumber } : {},
|
|
10926
|
+
...typeof entry.officeBlockNumber === "number" ? { officeBlockNumber: entry.officeBlockNumber } : {},
|
|
10927
|
+
...entry.officeBlockKind ? { officeBlockKind: entry.officeBlockKind } : {},
|
|
10928
|
+
...typeof entry.pdfBlockNumber === "number" ? { pdfBlockNumber: entry.pdfBlockNumber } : {},
|
|
10929
|
+
...entry.pdfTextKind ? { pdfTextKind: entry.pdfTextKind } : {},
|
|
10577
10930
|
...entry.sectionKind ? { sectionKind: entry.sectionKind } : {},
|
|
10578
10931
|
...sectionChunkId ? { sectionChunkId } : {},
|
|
10579
10932
|
...sectionChunkId && sectionChunkIndex >= 0 ? {
|
|
@@ -10962,9 +11315,25 @@ var annotateRetrievalChannels = (input) => {
|
|
|
10962
11315
|
};
|
|
10963
11316
|
});
|
|
10964
11317
|
};
|
|
11318
|
+
var getStructuredSectionScoreWeight2 = (metadata) => {
|
|
11319
|
+
const pdfTextKind = typeof metadata?.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
|
|
11320
|
+
const officeBlockKind = typeof metadata?.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
|
|
11321
|
+
const sectionKind = typeof metadata?.sectionKind === "string" ? metadata.sectionKind : undefined;
|
|
11322
|
+
if (pdfTextKind === "table_like") {
|
|
11323
|
+
return 1.28;
|
|
11324
|
+
}
|
|
11325
|
+
if (officeBlockKind === "table" || officeBlockKind === "list") {
|
|
11326
|
+
return 1.24;
|
|
11327
|
+
}
|
|
11328
|
+
if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
|
|
11329
|
+
return 1.12;
|
|
11330
|
+
}
|
|
11331
|
+
return 1;
|
|
11332
|
+
};
|
|
10965
11333
|
var buildTraceSectionCounts = (results) => {
|
|
10966
11334
|
const sections = new Map;
|
|
10967
11335
|
for (const result of results) {
|
|
11336
|
+
const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
|
|
10968
11337
|
const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
|
|
10969
11338
|
if (path.length === 0) {
|
|
10970
11339
|
continue;
|
|
@@ -10991,6 +11360,7 @@ var buildTraceSectionCounts = (results) => {
|
|
|
10991
11360
|
var buildTraceSectionScores = (results) => {
|
|
10992
11361
|
const sections = new Map;
|
|
10993
11362
|
for (const result of results) {
|
|
11363
|
+
const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
|
|
10994
11364
|
const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
|
|
10995
11365
|
if (path.length === 0) {
|
|
10996
11366
|
continue;
|
|
@@ -10998,13 +11368,13 @@ var buildTraceSectionScores = (results) => {
|
|
|
10998
11368
|
const key = path.join(" > ");
|
|
10999
11369
|
const existing = sections.get(key);
|
|
11000
11370
|
if (existing) {
|
|
11001
|
-
existing.totalScore +=
|
|
11371
|
+
existing.totalScore += weightedScore;
|
|
11002
11372
|
continue;
|
|
11003
11373
|
}
|
|
11004
11374
|
sections.set(key, {
|
|
11005
11375
|
key,
|
|
11006
11376
|
label: path.at(-1) ?? key,
|
|
11007
|
-
totalScore:
|
|
11377
|
+
totalScore: weightedScore
|
|
11008
11378
|
});
|
|
11009
11379
|
}
|
|
11010
11380
|
return [...sections.values()].sort((left, right) => {
|
|
@@ -11443,11 +11813,32 @@ var renderSourceLabels = (input) => {
|
|
|
11443
11813
|
].filter((row) => row.length > 0);
|
|
11444
11814
|
return rows.length > 0 ? `<ul class="rag-source-labels">${rows.join("")}</ul>` : "";
|
|
11445
11815
|
};
|
|
11816
|
+
var formatStructureKindLabel = (kind) => {
|
|
11817
|
+
switch (kind) {
|
|
11818
|
+
case "markdown_heading":
|
|
11819
|
+
return "Markdown heading";
|
|
11820
|
+
case "html_heading":
|
|
11821
|
+
return "HTML heading";
|
|
11822
|
+
case "office_heading":
|
|
11823
|
+
return "Office heading";
|
|
11824
|
+
case "office_block":
|
|
11825
|
+
return "Office block";
|
|
11826
|
+
case "pdf_block":
|
|
11827
|
+
return "PDF block";
|
|
11828
|
+
case "spreadsheet_rows":
|
|
11829
|
+
return "Spreadsheet rows";
|
|
11830
|
+
case "presentation_slide":
|
|
11831
|
+
return "Presentation slide";
|
|
11832
|
+
default:
|
|
11833
|
+
return;
|
|
11834
|
+
}
|
|
11835
|
+
};
|
|
11446
11836
|
var renderChunkStructure = (structure) => {
|
|
11447
11837
|
if (!structure) {
|
|
11448
11838
|
return "";
|
|
11449
11839
|
}
|
|
11450
11840
|
const rows = [
|
|
11841
|
+
structure.section?.kind ? `<li><strong>Kind</strong> ${escapeHtml2(formatStructureKindLabel(structure.section.kind) ?? structure.section.kind)}</li>` : "",
|
|
11451
11842
|
structure.section?.title ? `<li><strong>Section</strong> ${escapeHtml2(structure.section.title)}</li>` : "",
|
|
11452
11843
|
structure.section?.path && structure.section.path.length > 1 ? `<li><strong>Section path</strong> ${escapeHtml2(structure.section.path.join(" > "))}</li>` : "",
|
|
11453
11844
|
typeof structure.sequence?.sectionChunkIndex === "number" && typeof structure.sequence?.sectionChunkCount === "number" ? `<li><strong>Section chunk</strong> ${structure.sequence.sectionChunkIndex + 1} of ${structure.sequence.sectionChunkCount}</li>` : "",
|
|
@@ -23887,5 +24278,5 @@ export {
|
|
|
23887
24278
|
aiChat
|
|
23888
24279
|
};
|
|
23889
24280
|
|
|
23890
|
-
//# debugId=
|
|
24281
|
+
//# debugId=23520EDE705830A964756E2164756E21
|
|
23891
24282
|
//# sourceMappingURL=index.js.map
|