@memberjunction/content-autotagging 5.22.0 → 5.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -18
- package/dist/CloudStorage/generic/CloudStorageBase.d.ts +2 -2
- package/dist/CloudStorage/generic/CloudStorageBase.d.ts.map +1 -1
- package/dist/CloudStorage/generic/CloudStorageBase.js +2 -2
- package/dist/CloudStorage/generic/CloudStorageBase.js.map +1 -1
- package/dist/CloudStorage/index.d.ts +5 -0
- package/dist/CloudStorage/index.d.ts.map +1 -1
- package/dist/CloudStorage/index.js +5 -0
- package/dist/CloudStorage/index.js.map +1 -1
- package/dist/CloudStorage/providers/AutotagCloudStorage.d.ts +61 -0
- package/dist/CloudStorage/providers/AutotagCloudStorage.d.ts.map +1 -0
- package/dist/CloudStorage/providers/AutotagCloudStorage.js +256 -0
- package/dist/CloudStorage/providers/AutotagCloudStorage.js.map +1 -0
- package/dist/Core/generic/AutotagBase.d.ts +9 -1
- package/dist/Core/generic/AutotagBase.d.ts.map +1 -1
- package/dist/Core/generic/AutotagBase.js.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.d.ts +397 -15
- package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.js +1362 -128
- package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
- package/dist/Engine/generic/RateLimiter.d.ts +49 -0
- package/dist/Engine/generic/RateLimiter.d.ts.map +1 -0
- package/dist/Engine/generic/RateLimiter.js +98 -0
- package/dist/Engine/generic/RateLimiter.js.map +1 -0
- package/dist/Engine/index.d.ts +1 -0
- package/dist/Engine/index.d.ts.map +1 -1
- package/dist/Engine/index.js +1 -0
- package/dist/Engine/index.js.map +1 -1
- package/dist/Entity/generic/AutotagEntity.d.ts +64 -15
- package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
- package/dist/Entity/generic/AutotagEntity.js +362 -83
- package/dist/Entity/generic/AutotagEntity.js.map +1 -1
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts +2 -2
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts.map +1 -1
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js +2 -2
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js.map +1 -1
- package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts +47 -16
- package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts.map +1 -1
- package/dist/RSSFeed/generic/AutotagRSSFeed.js +239 -121
- package/dist/RSSFeed/generic/AutotagRSSFeed.js.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.d.ts +2 -2
- package/dist/Websites/generic/AutotagWebsite.d.ts.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.js +2 -2
- package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/package.json +16 -8
|
@@ -4,9 +4,11 @@ var __decorate = (this && this.__decorate) || function (decorators, target, key,
|
|
|
4
4
|
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
|
|
5
5
|
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
6
6
|
};
|
|
7
|
+
var AutotagBaseEngine_1;
|
|
7
8
|
import { BaseEngine, Metadata, RunView, LogError, LogStatus } from '@memberjunction/core';
|
|
8
|
-
import { MJGlobal, UUIDsEqual, RegisterClass } from '@memberjunction/global';
|
|
9
|
+
import { MJGlobal, UUIDsEqual, NormalizeUUID, RegisterClass } from '@memberjunction/global';
|
|
9
10
|
import { ContentSourceTypeParams } from './content.types.js';
|
|
11
|
+
import { RateLimiter } from './RateLimiter.js';
|
|
10
12
|
import pdfParse from 'pdf-parse';
|
|
11
13
|
import officeparser from 'officeparser';
|
|
12
14
|
import * as fs from 'fs';
|
|
@@ -15,9 +17,16 @@ import { toZonedTime } from 'date-fns-tz';
|
|
|
15
17
|
import axios from 'axios';
|
|
16
18
|
import * as cheerio from 'cheerio';
|
|
17
19
|
import crypto from 'crypto';
|
|
18
|
-
import {
|
|
20
|
+
import { BaseEmbeddings, GetAIAPIKey } from '@memberjunction/ai';
|
|
19
21
|
import { AIEngine } from '@memberjunction/aiengine';
|
|
22
|
+
import { AIPromptRunner, AIModelRunner } from '@memberjunction/ai-prompts';
|
|
23
|
+
import { AIPromptParams } from '@memberjunction/ai-core-plus';
|
|
20
24
|
import { TextChunker } from '@memberjunction/ai-vectors';
|
|
25
|
+
import { VectorDBBase } from '@memberjunction/ai-vectordb';
|
|
26
|
+
import { TagEngine } from '@memberjunction/tag-engine';
|
|
27
|
+
import { KnowledgeHubMetadataEngine } from '@memberjunction/core-entities';
|
|
28
|
+
/** Default batch size for vectorization processing */
|
|
29
|
+
const DEFAULT_VECTORIZE_BATCH_SIZE = 20;
|
|
21
30
|
/**
|
|
22
31
|
* Core engine for content autotagging. Extends BaseEngine to cache content metadata
|
|
23
32
|
* (types, source types, file types, attributes) at startup. Uses AIEngine via composition
|
|
@@ -26,43 +35,58 @@ import { TextChunker } from '@memberjunction/ai-vectors';
|
|
|
26
35
|
let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
27
36
|
constructor() {
|
|
28
37
|
super(...arguments);
|
|
29
|
-
// Cached metadata — loaded by BaseEngine.Config()
|
|
30
|
-
this._ContentTypes = [];
|
|
31
|
-
this._ContentSourceTypes = [];
|
|
32
|
-
this._ContentFileTypes = [];
|
|
38
|
+
// Cached metadata unique to this engine — loaded by BaseEngine.Config()
|
|
33
39
|
this._ContentTypeAttributes = [];
|
|
34
40
|
this._ContentSourceTypeParams = [];
|
|
41
|
+
/**
|
|
42
|
+
* Optional taxonomy JSON string to inject into the autotagging prompt.
|
|
43
|
+
* Set by the caller (e.g., AutotagEntity) before calling ExtractTextAndProcessWithLLM.
|
|
44
|
+
* When set, the prompt template receives an `existingTaxonomy` variable containing
|
|
45
|
+
* the JSON tree of existing tags so the LLM can prefer existing tags.
|
|
46
|
+
*/
|
|
47
|
+
this.TaxonomyContext = null;
|
|
48
|
+
/**
|
|
49
|
+
* When true, skip checksum comparison and reprocess all content items
|
|
50
|
+
* even if their content hasn't changed. Useful when changing embedding models,
|
|
51
|
+
* LLM models, or vector databases.
|
|
52
|
+
*/
|
|
53
|
+
this.ForceReprocess = false;
|
|
54
|
+
/** Rate limiter for LLM (tagging) API calls */
|
|
55
|
+
this.LLMRateLimiter = new RateLimiter({ RequestsPerMinute: 60, TokensPerMinute: 100000, Name: 'LLM' });
|
|
56
|
+
/** Rate limiter for embedding API calls */
|
|
57
|
+
this.EmbeddingRateLimiter = new RateLimiter({ RequestsPerMinute: 300, TokensPerMinute: 500000, Name: 'Embedding' });
|
|
58
|
+
/** Rate limiter for vector DB API calls */
|
|
59
|
+
this.VectorDBRateLimiter = new RateLimiter({ RequestsPerMinute: 200, Name: 'VectorDB' });
|
|
60
|
+
/**
|
|
61
|
+
* Optional callback invoked after each ContentItemTag is saved, enabling the
|
|
62
|
+
* tag taxonomy bridge (ContentItemTag → Tag + TaggedItem). Set by providers
|
|
63
|
+
* like AutotagEntity that want to link free-text tags to formal taxonomy entries.
|
|
64
|
+
*
|
|
65
|
+
* Parameters: (contentItemTag: MJContentItemTagEntity, parentTag: string | null, contextUser: UserInfo)
|
|
66
|
+
*/
|
|
67
|
+
this.OnContentItemTagSaved = null;
|
|
35
68
|
}
|
|
69
|
+
static { AutotagBaseEngine_1 = this; }
|
|
36
70
|
static get Instance() {
|
|
37
71
|
return super.getInstance();
|
|
38
72
|
}
|
|
39
|
-
/**
|
|
40
|
-
get
|
|
41
|
-
/** All content
|
|
42
|
-
get
|
|
43
|
-
/** All content
|
|
44
|
-
get
|
|
73
|
+
/** Shortcut to KnowledgeHubMetadataEngine */
|
|
74
|
+
get khEngine() { return KnowledgeHubMetadataEngine.Instance; }
|
|
75
|
+
/** All content types — delegated to KnowledgeHubMetadataEngine */
|
|
76
|
+
get ContentTypes() { return this.khEngine.ContentTypes; }
|
|
77
|
+
/** All content source types — delegated to KnowledgeHubMetadataEngine */
|
|
78
|
+
get ContentSourceTypes() { return this.khEngine.ContentSourceTypes; }
|
|
79
|
+
/** All content file types — delegated to KnowledgeHubMetadataEngine */
|
|
80
|
+
get ContentFileTypes() { return this.khEngine.ContentFileTypes; }
|
|
45
81
|
/** All content type attributes, cached at startup */
|
|
46
82
|
get ContentTypeAttributes() { return this._ContentTypeAttributes; }
|
|
47
83
|
/** All content source type params, cached at startup */
|
|
48
84
|
get ContentSourceTypeParams() { return this._ContentSourceTypeParams; }
|
|
49
85
|
async Config(forceRefresh, contextUser, provider) {
|
|
86
|
+
// Content Types, Content Source Types, and Content File Types are delegated to
|
|
87
|
+
// KnowledgeHubMetadataEngine (avoid redundant loading). Only cache entities unique to this engine.
|
|
88
|
+
await KnowledgeHubMetadataEngine.Instance.Config(forceRefresh, contextUser);
|
|
50
89
|
const configs = [
|
|
51
|
-
{
|
|
52
|
-
Type: 'entity',
|
|
53
|
-
EntityName: 'MJ: Content Types',
|
|
54
|
-
PropertyName: '_ContentTypes',
|
|
55
|
-
},
|
|
56
|
-
{
|
|
57
|
-
Type: 'entity',
|
|
58
|
-
EntityName: 'MJ: Content Source Types',
|
|
59
|
-
PropertyName: '_ContentSourceTypes',
|
|
60
|
-
},
|
|
61
|
-
{
|
|
62
|
-
Type: 'entity',
|
|
63
|
-
EntityName: 'MJ: Content File Types',
|
|
64
|
-
PropertyName: '_ContentFileTypes',
|
|
65
|
-
},
|
|
66
90
|
{
|
|
67
91
|
Type: 'entity',
|
|
68
92
|
EntityName: 'MJ: Content Type Attributes',
|
|
@@ -79,27 +103,98 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
79
103
|
}
|
|
80
104
|
/**
|
|
81
105
|
* Given a list of content items, extract the text from each and process with LLM for tagging.
|
|
106
|
+
* Items are processed in configurable batches with controlled concurrency within each batch.
|
|
107
|
+
*/
|
|
108
|
+
/**
|
|
109
|
+
* Process content items through the LLM tagging pipeline with production-grade
|
|
110
|
+
* batch management: cursor-based resume, pause/cancel support, rate limiting,
|
|
111
|
+
* and circuit breaker. Each batch checkpoints progress so interrupted runs
|
|
112
|
+
* can be resumed from where they left off.
|
|
113
|
+
*
|
|
114
|
+
* @param contentItems - items to process
|
|
115
|
+
* @param contextUser - current user for permissions/audit
|
|
116
|
+
* @param processRun - optional ContentProcessRun entity for checkpoint tracking
|
|
117
|
+
* @param config - optional pipeline configuration for rate limits, thresholds
|
|
118
|
+
* @param onProgress - optional callback for UI progress updates
|
|
82
119
|
*/
|
|
83
|
-
async ExtractTextAndProcessWithLLM(contentItems, contextUser) {
|
|
120
|
+
async ExtractTextAndProcessWithLLM(contentItems, contextUser, processRun, config, onProgress) {
|
|
84
121
|
if (!contentItems || contentItems.length === 0) {
|
|
85
|
-
LogStatus('No content items to process');
|
|
122
|
+
LogStatus('[Autotag] No content items to process');
|
|
86
123
|
return;
|
|
87
124
|
}
|
|
88
|
-
const
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
125
|
+
const batchSize = config?.Pipeline?.BatchSize ?? DEFAULT_VECTORIZE_BATCH_SIZE;
|
|
126
|
+
const errorThreshold = config?.Pipeline?.ErrorThresholdPercent ?? 20;
|
|
127
|
+
const delayMs = config?.Pipeline?.DelayBetweenBatchesMs ?? 0;
|
|
128
|
+
// Resume from cursor if available
|
|
129
|
+
const resumeOffset = processRun?.LastProcessedOffset ?? 0;
|
|
130
|
+
const itemsToProcess = resumeOffset > 0
|
|
131
|
+
? contentItems.slice(resumeOffset)
|
|
132
|
+
: contentItems;
|
|
133
|
+
if (resumeOffset > 0) {
|
|
134
|
+
LogStatus(`[Autotag] Resuming from offset ${resumeOffset} (${itemsToProcess.length} remaining of ${contentItems.length})`);
|
|
135
|
+
}
|
|
136
|
+
LogStatus(`[Autotag] Processing ${itemsToProcess.length} items in batches of ${batchSize}`);
|
|
137
|
+
let totalSuccesses = 0;
|
|
138
|
+
let totalFailures = 0;
|
|
139
|
+
let totalProcessed = resumeOffset;
|
|
140
|
+
for (let i = 0; i < itemsToProcess.length; i += batchSize) {
|
|
141
|
+
const batch = itemsToProcess.slice(i, i + batchSize);
|
|
142
|
+
const batchNum = Math.floor(i / batchSize) + 1;
|
|
143
|
+
let batchOk = 0;
|
|
144
|
+
let batchFail = 0;
|
|
145
|
+
// Rate limit before each batch of parallel LLM calls
|
|
146
|
+
await this.LLMRateLimiter.Acquire();
|
|
147
|
+
const batchPromises = batch.map(async (contentItem) => {
|
|
148
|
+
try {
|
|
149
|
+
const processingParams = await this.buildProcessingParams(contentItem, contextUser);
|
|
150
|
+
await this.ProcessContentItemText(processingParams, contextUser);
|
|
151
|
+
batchOk++;
|
|
152
|
+
}
|
|
153
|
+
catch (e) {
|
|
154
|
+
LogError(`[Autotag] Failed to process item ${contentItem.ID}: ${e instanceof Error ? e.message : String(e)}`);
|
|
155
|
+
batchFail++;
|
|
156
|
+
}
|
|
157
|
+
});
|
|
158
|
+
await Promise.all(batchPromises);
|
|
159
|
+
totalSuccesses += batchOk;
|
|
160
|
+
totalFailures += batchFail;
|
|
161
|
+
totalProcessed += batch.length;
|
|
162
|
+
onProgress?.(totalProcessed, contentItems.length);
|
|
163
|
+
LogStatus(`[Autotag] Batch ${batchNum}: ${batchOk}/${batch.length} ok (${totalProcessed}/${contentItems.length} total, ${totalFailures} errors)`);
|
|
164
|
+
// Checkpoint: update cursor and check for cancellation
|
|
165
|
+
if (processRun) {
|
|
166
|
+
const shouldContinue = await this.UpdateBatchCursor(processRun, totalProcessed, totalFailures);
|
|
167
|
+
if (!shouldContinue) {
|
|
168
|
+
LogStatus(`[Autotag] Pipeline paused/cancelled at offset ${totalProcessed}`);
|
|
169
|
+
return;
|
|
170
|
+
}
|
|
96
171
|
}
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
172
|
+
// Circuit breaker: halt if error rate exceeds threshold
|
|
173
|
+
if (totalProcessed > 0 && totalFailures > 0) {
|
|
174
|
+
const errorRate = (totalFailures / totalProcessed) * 100;
|
|
175
|
+
if (errorRate > errorThreshold) {
|
|
176
|
+
LogError(`[Autotag] Circuit breaker triggered: error rate ${errorRate.toFixed(1)}% exceeds threshold ${errorThreshold}%`);
|
|
177
|
+
if (processRun) {
|
|
178
|
+
processRun.ErrorMessage = `Auto-paused: error rate ${errorRate.toFixed(1)}% exceeded ${errorThreshold}% threshold`;
|
|
179
|
+
await this.CompleteBatchedProcessRun(processRun, 'Failed', processRun.ErrorMessage);
|
|
180
|
+
}
|
|
181
|
+
return;
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
// Optional delay between batches (throttling)
|
|
185
|
+
if (delayMs > 0 && i + batchSize < itemsToProcess.length) {
|
|
186
|
+
await new Promise(resolve => setTimeout(resolve, delayMs));
|
|
100
187
|
}
|
|
101
188
|
}
|
|
189
|
+
LogStatus(`[Autotag] LLM tagging complete: ${totalSuccesses} succeeded, ${totalFailures} failed of ${contentItems.length}`);
|
|
190
|
+
// Post-pipeline hook: recompute tag co-occurrence if TagCoOccurrenceEngine is available
|
|
191
|
+
await this.recomputeCoOccurrenceIfAvailable(contextUser);
|
|
192
|
+
// Legacy process run tracking (for backward compatibility)
|
|
193
|
+
const processRunParams = new ProcessRunParams();
|
|
194
|
+
processRunParams.sourceID = contentItems[0].ContentSourceID;
|
|
195
|
+
processRunParams.startTime = processRun?.StartTime ?? new Date();
|
|
102
196
|
processRunParams.endTime = new Date();
|
|
197
|
+
processRunParams.numItemsProcessed = contentItems.length;
|
|
103
198
|
await this.saveProcessRun(processRunParams, contextUser);
|
|
104
199
|
}
|
|
105
200
|
/**
|
|
@@ -116,101 +211,362 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
116
211
|
processingParams.minTags = minTags;
|
|
117
212
|
processingParams.maxTags = maxTags;
|
|
118
213
|
processingParams.contentItemID = contentItem.ID;
|
|
214
|
+
LogStatus(`[Autotag] Built params for "${contentItem.Name}" — text length: ${processingParams.text?.length ?? 0}, modelID: ${modelID || 'default'}, tags: ${minTags}-${maxTags}`);
|
|
119
215
|
return processingParams;
|
|
120
216
|
}
|
|
121
217
|
/**
|
|
122
218
|
* Process a content item's text with the LLM and save results.
|
|
123
219
|
*/
|
|
124
220
|
async ProcessContentItemText(params, contextUser) {
|
|
125
|
-
|
|
126
|
-
await this.
|
|
221
|
+
// A8: Update tagging status to Processing
|
|
222
|
+
await this.updateContentItemTaggingStatus(params.contentItemID, 'Processing', contextUser);
|
|
223
|
+
try {
|
|
224
|
+
const LLMResults = await this.promptAndRetrieveResultsFromLLM(params, contextUser);
|
|
225
|
+
await this.saveLLMResults(LLMResults, contextUser);
|
|
226
|
+
// A8: Update tagging status to Complete
|
|
227
|
+
await this.updateContentItemTaggingStatus(params.contentItemID, 'Complete', contextUser);
|
|
228
|
+
}
|
|
229
|
+
catch (e) {
|
|
230
|
+
await this.updateContentItemTaggingStatus(params.contentItemID, 'Failed', contextUser);
|
|
231
|
+
throw e;
|
|
232
|
+
}
|
|
127
233
|
}
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
234
|
+
/** Update embedding status for a batch of content items */
|
|
235
|
+
async updateEmbeddingStatusBatch(items, status, contextUser, embeddingModelID) {
|
|
236
|
+
for (const item of items) {
|
|
237
|
+
try {
|
|
238
|
+
item.EmbeddingStatus = status;
|
|
239
|
+
if (status === 'Complete') {
|
|
240
|
+
item.LastEmbeddedAt = new Date();
|
|
241
|
+
if (embeddingModelID)
|
|
242
|
+
item.EmbeddingModelID = embeddingModelID;
|
|
243
|
+
}
|
|
244
|
+
await item.Save();
|
|
245
|
+
}
|
|
246
|
+
catch {
|
|
247
|
+
// Non-critical
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
/** Update a content item's TaggingStatus and LastTaggedAt */
|
|
252
|
+
async updateContentItemTaggingStatus(contentItemID, status, contextUser) {
|
|
253
|
+
try {
|
|
254
|
+
const md = new Metadata();
|
|
255
|
+
const item = await md.GetEntityObject('MJ: Content Items', contextUser);
|
|
256
|
+
await item.Load(contentItemID);
|
|
257
|
+
item.TaggingStatus = status;
|
|
258
|
+
if (status === 'Complete') {
|
|
259
|
+
item.LastTaggedAt = new Date();
|
|
260
|
+
}
|
|
261
|
+
await item.Save();
|
|
132
262
|
}
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
263
|
+
catch {
|
|
264
|
+
// Non-critical — don't fail the pipeline for a status update
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
/**
|
|
268
|
+
* Resolves the "Content Autotagging" prompt from the AIEngine cache.
|
|
269
|
+
* Throws if the prompt is not found or not active.
|
|
270
|
+
*/
|
|
271
|
+
getAutotagPrompt() {
|
|
272
|
+
const prompt = AIEngine.Instance.Prompts.find(p => p.Name === 'Content Autotagging');
|
|
273
|
+
if (!prompt) {
|
|
274
|
+
throw new Error('AI Prompt "Content Autotagging" not found. Ensure the prompt metadata has been synced to the database.');
|
|
275
|
+
}
|
|
276
|
+
if (prompt.Status !== 'Active') {
|
|
277
|
+
throw new Error(`AI Prompt "Content Autotagging" is not active (Status: ${prompt.Status})`);
|
|
278
|
+
}
|
|
279
|
+
return prompt;
|
|
280
|
+
}
|
|
281
|
+
/**
|
|
282
|
+
* Initialize the taxonomy bridge so ALL content source types (RSS, Entity, Website, etc.)
|
|
283
|
+
* automatically create formal Tag + TaggedItem records from LLM-generated ContentItemTags.
|
|
284
|
+
*
|
|
285
|
+
* This sets up:
|
|
286
|
+
* 1. TagEngine with semantic embeddings for tag matching
|
|
287
|
+
* 2. TaxonomyContext for prompt injection (tells LLM about existing tags)
|
|
288
|
+
* 3. OnContentItemTagSaved callback that bridges ContentItemTag → Tag + TaggedItem
|
|
289
|
+
*
|
|
290
|
+
* Call this ONCE before running any providers. The bridge stays active until
|
|
291
|
+
* CleanupTaxonomyBridge() is called.
|
|
292
|
+
*/
|
|
293
|
+
async InitializeTaxonomyBridge(contextUser) {
|
|
294
|
+
try {
|
|
295
|
+
// TagEngine internally ensures AIEngine is loaded before building embeddings.
|
|
296
|
+
await TagEngine.Instance.Config(false, contextUser);
|
|
297
|
+
LogStatus(`[TaxonomyBridge] TagEngine initialized with ${TagEngine.Instance.Tags.length} existing tags`);
|
|
298
|
+
// Inject taxonomy into prompt context as markdown hierarchy
|
|
299
|
+
// Format: "# RootTag\n## ChildTag\n### GrandChild"
|
|
300
|
+
// LLM returns paths like "RootTag / ChildTag / GrandChild" for unambiguous matching
|
|
301
|
+
if (TagEngine.Instance.Tags.length > 0) {
|
|
302
|
+
const tree = TagEngine.Instance.GetTaxonomyTree();
|
|
303
|
+
this.TaxonomyContext = this.buildTaxonomyMarkdown(tree);
|
|
304
|
+
LogStatus(`[TaxonomyBridge] Taxonomy context injected as markdown (${tree.length} root nodes, ${TagEngine.Instance.Tags.length} total tags)`);
|
|
305
|
+
}
|
|
306
|
+
// Set up the bridge callback — fires after each ContentItemTag is saved
|
|
307
|
+
this.OnContentItemTagSaved = async (contentItemTag, parentTagName, ctxUser) => {
|
|
308
|
+
await this.BridgeContentItemTagToTaxonomy(contentItemTag, parentTagName, ctxUser);
|
|
309
|
+
};
|
|
310
|
+
LogStatus(`[TaxonomyBridge] Bridge callback installed`);
|
|
311
|
+
}
|
|
312
|
+
catch (e) {
|
|
313
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
314
|
+
LogError(`[TaxonomyBridge] Initialization failed, taxonomy features disabled: ${msg}`);
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
/**
|
|
318
|
+
* Clean up the taxonomy bridge after all providers have finished.
|
|
319
|
+
*/
|
|
320
|
+
CleanupTaxonomyBridge() {
|
|
321
|
+
this.TaxonomyContext = null;
|
|
322
|
+
this.OnContentItemTagSaved = null;
|
|
323
|
+
}
|
|
324
|
+
/**
|
|
325
|
+
* Build a markdown-formatted taxonomy for LLM prompt injection.
|
|
326
|
+
* Uses heading levels for hierarchy depth (# for root, ## for child, etc.)
|
|
327
|
+
* so the LLM can return tag paths like "Root / Child / Grandchild".
|
|
328
|
+
*/
|
|
329
|
+
buildTaxonomyMarkdown(tree) {
|
|
330
|
+
const lines = [];
|
|
331
|
+
const renderNode = (node, depth, path) => {
|
|
332
|
+
const prefix = '#'.repeat(Math.min(depth + 1, 6)); // Max 6 heading levels
|
|
333
|
+
const fullPath = path ? `${path} / ${node.Name}` : node.Name;
|
|
334
|
+
lines.push(`${prefix} ${node.Name}`);
|
|
335
|
+
if (node.Children && node.Children.length > 0) {
|
|
336
|
+
for (const child of node.Children) {
|
|
337
|
+
renderNode(child, depth + 1, fullPath);
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
};
|
|
341
|
+
for (const root of tree) {
|
|
342
|
+
renderNode(root, 0, '');
|
|
343
|
+
}
|
|
344
|
+
return lines.join('\n');
|
|
345
|
+
}
|
|
346
|
+
/**
|
|
347
|
+
* Bridge a ContentItemTag to the formal MJ Tag taxonomy.
|
|
348
|
+
* Uses TagEngine.ResolveTag() in auto-grow mode by default.
|
|
349
|
+
*
|
|
350
|
+
* After resolving/creating the formal Tag, also creates a TaggedItem record:
|
|
351
|
+
* - For Entity sources: tags the original entity record (e.g., Products row)
|
|
352
|
+
* - For non-Entity sources (RSS, Website, etc.): tags the ContentItem itself
|
|
353
|
+
*/
|
|
354
|
+
async BridgeContentItemTagToTaxonomy(contentItemTag, parentTagName, contextUser) {
|
|
355
|
+
try {
|
|
356
|
+
// If parent tag is suggested by LLM, resolve it through the mutex too
|
|
357
|
+
// to prevent duplicate parent tags from concurrent batch processing
|
|
358
|
+
if (parentTagName) {
|
|
359
|
+
await TagEngine.Instance.ResolveTag(parentTagName, 0, 'auto-grow', null, 0.80, contextUser);
|
|
360
|
+
}
|
|
361
|
+
// Resolve the tag using auto-grow mode (create if no match)
|
|
362
|
+
const formalTag = await TagEngine.Instance.ResolveTag(contentItemTag.Tag, contentItemTag.Weight, 'auto-grow', null, // no root constraint
|
|
363
|
+
0.80, // similarity threshold — lower to catch plurals/variants like "AI Agent" vs "AI Agents"
|
|
364
|
+
contextUser);
|
|
365
|
+
if (formalTag) {
|
|
366
|
+
// Link ContentItemTag to formal Tag
|
|
367
|
+
contentItemTag.TagID = formalTag.ID;
|
|
368
|
+
await contentItemTag.Save();
|
|
369
|
+
// Create TaggedItem linking the formal Tag to the appropriate entity record
|
|
370
|
+
await this.createTaggedItemFromContentItemTag(contentItemTag, formalTag.ID, contextUser);
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
catch (e) {
|
|
374
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
375
|
+
LogError(`[TaxonomyBridge] Failed for tag "${contentItemTag.Tag}": ${msg}`);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
/**
|
|
379
|
+
* Creates a TaggedItem record linking a formal Tag to the appropriate entity record.
|
|
380
|
+
*
|
|
381
|
+
* For Entity-sourced content items: resolves the EntityRecordDocument to find the
|
|
382
|
+
* original entity (e.g., Products) and record ID, then tags that entity record.
|
|
383
|
+
*
|
|
384
|
+
* For non-Entity-sourced content items (RSS, Website, Cloud Storage): tags the
|
|
385
|
+
* ContentItem itself (EntityID = "MJ: Content Items" entity, RecordID = content item ID).
|
|
386
|
+
*/
|
|
387
|
+
async createTaggedItemFromContentItemTag(contentItemTag, tagID, contextUser) {
|
|
388
|
+
try {
|
|
389
|
+
const md = new Metadata();
|
|
390
|
+
let entityID;
|
|
391
|
+
let recordID;
|
|
392
|
+
// Load the content item to determine source type
|
|
393
|
+
const rv = new RunView();
|
|
394
|
+
const ciResult = await rv.RunView({
|
|
395
|
+
EntityName: 'MJ: Content Items',
|
|
396
|
+
ExtraFilter: `ID='${contentItemTag.ItemID}'`,
|
|
397
|
+
ResultType: 'simple',
|
|
398
|
+
Fields: ['ID', 'EntityRecordDocumentID', 'ContentSourceID'],
|
|
399
|
+
MaxRows: 1,
|
|
400
|
+
}, contextUser);
|
|
401
|
+
if (!ciResult.Success || ciResult.Results.length === 0)
|
|
402
|
+
return;
|
|
403
|
+
const ci = ciResult.Results[0];
|
|
404
|
+
if (ci.EntityRecordDocumentID) {
|
|
405
|
+
// Entity source — resolve to the original entity record
|
|
406
|
+
const erdResult = await rv.RunView({
|
|
407
|
+
EntityName: 'MJ: Entity Record Documents',
|
|
408
|
+
ExtraFilter: `ID='${ci.EntityRecordDocumentID}'`,
|
|
409
|
+
ResultType: 'simple',
|
|
410
|
+
Fields: ['EntityID', 'RecordID'],
|
|
411
|
+
MaxRows: 1,
|
|
412
|
+
}, contextUser);
|
|
413
|
+
if (!erdResult.Success || erdResult.Results.length === 0)
|
|
414
|
+
return;
|
|
415
|
+
entityID = erdResult.Results[0].EntityID;
|
|
416
|
+
recordID = erdResult.Results[0].RecordID;
|
|
417
|
+
}
|
|
418
|
+
else {
|
|
419
|
+
// Non-entity source — tag the ContentItem itself
|
|
420
|
+
const contentItemsEntity = md.Entities.find(e => e.Name === 'MJ: Content Items');
|
|
421
|
+
if (!contentItemsEntity)
|
|
422
|
+
return;
|
|
423
|
+
entityID = contentItemsEntity.ID;
|
|
424
|
+
recordID = contentItemTag.ItemID;
|
|
425
|
+
}
|
|
426
|
+
// Check if this TaggedItem already exists (avoid duplicates)
|
|
427
|
+
const existingResult = await rv.RunView({
|
|
428
|
+
EntityName: 'MJ: Tagged Items',
|
|
429
|
+
ExtraFilter: `TagID='${tagID}' AND EntityID='${entityID}' AND RecordID='${recordID}'`,
|
|
430
|
+
ResultType: 'simple',
|
|
431
|
+
Fields: ['ID'],
|
|
432
|
+
MaxRows: 1,
|
|
433
|
+
}, contextUser);
|
|
434
|
+
if (existingResult.Success && existingResult.Results.length > 0)
|
|
435
|
+
return; // Already exists
|
|
436
|
+
// Create the TaggedItem
|
|
437
|
+
const taggedItem = await md.GetEntityObject('MJ: Tagged Items', contextUser);
|
|
438
|
+
taggedItem.NewRecord();
|
|
439
|
+
taggedItem.TagID = tagID;
|
|
440
|
+
taggedItem.EntityID = entityID;
|
|
441
|
+
taggedItem.RecordID = recordID;
|
|
442
|
+
taggedItem.Weight = contentItemTag.Weight;
|
|
443
|
+
await taggedItem.Save();
|
|
444
|
+
}
|
|
445
|
+
catch (e) {
|
|
446
|
+
// Non-critical — the ContentItemTag is already saved, TaggedItem is supplemental
|
|
447
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
448
|
+
LogError(`[TaxonomyBridge] Failed to create TaggedItem for tag "${contentItemTag.Tag}": ${msg}`);
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
/**
|
|
452
|
+
* Builds template data for the autotagging prompt from processing params and chunk context.
|
|
453
|
+
*/
|
|
454
|
+
buildPromptData(params, chunk, previousResults) {
|
|
455
|
+
const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
|
|
456
|
+
const additionalAttributePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
|
|
457
|
+
const hasPreviousResults = Object.keys(previousResults).length > 0;
|
|
458
|
+
// Check if this source type requires content type validation in the prompt
|
|
459
|
+
const sourceType = this.ContentSourceTypes.find(st => UUIDsEqual(st.ID, params.contentSourceTypeID));
|
|
460
|
+
const sourceConfig = sourceType?.ConfigurationObject;
|
|
461
|
+
const requiresContentType = sourceConfig?.RequiresContentType !== false;
|
|
462
|
+
return {
|
|
463
|
+
contentType: requiresContentType ? this.GetContentTypeName(params.contentTypeID) : undefined,
|
|
464
|
+
contentSourceType,
|
|
465
|
+
minTags: params.minTags,
|
|
466
|
+
maxTags: params.maxTags,
|
|
467
|
+
additionalAttributePrompts,
|
|
468
|
+
existingTaxonomy: this.TaxonomyContext ?? undefined,
|
|
469
|
+
contentText: chunk,
|
|
470
|
+
previousResults: hasPreviousResults ? JSON.stringify(previousResults) : undefined,
|
|
471
|
+
};
|
|
472
|
+
}
|
|
473
|
+
async promptAndRetrieveResultsFromLLM(params, contextUser) {
|
|
474
|
+
await AIEngine.Instance.Config(false, contextUser);
|
|
475
|
+
const prompt = this.getAutotagPrompt();
|
|
476
|
+
const tokenLimit = this.resolveTokenLimit(params.modelID);
|
|
477
|
+
const chunks = this.chunkExtractedText(params.text, tokenLimit);
|
|
478
|
+
if (chunks.length === 0 || (chunks.length === 1 && (!chunks[0] || chunks[0].trim().length === 0))) {
|
|
479
|
+
LogError(`[Autotag] No text to process for item ${params.contentItemID}`);
|
|
480
|
+
return {};
|
|
136
481
|
}
|
|
137
|
-
const chunks = this.chunkExtractedText(params.text, model.InputTokenLimit);
|
|
138
482
|
let LLMResults = {};
|
|
139
483
|
const startTime = new Date();
|
|
140
|
-
for (
|
|
141
|
-
|
|
142
|
-
|
|
484
|
+
for (let ci = 0; ci < chunks.length; ci++) {
|
|
485
|
+
try {
|
|
486
|
+
LLMResults = await this.processChunkWithPromptRunner(prompt, params, chunks[ci], LLMResults, contextUser);
|
|
487
|
+
}
|
|
488
|
+
catch (chunkError) {
|
|
489
|
+
LogError(`[Autotag] Chunk ${ci + 1}/${chunks.length} failed for item ${params.contentItemID}: ${chunkError instanceof Error ? chunkError.message : String(chunkError)}`);
|
|
490
|
+
}
|
|
143
491
|
}
|
|
144
492
|
LLMResults.processStartTime = startTime;
|
|
145
493
|
LLMResults.processEndTime = new Date();
|
|
146
494
|
LLMResults.contentItemID = params.contentItemID;
|
|
147
495
|
return LLMResults;
|
|
148
496
|
}
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
497
|
+
/**
|
|
498
|
+
* Resolves the input token limit for chunking. Uses the model specified by modelID if available,
|
|
499
|
+
* otherwise falls back to a conservative default.
|
|
500
|
+
*/
|
|
501
|
+
resolveTokenLimit(modelID) {
|
|
502
|
+
const DEFAULT_TOKEN_LIMIT = 100000;
|
|
503
|
+
if (modelID) {
|
|
504
|
+
const model = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, modelID));
|
|
505
|
+
if (model) {
|
|
506
|
+
return model.InputTokenLimit;
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
return DEFAULT_TOKEN_LIMIT;
|
|
510
|
+
}
|
|
511
|
+
/**
|
|
512
|
+
* Processes a single text chunk using AIPromptRunner and merges results.
|
|
513
|
+
* Uses the prompt's configured model by default. If ContentType.AIModelID is set,
|
|
514
|
+
* it is passed as a runtime model override via AIPromptParams.override.
|
|
515
|
+
*/
|
|
516
|
+
async processChunkWithPromptRunner(prompt, params, chunk, LLMResults, contextUser) {
|
|
517
|
+
const promptParams = new AIPromptParams();
|
|
518
|
+
promptParams.prompt = prompt;
|
|
519
|
+
promptParams.contextUser = contextUser;
|
|
520
|
+
promptParams.data = this.buildPromptData(params, chunk, LLMResults);
|
|
521
|
+
promptParams.skipValidation = false;
|
|
522
|
+
promptParams.attemptJSONRepair = true;
|
|
523
|
+
promptParams.additionalParameters = { temperature: 0.0 };
|
|
524
|
+
// If the ContentType specifies a preferred AI model, use it as a runtime override
|
|
525
|
+
if (params.modelID) {
|
|
526
|
+
promptParams.override = { modelId: params.modelID };
|
|
162
527
|
}
|
|
163
|
-
|
|
164
|
-
|
|
528
|
+
const runner = new AIPromptRunner();
|
|
529
|
+
const result = await runner.ExecutePrompt(promptParams);
|
|
530
|
+
if (!result.success) {
|
|
531
|
+
LogError(`[Autotag] LLM failed for item ${params.contentItemID}: ${result.errorMessage ?? 'unknown error'}`);
|
|
165
532
|
return LLMResults;
|
|
166
533
|
}
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
534
|
+
// Parse the result — AIPromptRunner may return a raw JSON string or a parsed object
|
|
535
|
+
let chunkResult = null;
|
|
536
|
+
if (typeof result.result === 'string') {
|
|
537
|
+
try {
|
|
538
|
+
chunkResult = JSON.parse(result.result);
|
|
539
|
+
}
|
|
540
|
+
catch {
|
|
541
|
+
LogError(`Failed to parse LLM result as JSON for item ${params.contentItemID}: ${String(result.result).substring(0, 200)}`);
|
|
542
|
+
return LLMResults;
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
else {
|
|
546
|
+
chunkResult = result.result;
|
|
547
|
+
}
|
|
548
|
+
// Merge results from this chunk into the accumulated results
|
|
549
|
+
if (chunkResult) {
|
|
550
|
+
for (const key in chunkResult) {
|
|
551
|
+
const value = chunkResult[key];
|
|
552
|
+
if (value !== null) {
|
|
553
|
+
LLMResults[key] = value;
|
|
554
|
+
}
|
|
171
555
|
}
|
|
172
556
|
}
|
|
173
557
|
return LLMResults;
|
|
174
558
|
}
|
|
175
|
-
async getLLMPrompts(params, chunk, LLMResults, contextUser) {
|
|
176
|
-
const contentType = this.GetContentTypeName(params.contentTypeID);
|
|
177
|
-
const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
|
|
178
|
-
const additionalContentTypePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
|
|
179
|
-
const systemPrompt = `You are a highly skilled text analysis assistant. You have decades of experience and pride yourself on your attention to detail and ability to capture both accurate information, as well as tone and subtext.
|
|
180
|
-
Your task is to accurately extract key information from a provided piece of text based on a series of prompts. You are provided with text that should be a ${contentType}, that has been extracted from a ${contentSourceType}.
|
|
181
|
-
The text MUST be of the type ${contentType} for the subsequent processing.`;
|
|
182
|
-
const userPrompt = `
|
|
183
|
-
If the provided text does not actually appear to be of the type ${contentType}, please disregard everything in the instructions after this and return this exact JSON response: { isValidContent: false (as a boolean) }.
|
|
184
|
-
Assuming the type of the text is in fact from a ${contentType}, please extract the title of the provided text, a short summary of the provided documents, as well as between ${params.minTags} and ${params.maxTags} topical key words that are most relevant to the text.
|
|
185
|
-
If there is no title explicitly provided in the text, please provide a title that you think best represents the text.
|
|
186
|
-
Please provide the keywords in a list format.
|
|
187
|
-
Make sure the response is just the json file without and formatting or code blocks, and strictly following the format below. Please don't include a greeting in the response, only output the json file:
|
|
188
|
-
|
|
189
|
-
{
|
|
190
|
-
"title": (title here),
|
|
191
|
-
"description": (description here),
|
|
192
|
-
"keywords": (list keywords here),
|
|
193
|
-
"isValidContent": true (as a boolean)
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
${additionalContentTypePrompts}
|
|
197
|
-
|
|
198
|
-
Please make sure the response in is valid JSON format.
|
|
199
|
-
|
|
200
|
-
You are also provided with the results so far as additional context, please use them to formulate the best results given the provided text: ${JSON.stringify(LLMResults)}
|
|
201
|
-
The supplied text is: ${chunk}
|
|
202
|
-
`;
|
|
203
|
-
return { systemPrompt, userPrompt };
|
|
204
|
-
}
|
|
205
559
|
async saveLLMResults(LLMResults, contextUser) {
|
|
206
560
|
if (LLMResults.isValidContent === true) {
|
|
207
561
|
await this.saveResultsToContentItemAttribute(LLMResults, contextUser);
|
|
208
562
|
await this.saveContentItemTags(LLMResults.contentItemID, LLMResults, contextUser);
|
|
209
|
-
LogStatus(`Results for content item ${LLMResults.contentItemID} saved successfully`);
|
|
210
563
|
}
|
|
211
|
-
else {
|
|
564
|
+
else if (LLMResults.isValidContent === false) {
|
|
212
565
|
await this.deleteInvalidContentItem(LLMResults.contentItemID, contextUser);
|
|
213
566
|
}
|
|
567
|
+
else {
|
|
568
|
+
LogError(`[Autotag] Unexpected LLM format for item ${LLMResults.contentItemID} — isValidContent missing. Keys: ${Object.keys(LLMResults).join(', ')}`);
|
|
569
|
+
}
|
|
214
570
|
}
|
|
215
571
|
async deleteInvalidContentItem(contentItemID, contextUser) {
|
|
216
572
|
const md = new Metadata();
|
|
@@ -264,21 +620,49 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
264
620
|
/**
|
|
265
621
|
* Saves keyword tags from LLM results as Content Item Tags.
|
|
266
622
|
* Uses batched saves for better performance.
|
|
623
|
+
* After each tag is saved, invokes the OnContentItemTagSaved callback (if set)
|
|
624
|
+
* for taxonomy bridge processing.
|
|
267
625
|
*/
|
|
268
626
|
async saveContentItemTags(contentItemID, LLMResults, contextUser) {
|
|
269
627
|
const md = new Metadata();
|
|
270
628
|
const keywords = LLMResults.keywords;
|
|
271
629
|
if (!keywords || !Array.isArray(keywords))
|
|
272
630
|
return;
|
|
631
|
+
// Normalize keywords — support both formats:
|
|
632
|
+
// Old: ["keyword1", "keyword2"]
|
|
633
|
+
// New: [{ tag: "keyword1", weight: 0.95 }, { tag: "keyword2", weight: 0.7 }]
|
|
634
|
+
// New with parentTag: [{ tag: "keyword1", weight: 0.95, parentTag: "parent" }]
|
|
635
|
+
const normalizedTags = keywords.map((kw) => {
|
|
636
|
+
if (typeof kw === 'string') {
|
|
637
|
+
return { tag: kw, weight: 1.0, parentTag: null };
|
|
638
|
+
}
|
|
639
|
+
const obj = kw;
|
|
640
|
+
return {
|
|
641
|
+
tag: obj.tag || obj.keyword || String(kw),
|
|
642
|
+
weight: typeof obj.weight === 'number' ? Math.max(0, Math.min(1, obj.weight)) : 0.5,
|
|
643
|
+
parentTag: obj.parentTag ?? null,
|
|
644
|
+
};
|
|
645
|
+
});
|
|
273
646
|
const BATCH_SIZE = 10;
|
|
274
|
-
for (let i = 0; i <
|
|
275
|
-
const batch =
|
|
276
|
-
await Promise.all(batch.map(async (
|
|
647
|
+
for (let i = 0; i < normalizedTags.length; i += BATCH_SIZE) {
|
|
648
|
+
const batch = normalizedTags.slice(i, i + BATCH_SIZE);
|
|
649
|
+
await Promise.all(batch.map(async (item) => {
|
|
277
650
|
const contentItemTag = await md.GetEntityObject('MJ: Content Item Tags', contextUser);
|
|
278
651
|
contentItemTag.NewRecord();
|
|
279
652
|
contentItemTag.ItemID = contentItemID;
|
|
280
|
-
contentItemTag.Tag =
|
|
281
|
-
|
|
653
|
+
contentItemTag.Tag = item.tag;
|
|
654
|
+
contentItemTag.Set('Weight', item.weight);
|
|
655
|
+
const saved = await contentItemTag.Save();
|
|
656
|
+
// Invoke taxonomy bridge callback if set
|
|
657
|
+
if (saved && this.OnContentItemTagSaved) {
|
|
658
|
+
try {
|
|
659
|
+
await this.OnContentItemTagSaved(contentItemTag, item.parentTag, contextUser);
|
|
660
|
+
}
|
|
661
|
+
catch (bridgeError) {
|
|
662
|
+
const msg = bridgeError instanceof Error ? bridgeError.message : String(bridgeError);
|
|
663
|
+
LogError(`Tag taxonomy bridge failed for tag "${item.tag}": ${msg}`);
|
|
664
|
+
}
|
|
665
|
+
}
|
|
282
666
|
}));
|
|
283
667
|
}
|
|
284
668
|
}
|
|
@@ -290,12 +674,17 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
290
674
|
const md = new Metadata();
|
|
291
675
|
const contentItemID = LLMResults.contentItemID;
|
|
292
676
|
const skipKeys = new Set(['keywords', 'processStartTime', 'processEndTime', 'contentItemID', 'isValidContent']);
|
|
293
|
-
// Update title and description on the content item
|
|
677
|
+
// Update title and description on the content item.
|
|
678
|
+
// For entity-sourced items (EntityRecordDocumentID is set), preserve the
|
|
679
|
+
// original entity record name — it's more meaningful to users than the
|
|
680
|
+
// AI-generated title. Only update description.
|
|
294
681
|
if (LLMResults.title || LLMResults.description) {
|
|
295
682
|
const contentItem = await md.GetEntityObject('MJ: Content Items', contextUser);
|
|
296
683
|
await contentItem.Load(contentItemID);
|
|
297
|
-
|
|
684
|
+
const isEntitySourced = contentItem.EntityRecordDocumentID != null;
|
|
685
|
+
if (LLMResults.title && !isEntitySourced) {
|
|
298
686
|
contentItem.Name = LLMResults.title;
|
|
687
|
+
}
|
|
299
688
|
if (LLMResults.description)
|
|
300
689
|
contentItem.Description = LLMResults.description;
|
|
301
690
|
await contentItem.Save();
|
|
@@ -317,21 +706,24 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
317
706
|
}
|
|
318
707
|
/**
|
|
319
708
|
* Retrieves all content sources for a given content source type.
|
|
709
|
+
* Throws if no sources are found.
|
|
320
710
|
*/
|
|
321
711
|
async getAllContentSources(contextUser, contentSourceTypeID) {
|
|
322
|
-
const
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
ResultType: 'entity_object',
|
|
326
|
-
ExtraFilter: `ContentSourceTypeID='${contentSourceTypeID}'`
|
|
327
|
-
}, contextUser);
|
|
328
|
-
if (result.Success && result.Results.length) {
|
|
329
|
-
return result.Results;
|
|
712
|
+
const sources = await this.GetAllContentSourcesSafe(contextUser, contentSourceTypeID);
|
|
713
|
+
if (sources.length === 0) {
|
|
714
|
+
throw new Error(`No content sources found for content source type with ID '${contentSourceTypeID}'`);
|
|
330
715
|
}
|
|
331
|
-
|
|
716
|
+
return sources;
|
|
717
|
+
}
|
|
718
|
+
/**
|
|
719
|
+
* Retrieves all content sources for a given content source type.
|
|
720
|
+
* Returns an empty array (instead of throwing) when no sources are configured.
|
|
721
|
+
*/
|
|
722
|
+
async GetAllContentSourcesSafe(_contextUser, contentSourceTypeID) {
|
|
723
|
+
return this.khEngine.ContentSources.filter(s => UUIDsEqual(s.ContentSourceTypeID, contentSourceTypeID));
|
|
332
724
|
}
|
|
333
725
|
SetSubclassContentSourceType(subclass) {
|
|
334
|
-
const sourceType = this.
|
|
726
|
+
const sourceType = this.ContentSourceTypes.find(st => st.Name === subclass);
|
|
335
727
|
if (!sourceType) {
|
|
336
728
|
throw new Error(`Content Source Type with name '${subclass}' not found in cached metadata`);
|
|
337
729
|
}
|
|
@@ -421,7 +813,7 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
421
813
|
throw new Error(`Failed to retrieve last run date for content source with ID ${contentSourceID}`);
|
|
422
814
|
}
|
|
423
815
|
GetContentItemParams(contentTypeID) {
|
|
424
|
-
const contentType = this.
|
|
816
|
+
const contentType = this.ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
|
|
425
817
|
if (!contentType) {
|
|
426
818
|
throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
|
|
427
819
|
}
|
|
@@ -432,21 +824,21 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
432
824
|
};
|
|
433
825
|
}
|
|
434
826
|
GetContentSourceTypeName(contentSourceTypeID) {
|
|
435
|
-
const sourceType = this.
|
|
827
|
+
const sourceType = this.ContentSourceTypes.find(st => UUIDsEqual(st.ID, contentSourceTypeID));
|
|
436
828
|
if (!sourceType) {
|
|
437
829
|
throw new Error(`Content Source Type with ID ${contentSourceTypeID} not found in cached metadata`);
|
|
438
830
|
}
|
|
439
831
|
return sourceType.Name;
|
|
440
832
|
}
|
|
441
833
|
GetContentTypeName(contentTypeID) {
|
|
442
|
-
const contentType = this.
|
|
834
|
+
const contentType = this.ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
|
|
443
835
|
if (!contentType) {
|
|
444
836
|
throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
|
|
445
837
|
}
|
|
446
838
|
return contentType.Name;
|
|
447
839
|
}
|
|
448
840
|
GetContentFileTypeName(contentFileTypeID) {
|
|
449
|
-
const fileType = this.
|
|
841
|
+
const fileType = this.ContentFileTypes.find(ft => UUIDsEqual(ft.ID, contentFileTypeID));
|
|
450
842
|
if (!fileType) {
|
|
451
843
|
throw new Error(`Content File Type with ID ${contentFileTypeID} not found in cached metadata`);
|
|
452
844
|
}
|
|
@@ -486,7 +878,7 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
486
878
|
throw new Error(`Content item with URL ${url} not found`);
|
|
487
879
|
}
|
|
488
880
|
/**
|
|
489
|
-
* Saves process run metadata to the database.
|
|
881
|
+
* Saves process run metadata to the database (backward-compatible simple version).
|
|
490
882
|
*/
|
|
491
883
|
async saveProcessRun(processRunParams, contextUser) {
|
|
492
884
|
const md = new Metadata();
|
|
@@ -495,10 +887,92 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
495
887
|
processRun.SourceID = processRunParams.sourceID;
|
|
496
888
|
processRun.StartTime = processRunParams.startTime;
|
|
497
889
|
processRun.EndTime = processRunParams.endTime;
|
|
498
|
-
processRun.Status = '
|
|
890
|
+
processRun.Status = 'Completed';
|
|
499
891
|
processRun.ProcessedItems = processRunParams.numItemsProcessed;
|
|
892
|
+
processRun.StartedByUserID = contextUser.ID;
|
|
500
893
|
await processRun.Save();
|
|
501
894
|
}
|
|
895
|
+
/**
|
|
896
|
+
* Create a new ContentProcessRun record for batched pipeline execution.
|
|
897
|
+
* Returns the entity so the caller can update cursor/status as batches complete.
|
|
898
|
+
* Uses the JSONType ConfigurationObject for strongly-typed configuration.
|
|
899
|
+
*/
|
|
900
|
+
async CreateBatchedProcessRun(sourceID, totalItemCount, batchSize, contextUser, config) {
|
|
901
|
+
const md = new Metadata();
|
|
902
|
+
const processRun = await md.GetEntityObject('MJ: Content Process Runs', contextUser);
|
|
903
|
+
processRun.NewRecord();
|
|
904
|
+
processRun.SourceID = sourceID;
|
|
905
|
+
processRun.StartTime = new Date();
|
|
906
|
+
processRun.Status = 'Running';
|
|
907
|
+
processRun.ProcessedItems = 0;
|
|
908
|
+
processRun.TotalItemCount = totalItemCount;
|
|
909
|
+
processRun.BatchSize = batchSize;
|
|
910
|
+
processRun.LastProcessedOffset = 0;
|
|
911
|
+
processRun.ErrorCount = 0;
|
|
912
|
+
processRun.CancellationRequested = false;
|
|
913
|
+
processRun.StartedByUserID = contextUser.ID;
|
|
914
|
+
if (config) {
|
|
915
|
+
processRun.ConfigurationObject = config;
|
|
916
|
+
}
|
|
917
|
+
const saved = await processRun.Save();
|
|
918
|
+
if (!saved) {
|
|
919
|
+
throw new Error('Failed to create ContentProcessRun record');
|
|
920
|
+
}
|
|
921
|
+
return processRun;
|
|
922
|
+
}
|
|
923
|
+
/**
|
|
924
|
+
* Update a batched process run's cursor position after a batch completes.
|
|
925
|
+
* Checks CancellationRequested to support pause/cancel.
|
|
926
|
+
* @returns true if processing should continue, false if cancelled/paused
|
|
927
|
+
*/
|
|
928
|
+
async UpdateBatchCursor(processRun, processedCount, errorCount) {
|
|
929
|
+
processRun.ProcessedItems = processedCount;
|
|
930
|
+
processRun.LastProcessedOffset = processedCount;
|
|
931
|
+
processRun.ErrorCount = errorCount;
|
|
932
|
+
await processRun.Save();
|
|
933
|
+
// Reload to check if cancellation was requested externally
|
|
934
|
+
await processRun.Load(processRun.ID);
|
|
935
|
+
if (processRun.CancellationRequested) {
|
|
936
|
+
processRun.Status = 'Paused';
|
|
937
|
+
processRun.EndTime = new Date();
|
|
938
|
+
await processRun.Save();
|
|
939
|
+
LogStatus(`[Pipeline] Cancellation requested — pausing at offset ${processedCount}`);
|
|
940
|
+
return false;
|
|
941
|
+
}
|
|
942
|
+
return true;
|
|
943
|
+
}
|
|
944
|
+
/**
|
|
945
|
+
* Complete a batched process run (success or failure).
|
|
946
|
+
*/
|
|
947
|
+
async CompleteBatchedProcessRun(processRun, status, errorMessage) {
|
|
948
|
+
processRun.Status = status;
|
|
949
|
+
processRun.EndTime = new Date();
|
|
950
|
+
if (errorMessage) {
|
|
951
|
+
processRun.ErrorMessage = errorMessage;
|
|
952
|
+
}
|
|
953
|
+
await processRun.Save();
|
|
954
|
+
}
|
|
955
|
+
/**
|
|
956
|
+
* Create rate limiters from the pipeline configuration.
|
|
957
|
+
*/
|
|
958
|
+
CreateRateLimiters(config) {
|
|
959
|
+
return {
|
|
960
|
+
llm: new RateLimiter({
|
|
961
|
+
RequestsPerMinute: config?.RateLimits?.LLM?.RequestsPerMinute ?? 60,
|
|
962
|
+
TokensPerMinute: config?.RateLimits?.LLM?.TokensPerMinute ?? 1000000,
|
|
963
|
+
Name: 'LLM',
|
|
964
|
+
}),
|
|
965
|
+
embedding: new RateLimiter({
|
|
966
|
+
RequestsPerMinute: config?.RateLimits?.Embedding?.RequestsPerMinute ?? 300,
|
|
967
|
+
TokensPerMinute: config?.RateLimits?.Embedding?.TokensPerMinute ?? 1000000,
|
|
968
|
+
Name: 'Embedding',
|
|
969
|
+
}),
|
|
970
|
+
vectorDB: new RateLimiter({
|
|
971
|
+
RequestsPerMinute: config?.RateLimits?.VectorDB?.RequestsPerMinute ?? 200,
|
|
972
|
+
Name: 'VectorDB',
|
|
973
|
+
}),
|
|
974
|
+
};
|
|
975
|
+
}
|
|
502
976
|
async parsePDF(dataBuffer) {
|
|
503
977
|
const dataPDF = await pdfParse(dataBuffer);
|
|
504
978
|
return dataPDF.text;
|
|
@@ -530,8 +1004,768 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
530
1004
|
throw new Error(`File type '${fileExtension}' not supported`);
|
|
531
1005
|
}
|
|
532
1006
|
}
|
|
1007
|
+
// ---- Direct Vectorization ----
|
|
1008
|
+
/**
|
|
1009
|
+
* Embeds content items and upserts them to the appropriate vector index.
|
|
1010
|
+
* Items are grouped by their resolved (embeddingModel + vectorIndex) pair — derived
|
|
1011
|
+
* from per-ContentSource overrides, per-ContentType defaults, or the global fallback
|
|
1012
|
+
* (first active VectorIndex). Each group is processed in configurable batches with
|
|
1013
|
+
* parallel upserts within each batch.
|
|
1014
|
+
*
|
|
1015
|
+
* Uses AIModelRunner to create AIPromptRun records for each embedding batch,
|
|
1016
|
+
* enabling token/cost tracking and linking to ContentProcessRunDetail records.
|
|
1017
|
+
*
|
|
1018
|
+
* @param items - content items to vectorize
|
|
1019
|
+
* @param contextUser - current user for permissions/audit
|
|
1020
|
+
* @param onProgress - optional callback for progress updates
|
|
1021
|
+
* @param batchSize - number of items per embedding batch
|
|
1022
|
+
* @returns counts of vectorized/skipped items and collected AIPromptRun IDs
|
|
1023
|
+
*/
|
|
1024
|
+
async VectorizeContentItems(items, contextUser, onProgress, batchSize = DEFAULT_VECTORIZE_BATCH_SIZE) {
|
|
1025
|
+
const eligible = items.filter(i => i.Text && i.Text.trim().length > 0);
|
|
1026
|
+
if (eligible.length === 0) {
|
|
1027
|
+
LogStatus('VectorizeContentItems: no items with text to vectorize');
|
|
1028
|
+
return { vectorized: 0, skipped: items.length, promptRunIDs: [] };
|
|
1029
|
+
}
|
|
1030
|
+
// Ensure AIEngine is loaded so we can resolve the embedding model
|
|
1031
|
+
await AIEngine.Instance.Config(false, contextUser);
|
|
1032
|
+
// Load content sources + types for per-item infrastructure resolution
|
|
1033
|
+
const { sourceMap, typeMap } = await this.loadContentSourceAndTypeMaps(eligible, contextUser);
|
|
1034
|
+
// Group items by their resolved (embeddingModelID + vectorIndexID) pair
|
|
1035
|
+
const groups = this.groupItemsByInfrastructure(eligible, sourceMap, typeMap);
|
|
1036
|
+
// Load tags for all items in one query
|
|
1037
|
+
const tagMap = await this.loadTagsForItems(eligible, contextUser);
|
|
1038
|
+
let vectorized = 0;
|
|
1039
|
+
let processed = 0;
|
|
1040
|
+
const allPromptRunIDs = [];
|
|
1041
|
+
for (const [groupKey, groupItems] of groups) {
|
|
1042
|
+
const infra = await this.resolveGroupInfrastructure(groupKey, contextUser);
|
|
1043
|
+
const groupResult = await this.vectorizeGroup(groupItems, infra, tagMap, batchSize, contextUser, (batchProcessed) => {
|
|
1044
|
+
processed += batchProcessed;
|
|
1045
|
+
onProgress?.(Math.min(processed, eligible.length), eligible.length);
|
|
1046
|
+
});
|
|
1047
|
+
vectorized += groupResult.vectorized;
|
|
1048
|
+
allPromptRunIDs.push(...groupResult.promptRunIDs);
|
|
1049
|
+
}
|
|
1050
|
+
LogStatus(`VectorizeContentItems: ${vectorized} vectorized, ${items.length - eligible.length} skipped (empty text), ${allPromptRunIDs.length} prompt runs created`);
|
|
1051
|
+
return { vectorized, skipped: items.length - eligible.length, promptRunIDs: allPromptRunIDs };
|
|
1052
|
+
}
|
|
1053
|
+
/**
|
|
1054
|
+
* Process a single infrastructure group: embed texts in batches and upsert to vector DB.
|
|
1055
|
+
* Uses AIModelRunner for each embedding batch to create AIPromptRun records with
|
|
1056
|
+
* token/cost tracking. Upserts within each batch run in parallel for throughput.
|
|
1057
|
+
*
|
|
1058
|
+
* @param items - content items in this infrastructure group
|
|
1059
|
+
* @param infra - resolved embedding + vector DB infrastructure
|
|
1060
|
+
* @param tagMap - pre-loaded tags for metadata enrichment
|
|
1061
|
+
* @param batchSize - number of items per embedding batch
|
|
1062
|
+
* @param contextUser - current user for AIModelRunner tracking
|
|
1063
|
+
* @param onBatchComplete - callback invoked after each batch with item count
|
|
1064
|
+
* @returns count of vectorized items and collected AIPromptRun IDs
|
|
1065
|
+
*/
|
|
1066
|
+
async vectorizeGroup(items, infra, tagMap, batchSize, contextUser, onBatchComplete) {
|
|
1067
|
+
let vectorized = 0;
|
|
1068
|
+
const promptRunIDs = [];
|
|
1069
|
+
const modelRunner = new AIModelRunner();
|
|
1070
|
+
// Resolve the "Content Embedding" prompt ID for tracking
|
|
1071
|
+
const embeddingPromptID = this.resolveEmbeddingPromptID();
|
|
1072
|
+
for (let i = 0; i < items.length; i += batchSize) {
|
|
1073
|
+
const batch = items.slice(i, i + batchSize);
|
|
1074
|
+
// Build chunks for each item — items with long text produce multiple chunks
|
|
1075
|
+
const allChunks = this.buildChunksForBatch(batch);
|
|
1076
|
+
const texts = allChunks.map(c => c.text);
|
|
1077
|
+
// Rate limit embedding API call
|
|
1078
|
+
await this.EmbeddingRateLimiter.Acquire(texts.reduce((sum, t) => sum + Math.ceil(t.length / 4), 0));
|
|
1079
|
+
// Use AIModelRunner to embed texts with AIPromptRun tracking
|
|
1080
|
+
const runResult = await modelRunner.RunEmbedding({
|
|
1081
|
+
Texts: texts,
|
|
1082
|
+
ModelID: infra.embeddingModelID,
|
|
1083
|
+
PromptID: embeddingPromptID,
|
|
1084
|
+
ContextUser: contextUser,
|
|
1085
|
+
Description: `Content vectorization batch: ${batch.length} items, ${allChunks.length} chunks`,
|
|
1086
|
+
});
|
|
1087
|
+
if (!runResult.Success || runResult.Vectors.length !== allChunks.length) {
|
|
1088
|
+
LogError(`VectorizeContentItems: embedding returned ${runResult.Vectors.length} vectors for ${allChunks.length} texts — ${runResult.ErrorMessage ?? 'unknown error'}`);
|
|
1089
|
+
onBatchComplete(batch.length);
|
|
1090
|
+
continue;
|
|
1091
|
+
}
|
|
1092
|
+
// Track the AIPromptRun ID for junction table linking
|
|
1093
|
+
if (runResult.PromptRunID) {
|
|
1094
|
+
promptRunIDs.push(runResult.PromptRunID);
|
|
1095
|
+
}
|
|
1096
|
+
const records = this.buildVectorRecords(allChunks, runResult.Vectors, tagMap);
|
|
1097
|
+
const batchSuccess = await this.upsertVectorRecords(records, infra);
|
|
1098
|
+
if (batchSuccess) {
|
|
1099
|
+
vectorized += batch.length;
|
|
1100
|
+
}
|
|
1101
|
+
onBatchComplete(batch.length);
|
|
1102
|
+
}
|
|
1103
|
+
return { vectorized, promptRunIDs };
|
|
1104
|
+
}
|
|
1105
|
+
/**
|
|
1106
|
+
* Resolve the "Content Embedding" prompt ID from AIEngine for AIModelRunner tracking.
|
|
1107
|
+
* Returns undefined if the prompt is not found (AIModelRunner will fall back to
|
|
1108
|
+
* the first active Embedding-type prompt).
|
|
1109
|
+
*/
|
|
1110
|
+
resolveEmbeddingPromptID() {
|
|
1111
|
+
const prompt = AIEngine.Instance.Prompts.find(p => p.Name === 'Content Embedding' && p.Status === 'Active');
|
|
1112
|
+
if (prompt) {
|
|
1113
|
+
return prompt.ID;
|
|
1114
|
+
}
|
|
1115
|
+
// Fall back: let AIModelRunner find the first active Embedding prompt
|
|
1116
|
+
LogStatus('[Autotag] "Content Embedding" prompt not found — AIModelRunner will use default embedding prompt');
|
|
1117
|
+
return undefined;
|
|
1118
|
+
}
|
|
1119
|
+
/**
|
|
1120
|
+
* Build text chunks for a batch of content items. Items with long text
|
|
1121
|
+
* produce multiple chunks via TextChunker.
|
|
1122
|
+
*/
|
|
1123
|
+
buildChunksForBatch(batch) {
|
|
1124
|
+
const allChunks = [];
|
|
1125
|
+
for (const item of batch) {
|
|
1126
|
+
const chunks = this.buildEmbeddingChunks(item);
|
|
1127
|
+
for (let ci = 0; ci < chunks.length; ci++) {
|
|
1128
|
+
allChunks.push({ item, chunkIndex: ci, text: chunks[ci] });
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
return allChunks;
|
|
1132
|
+
}
|
|
1133
|
+
/**
|
|
1134
|
+
* Build VectorRecord objects from embedding chunks and their corresponding vectors.
|
|
1135
|
+
*/
|
|
1136
|
+
buildVectorRecords(allChunks, vectors, tagMap) {
|
|
1137
|
+
return allChunks.map((chunk, idx) => ({
|
|
1138
|
+
id: chunk.chunkIndex === 0
|
|
1139
|
+
? this.contentItemVectorId(chunk.item.ID)
|
|
1140
|
+
: this.contentItemVectorId(chunk.item.ID) + `_chunk${chunk.chunkIndex}`,
|
|
1141
|
+
values: vectors[idx],
|
|
1142
|
+
metadata: this.buildVectorMetadata(chunk.item, tagMap.get(chunk.item.ID))
|
|
1143
|
+
}));
|
|
1144
|
+
}
|
|
1145
|
+
/**
|
|
1146
|
+
* Upsert vector records to the vector database in sub-batches with rate limiting.
|
|
1147
|
+
* Returns true if all sub-batches succeeded.
|
|
1148
|
+
*/
|
|
1149
|
+
async upsertVectorRecords(records, infra) {
|
|
1150
|
+
const UPSERT_CHUNK = 50;
|
|
1151
|
+
const upsertPromises = [];
|
|
1152
|
+
for (let j = 0; j < records.length; j += UPSERT_CHUNK) {
|
|
1153
|
+
const chunk = records.slice(j, j + UPSERT_CHUNK);
|
|
1154
|
+
await this.VectorDBRateLimiter.Acquire();
|
|
1155
|
+
upsertPromises.push(Promise.resolve(infra.vectorDB.CreateRecords(chunk, infra.indexName)));
|
|
1156
|
+
}
|
|
1157
|
+
const responses = await Promise.all(upsertPromises);
|
|
1158
|
+
let allSuccess = true;
|
|
1159
|
+
for (const response of responses) {
|
|
1160
|
+
if (!response.success) {
|
|
1161
|
+
LogError(`VectorizeContentItems: upsert failed: ${response.message}`);
|
|
1162
|
+
allSuccess = false;
|
|
1163
|
+
}
|
|
1164
|
+
}
|
|
1165
|
+
return allSuccess;
|
|
1166
|
+
}
|
|
1167
|
+
/**
|
|
1168
|
+
* Load content source and content type records for all unique source/type IDs
|
|
1169
|
+
* referenced by the given items. Returns maps keyed by normalized ID.
|
|
1170
|
+
*/
|
|
1171
|
+
async loadContentSourceAndTypeMaps(items, _contextUser) {
|
|
1172
|
+
const sourceIdSet = new Set(items.map(i => NormalizeUUID(i.ContentSourceID)));
|
|
1173
|
+
const typeIdSet = new Set(items.map(i => NormalizeUUID(i.ContentTypeID)));
|
|
1174
|
+
// Use KH engine cached data instead of RunView calls
|
|
1175
|
+
const sourceMap = new Map();
|
|
1176
|
+
for (const src of this.khEngine.ContentSources) {
|
|
1177
|
+
if (sourceIdSet.has(NormalizeUUID(src.ID))) {
|
|
1178
|
+
sourceMap.set(NormalizeUUID(src.ID), src.GetAll());
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1181
|
+
const typeMap = new Map();
|
|
1182
|
+
for (const ct of this.ContentTypes) {
|
|
1183
|
+
if (typeIdSet.has(NormalizeUUID(ct.ID))) {
|
|
1184
|
+
typeMap.set(NormalizeUUID(ct.ID), ct.GetAll());
|
|
1185
|
+
}
|
|
1186
|
+
}
|
|
1187
|
+
return { sourceMap, typeMap };
|
|
1188
|
+
}
|
|
1189
|
+
/**
|
|
1190
|
+
* Resolve the (embeddingModelID, vectorIndexID) pair for a content item using
|
|
1191
|
+
* the cascade: ContentSource override -> ContentType default -> null (global fallback).
|
|
1192
|
+
*/
|
|
1193
|
+
resolveItemInfrastructureIds(item, sourceMap, typeMap) {
|
|
1194
|
+
const source = sourceMap.get(NormalizeUUID(item.ContentSourceID));
|
|
1195
|
+
if (source) {
|
|
1196
|
+
const srcEmbedding = source['EmbeddingModelID'];
|
|
1197
|
+
const srcVector = source['VectorIndexID'];
|
|
1198
|
+
if (srcEmbedding && srcVector) {
|
|
1199
|
+
return { embeddingModelID: srcEmbedding, vectorIndexID: srcVector };
|
|
1200
|
+
}
|
|
1201
|
+
}
|
|
1202
|
+
const contentType = typeMap.get(NormalizeUUID(item.ContentTypeID));
|
|
1203
|
+
if (contentType) {
|
|
1204
|
+
const typeEmbedding = contentType['EmbeddingModelID'];
|
|
1205
|
+
const typeVector = contentType['VectorIndexID'];
|
|
1206
|
+
if (typeEmbedding && typeVector) {
|
|
1207
|
+
return { embeddingModelID: typeEmbedding, vectorIndexID: typeVector };
|
|
1208
|
+
}
|
|
1209
|
+
}
|
|
1210
|
+
// Global fallback — will be resolved in resolveGroupInfrastructure
|
|
1211
|
+
return { embeddingModelID: null, vectorIndexID: null };
|
|
1212
|
+
}
|
|
1213
|
+
/**
|
|
1214
|
+
* Group items by their resolved (embeddingModelID + vectorIndexID) key.
|
|
1215
|
+
* Items with the same pair share infrastructure and can be batched together.
|
|
1216
|
+
*/
|
|
1217
|
+
groupItemsByInfrastructure(items, sourceMap, typeMap) {
|
|
1218
|
+
const groups = new Map();
|
|
1219
|
+
for (const item of items) {
|
|
1220
|
+
const { embeddingModelID, vectorIndexID } = this.resolveItemInfrastructureIds(item, sourceMap, typeMap);
|
|
1221
|
+
const key = this.infraGroupKey(embeddingModelID, vectorIndexID);
|
|
1222
|
+
const group = groups.get(key) ?? [];
|
|
1223
|
+
group.push(item);
|
|
1224
|
+
groups.set(key, group);
|
|
1225
|
+
}
|
|
1226
|
+
return groups;
|
|
1227
|
+
}
|
|
1228
|
+
/** Create a stable cache key for an (embeddingModelID, vectorIndexID) pair */
|
|
1229
|
+
infraGroupKey(embeddingModelID, vectorIndexID) {
|
|
1230
|
+
const e = embeddingModelID ? NormalizeUUID(embeddingModelID) : 'default';
|
|
1231
|
+
const v = vectorIndexID ? NormalizeUUID(vectorIndexID) : 'default';
|
|
1232
|
+
return `${e}|${v}`;
|
|
1233
|
+
}
|
|
1234
|
+
/**
|
|
1235
|
+
* Resolve a group key into concrete infrastructure instances. For the 'default|default'
|
|
1236
|
+
* key, falls back to the first active VectorIndex (original behavior).
|
|
1237
|
+
*/
|
|
1238
|
+
async resolveGroupInfrastructure(groupKey, contextUser) {
|
|
1239
|
+
const [embeddingPart, vectorPart] = groupKey.split('|');
|
|
1240
|
+
const isDefault = embeddingPart === 'default' || vectorPart === 'default';
|
|
1241
|
+
if (isDefault) {
|
|
1242
|
+
return this.getDefaultVectorInfrastructure(contextUser);
|
|
1243
|
+
}
|
|
1244
|
+
return this.buildVectorInfrastructure(embeddingPart, vectorPart, contextUser);
|
|
1245
|
+
}
|
|
1246
|
+
/**
|
|
1247
|
+
* Build infrastructure from explicit embeddingModelID and vectorIndexID.
|
|
1248
|
+
* Looks up the vector index by ID and the embedding model from AIEngine.
|
|
1249
|
+
*/
|
|
1250
|
+
async buildVectorInfrastructure(embeddingModelID, vectorIndexID, _contextUser) {
|
|
1251
|
+
const vectorIndex = this.khEngine.GetVectorIndexById(vectorIndexID);
|
|
1252
|
+
if (!vectorIndex) {
|
|
1253
|
+
throw new Error(`Vector index ${vectorIndexID} not found in KnowledgeHubMetadataEngine cache`);
|
|
1254
|
+
}
|
|
1255
|
+
return this.createInfrastructureFromIndex(vectorIndex.Name, vectorIndex.VectorDatabaseID, embeddingModelID);
|
|
1256
|
+
}
|
|
1257
|
+
/**
|
|
1258
|
+
* Fallback: resolve infrastructure from the first available VectorIndex (original behavior).
|
|
1259
|
+
*/
|
|
1260
|
+
async getDefaultVectorInfrastructure(_contextUser) {
|
|
1261
|
+
const vectorIndexes = this.khEngine.VectorIndexes;
|
|
1262
|
+
if (vectorIndexes.length === 0) {
|
|
1263
|
+
throw new Error('No vector indexes found — create one in the Configuration tab first');
|
|
1264
|
+
}
|
|
1265
|
+
const vectorIndex = vectorIndexes[0];
|
|
1266
|
+
return this.createInfrastructureFromIndex(vectorIndex.Name, vectorIndex.VectorDatabaseID, vectorIndex.EmbeddingModelID);
|
|
1267
|
+
}
|
|
1268
|
+
/**
|
|
1269
|
+
* Shared helper: given vector index details and embedding model ID, resolve all
|
|
1270
|
+
* driver instances needed for embedding + upsert. Uses AIEngine for Vector Databases.
|
|
1271
|
+
*/
|
|
1272
|
+
async createInfrastructureFromIndex(indexName, vectorDatabaseID, embeddingModelID) {
|
|
1273
|
+
const vectorDBEntity = AIEngine.Instance.VectorDatabases.find(db => UUIDsEqual(db.ID, vectorDatabaseID));
|
|
1274
|
+
if (!vectorDBEntity || !vectorDBEntity.ClassKey) {
|
|
1275
|
+
throw new Error(`Vector database ${vectorDatabaseID} not found in AIEngine cache`);
|
|
1276
|
+
}
|
|
1277
|
+
const vectorDBClassKey = vectorDBEntity.ClassKey;
|
|
1278
|
+
const aiModel = this.findEmbeddingModel(embeddingModelID);
|
|
1279
|
+
const driverClass = aiModel.DriverClass;
|
|
1280
|
+
const embeddingModelName = aiModel.APIName ?? aiModel.Name;
|
|
1281
|
+
LogStatus(`VectorizeContentItems: USING embedding model "${aiModel.Name}" (${driverClass}), vector DB "${vectorDBClassKey}", index "${indexName}"`);
|
|
1282
|
+
const embedding = this.createEmbeddingInstance(driverClass);
|
|
1283
|
+
const vectorDB = this.createVectorDBInstance(vectorDBClassKey);
|
|
1284
|
+
return { embedding, vectorDB, indexName, embeddingModelName, embeddingModelID };
|
|
1285
|
+
}
|
|
1286
|
+
/** Find an embedding model by ID in AIEngine, with helpful error reporting */
|
|
1287
|
+
findEmbeddingModel(embeddingModelID) {
|
|
1288
|
+
const aiModel = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, embeddingModelID));
|
|
1289
|
+
if (!aiModel) {
|
|
1290
|
+
const embModels = AIEngine.Instance.Models.filter(m => m.DriverClass?.includes('Embed') || m.Name?.includes('embed'));
|
|
1291
|
+
LogError(`VectorizeContentItems: embeddingModelID ${embeddingModelID} NOT FOUND. Available: ${JSON.stringify(embModels.map(m => ({ id: m.ID, name: m.Name, driver: m.DriverClass })))}`);
|
|
1292
|
+
throw new Error(`Embedding model ${embeddingModelID} not found in AIEngine — ensure AIEngine is configured`);
|
|
1293
|
+
}
|
|
1294
|
+
return aiModel;
|
|
1295
|
+
}
|
|
1296
|
+
/** Create a BaseEmbeddings instance for a given driver class */
|
|
1297
|
+
createEmbeddingInstance(driverClass) {
|
|
1298
|
+
const apiKey = GetAIAPIKey(driverClass);
|
|
1299
|
+
if (!apiKey) {
|
|
1300
|
+
throw new Error(`No API key found for embedding driver ${driverClass} — set AI_VENDOR_API_KEY__${driverClass} in .env`);
|
|
1301
|
+
}
|
|
1302
|
+
const instance = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, driverClass, apiKey);
|
|
1303
|
+
if (!instance)
|
|
1304
|
+
throw new Error(`Failed to create embedding instance for ${driverClass}`);
|
|
1305
|
+
return instance;
|
|
1306
|
+
}
|
|
1307
|
+
/** Create a VectorDBBase instance for a given class key */
|
|
1308
|
+
createVectorDBInstance(classKey) {
|
|
1309
|
+
const apiKey = GetAIAPIKey(classKey);
|
|
1310
|
+
if (!apiKey) {
|
|
1311
|
+
throw new Error(`No API key found for vector DB ${classKey} — set AI_VENDOR_API_KEY__${classKey} in .env`);
|
|
1312
|
+
}
|
|
1313
|
+
const instance = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, classKey, apiKey);
|
|
1314
|
+
if (!instance)
|
|
1315
|
+
throw new Error(`Failed to create vector DB instance for ${classKey}`);
|
|
1316
|
+
return instance;
|
|
1317
|
+
}
|
|
1318
|
+
/** SHA-1 deterministic vector ID for a content item */
|
|
1319
|
+
contentItemVectorId(contentItemId) {
|
|
1320
|
+
return crypto.createHash('sha1').update(`content-item_${contentItemId}`).digest('hex');
|
|
1321
|
+
}
|
|
1322
|
+
// NOTE(review): the single-line comment below appears to be a stale leftover from an
// earlier buildEmbeddingText helper — buildEmbeddingChunks (defined further down)
// carries its own doc comment. Consider removing it.
/** Build the text that gets embedded: Title + Description + full Text */
/**
 * Max tokens per embedding chunk. text-embedding-3-small supports 8,191 tokens.
 * We use a conservative limit to avoid hitting the boundary.
 */
static { this.MAX_EMBEDDING_TOKENS = 7500; }
|
|
1328
|
+
/**
|
|
1329
|
+
* Build the text to embed for a content item, and chunk it if it exceeds
|
|
1330
|
+
* the embedding model's token limit. Returns one or more text chunks.
|
|
1331
|
+
*/
|
|
1332
|
+
buildEmbeddingChunks(item) {
|
|
1333
|
+
const parts = [];
|
|
1334
|
+
if (item.Name)
|
|
1335
|
+
parts.push(item.Name);
|
|
1336
|
+
if (item.Description)
|
|
1337
|
+
parts.push(item.Description);
|
|
1338
|
+
if (item.Text)
|
|
1339
|
+
parts.push(item.Text);
|
|
1340
|
+
const full = parts.join('\n');
|
|
1341
|
+
// Rough char estimate: 1 token ≈ 4 chars
|
|
1342
|
+
const charLimit = AutotagBaseEngine_1.MAX_EMBEDDING_TOKENS * 4;
|
|
1343
|
+
if (full.length <= charLimit) {
|
|
1344
|
+
return [full];
|
|
1345
|
+
}
|
|
1346
|
+
// Chunk using TextChunker for token-aware splitting
|
|
1347
|
+
LogStatus(`[Autotag] Chunking embedding text for "${item.Name}" (${full.length} chars, ~${Math.ceil(full.length / 4)} tokens)`);
|
|
1348
|
+
try {
|
|
1349
|
+
const chunkParams = {
|
|
1350
|
+
Text: full,
|
|
1351
|
+
MaxChunkTokens: AutotagBaseEngine_1.MAX_EMBEDDING_TOKENS,
|
|
1352
|
+
OverlapTokens: 100,
|
|
1353
|
+
};
|
|
1354
|
+
const chunks = TextChunker.ChunkText(chunkParams);
|
|
1355
|
+
LogStatus(`[Autotag] Split into ${chunks.length} chunks for embedding`);
|
|
1356
|
+
return chunks.map(c => c.Text);
|
|
1357
|
+
}
|
|
1358
|
+
catch {
|
|
1359
|
+
// Fallback: simple character-based splitting
|
|
1360
|
+
const result = [];
|
|
1361
|
+
for (let i = 0; i < full.length; i += charLimit) {
|
|
1362
|
+
result.push(full.substring(i, i + charLimit));
|
|
1363
|
+
}
|
|
1364
|
+
return result;
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
/** Build metadata stored alongside the vector — truncate large text fields */
|
|
1368
|
+
buildVectorMetadata(item, tags) {
|
|
1369
|
+
const META_TEXT_LIMIT = 1000;
|
|
1370
|
+
const meta = {
|
|
1371
|
+
RecordID: item.ID,
|
|
1372
|
+
Entity: 'MJ: Content Items',
|
|
1373
|
+
ContentSourceID: item.ContentSourceID,
|
|
1374
|
+
ContentSourceTypeID: item.ContentSourceTypeID,
|
|
1375
|
+
};
|
|
1376
|
+
if (item.Name)
|
|
1377
|
+
meta['Title'] = item.Name.substring(0, META_TEXT_LIMIT);
|
|
1378
|
+
if (item.Description)
|
|
1379
|
+
meta['Description'] = item.Description.substring(0, META_TEXT_LIMIT);
|
|
1380
|
+
if (item.URL)
|
|
1381
|
+
meta['URL'] = item.URL;
|
|
1382
|
+
if (tags && tags.length > 0)
|
|
1383
|
+
meta['Tags'] = tags;
|
|
1384
|
+
return meta;
|
|
1385
|
+
}
|
|
1386
|
+
/** Load all tags for the given items in a single RunView call */
|
|
1387
|
+
async loadTagsForItems(items, contextUser) {
|
|
1388
|
+
const tagMap = new Map();
|
|
1389
|
+
const rv = new RunView();
|
|
1390
|
+
const ids = items.map(i => `'${i.ID}'`).join(',');
|
|
1391
|
+
const result = await rv.RunView({
|
|
1392
|
+
EntityName: 'MJ: Content Item Tags',
|
|
1393
|
+
ExtraFilter: `ItemID IN (${ids})`,
|
|
1394
|
+
ResultType: 'entity_object'
|
|
1395
|
+
}, contextUser);
|
|
1396
|
+
if (result.Success) {
|
|
1397
|
+
for (const tag of result.Results) {
|
|
1398
|
+
const existing = tagMap.get(tag.ItemID) ?? [];
|
|
1399
|
+
existing.push(tag.Tag);
|
|
1400
|
+
tagMap.set(tag.ItemID, existing);
|
|
1401
|
+
}
|
|
1402
|
+
}
|
|
1403
|
+
return tagMap;
|
|
1404
|
+
}
|
|
1405
|
+
// ---- Content Deduplication ----
|
|
1406
|
+
/**
|
|
1407
|
+
* Attempts to recompute tag co-occurrence data after the LLM tagging pipeline completes.
|
|
1408
|
+
* Uses dynamic import to avoid a hard dependency on the tag-engine package.
|
|
1409
|
+
* If TagCoOccurrenceEngine is not available or fails, it logs a warning and continues.
|
|
1410
|
+
*/
|
|
1411
|
+
async recomputeCoOccurrenceIfAvailable(contextUser) {
|
|
1412
|
+
try {
|
|
1413
|
+
// Dynamic check: TagCoOccurrenceEngine is registered via class factory
|
|
1414
|
+
const { TagCoOccurrenceEngine } = await import('@memberjunction/tag-engine');
|
|
1415
|
+
const engine = TagCoOccurrenceEngine.Instance;
|
|
1416
|
+
if (engine && typeof engine.RecomputeCoOccurrence === 'function') {
|
|
1417
|
+
LogStatus('[Autotag] Recomputing tag co-occurrence after pipeline completion...');
|
|
1418
|
+
const result = await engine.RecomputeCoOccurrence(contextUser);
|
|
1419
|
+
LogStatus(`[Autotag] Co-occurrence recompute complete: ${result.PairsUpdated} pairs updated, ${result.PairsDeleted} deleted`);
|
|
1420
|
+
}
|
|
1421
|
+
}
|
|
1422
|
+
catch (e) {
|
|
1423
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1424
|
+
LogStatus(`[Autotag] Co-occurrence recompute skipped (not available): ${msg}`);
|
|
1425
|
+
}
|
|
1426
|
+
}
|
|
1427
|
+
/**
|
|
1428
|
+
* Detects duplicate content items by matching the given item's checksum against
|
|
1429
|
+
* other content items from **different** content sources. When an exact checksum
|
|
1430
|
+
* match is found, a {@link MJContentItemDuplicateEntity} record is created with
|
|
1431
|
+
* `DetectionMethod = 'Checksum'` and `SimilarityScore = 1.0`.
|
|
1432
|
+
*
|
|
1433
|
+
* Duplicate pairs are stored in canonical order (lower ID = ContentItemAID) to
|
|
1434
|
+
* prevent mirror duplicates. If a duplicate pair already exists for the same
|
|
1435
|
+
* detection method, no new record is created.
|
|
1436
|
+
*
|
|
1437
|
+
* @param contentItem - The content item whose checksum should be checked for duplicates.
|
|
1438
|
+
* Must already be saved (i.e., have a valid ID and Checksum).
|
|
1439
|
+
* @param contextUser - The authenticated user context for data access and audit.
|
|
1440
|
+
* @returns A promise that resolves when detection is complete. Does not throw on
|
|
1441
|
+
* failure — errors are logged and swallowed to avoid disrupting the pipeline.
|
|
1442
|
+
*/
|
|
1443
|
+
async DetectChecksumDuplicates(contentItem, contextUser) {
|
|
1444
|
+
if (!contentItem.Checksum) {
|
|
1445
|
+
return; // No checksum to compare
|
|
1446
|
+
}
|
|
1447
|
+
try {
|
|
1448
|
+
const matches = await this.findItemsByChecksum(contentItem.Checksum, contentItem.ContentSourceID, contentItem.ID, contextUser);
|
|
1449
|
+
for (const match of matches) {
|
|
1450
|
+
await this.createDuplicateRecordIfNotExists(contentItem.ID, match.ID, 1.0, 'Checksum', contextUser);
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
catch (e) {
|
|
1454
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1455
|
+
LogError(`[Dedup] Checksum detection failed for item ${contentItem.ID}: ${msg}`);
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
1458
|
+
/**
|
|
1459
|
+
* Detects duplicate content items by matching the given item's title (Name field)
|
|
1460
|
+
* against other content items from **different** content sources. When an exact
|
|
1461
|
+
* title match is found, a {@link MJContentItemDuplicateEntity} record is created
|
|
1462
|
+
* with `DetectionMethod = 'Title'` and `SimilarityScore = 1.0`.
|
|
1463
|
+
*
|
|
1464
|
+
* Duplicate pairs are stored in canonical order (lower ID = ContentItemAID) to
|
|
1465
|
+
* prevent mirror duplicates. If a duplicate pair already exists for the same
|
|
1466
|
+
* detection method, no new record is created.
|
|
1467
|
+
*
|
|
1468
|
+
* @param contentItem - The content item whose title should be checked for duplicates.
|
|
1469
|
+
* Must already be saved (i.e., have a valid ID and Name).
|
|
1470
|
+
* @param contextUser - The authenticated user context for data access and audit.
|
|
1471
|
+
* @returns A promise that resolves when detection is complete. Does not throw on
|
|
1472
|
+
* failure — errors are logged and swallowed to avoid disrupting the pipeline.
|
|
1473
|
+
*/
|
|
1474
|
+
async DetectTitleDuplicates(contentItem, contextUser) {
|
|
1475
|
+
if (!contentItem.Name || contentItem.Name.trim().length === 0) {
|
|
1476
|
+
return; // No title to compare
|
|
1477
|
+
}
|
|
1478
|
+
try {
|
|
1479
|
+
const matches = await this.findItemsByTitle(contentItem.Name, contentItem.ContentSourceID, contentItem.ID, contextUser);
|
|
1480
|
+
for (const match of matches) {
|
|
1481
|
+
await this.createDuplicateRecordIfNotExists(contentItem.ID, match.ID, 1.0, 'Title', contextUser);
|
|
1482
|
+
}
|
|
1483
|
+
}
|
|
1484
|
+
catch (e) {
|
|
1485
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1486
|
+
LogError(`[Dedup] Title detection failed for item ${contentItem.ID}: ${msg}`);
|
|
1487
|
+
}
|
|
1488
|
+
}
|
|
1489
|
+
/**
|
|
1490
|
+
* Runs all non-vector deduplication checks (checksum and title) for a content item.
|
|
1491
|
+
* This is a convenience method intended to be called after saving/updating a content item.
|
|
1492
|
+
*
|
|
1493
|
+
* @param contentItem - The saved content item to check for duplicates.
|
|
1494
|
+
* @param contextUser - The authenticated user context for data access and audit.
|
|
1495
|
+
*/
|
|
1496
|
+
async DetectDuplicates(contentItem, contextUser) {
|
|
1497
|
+
await Promise.all([
|
|
1498
|
+
this.DetectChecksumDuplicates(contentItem, contextUser),
|
|
1499
|
+
this.DetectTitleDuplicates(contentItem, contextUser),
|
|
1500
|
+
]);
|
|
1501
|
+
}
|
|
1502
|
+
/**
|
|
1503
|
+
* Detects near-duplicate content items by querying the vector index for items
|
|
1504
|
+
* with high cosine similarity (> 0.95 threshold). Only creates duplicate records
|
|
1505
|
+
* for matches from DIFFERENT content sources to avoid self-matches.
|
|
1506
|
+
*
|
|
1507
|
+
* This is expensive so it only checks the top 3 most similar results.
|
|
1508
|
+
* Controlled by the `enableVectorDedup` flag.
|
|
1509
|
+
*
|
|
1510
|
+
* @param contentItem - The content item to check (must have text and be vectorized).
|
|
1511
|
+
* @param contextUser - The authenticated user context for data access and audit.
|
|
1512
|
+
* @param enableVectorDedup - Whether to run vector-based dedup (default false).
|
|
1513
|
+
*/
|
|
1514
|
+
async DetectVectorDuplicates(contentItem, contextUser, enableVectorDedup = false) {
|
|
1515
|
+
if (!enableVectorDedup)
|
|
1516
|
+
return;
|
|
1517
|
+
if (!contentItem.Text || contentItem.Text.trim().length === 0)
|
|
1518
|
+
return;
|
|
1519
|
+
try {
|
|
1520
|
+
await this.performVectorDedupCheck(contentItem, contextUser);
|
|
1521
|
+
}
|
|
1522
|
+
catch (e) {
|
|
1523
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1524
|
+
LogError(`[Dedup] Vector detection failed for item ${contentItem.ID}: ${msg}`);
|
|
1525
|
+
}
|
|
1526
|
+
}
|
|
1527
|
+
/**
|
|
1528
|
+
* Internal implementation of vector-based dedup. Resolves the vector infrastructure
|
|
1529
|
+
* for the item, embeds its text, queries for similar vectors, and creates duplicate
|
|
1530
|
+
* records for high-similarity matches from different sources.
|
|
1531
|
+
*/
|
|
1532
|
+
async performVectorDedupCheck(contentItem, contextUser) {
|
|
1533
|
+
// Need AIEngine loaded to resolve embedding model
|
|
1534
|
+
await AIEngine.Instance.Config(false, contextUser);
|
|
1535
|
+
// Load the content source + type maps for this single item
|
|
1536
|
+
const { sourceMap, typeMap } = await this.loadContentSourceAndTypeMaps([contentItem], contextUser);
|
|
1537
|
+
// Resolve infrastructure for this item
|
|
1538
|
+
const groups = this.groupItemsByInfrastructure([contentItem], sourceMap, typeMap);
|
|
1539
|
+
if (groups.size === 0) {
|
|
1540
|
+
LogStatus(`[Dedup] No vector infrastructure found for item ${contentItem.ID}, skipping vector dedup`);
|
|
1541
|
+
return;
|
|
1542
|
+
}
|
|
1543
|
+
const [groupKey] = groups.entries().next().value;
|
|
1544
|
+
const infra = await this.resolveGroupInfrastructure(groupKey, contextUser);
|
|
1545
|
+
// Embed the item's text
|
|
1546
|
+
const text = contentItem.Text.trim();
|
|
1547
|
+
const truncated = text.length > 8000 ? text.substring(0, 8000) : text;
|
|
1548
|
+
await this.EmbeddingRateLimiter.Acquire(Math.ceil(truncated.length / 4));
|
|
1549
|
+
const modelRunner = new AIModelRunner();
|
|
1550
|
+
const embeddingPromptID = this.resolveEmbeddingPromptID();
|
|
1551
|
+
const runResult = await modelRunner.RunEmbedding({
|
|
1552
|
+
ModelID: infra.embeddingModelID,
|
|
1553
|
+
Texts: [truncated],
|
|
1554
|
+
PromptID: embeddingPromptID ?? undefined,
|
|
1555
|
+
ContextUser: contextUser,
|
|
1556
|
+
});
|
|
1557
|
+
if (!runResult?.Vectors || runResult.Vectors.length === 0) {
|
|
1558
|
+
LogStatus(`[Dedup] Embedding failed for item ${contentItem.ID}, skipping vector dedup`);
|
|
1559
|
+
return;
|
|
1560
|
+
}
|
|
1561
|
+
const queryVector = runResult.Vectors[0];
|
|
1562
|
+
// Query vector DB for top 4 most similar (top 3 useful + 1 for self-match)
|
|
1563
|
+
const queryResponse = await infra.vectorDB.QueryIndex({
|
|
1564
|
+
vector: queryVector,
|
|
1565
|
+
topK: 4,
|
|
1566
|
+
includeMetadata: true,
|
|
1567
|
+
});
|
|
1568
|
+
const responseData = queryResponse;
|
|
1569
|
+
if (!responseData.success || !responseData.data)
|
|
1570
|
+
return;
|
|
1571
|
+
// The data property contains matches (QueryResponse shape)
|
|
1572
|
+
const matches = responseData.data.matches;
|
|
1573
|
+
if (!matches || matches.length === 0)
|
|
1574
|
+
return;
|
|
1575
|
+
// Filter: different source, similarity > 0.95, not self
|
|
1576
|
+
const VECTOR_DEDUP_THRESHOLD = 0.95;
|
|
1577
|
+
let matchCount = 0;
|
|
1578
|
+
for (const match of matches) {
|
|
1579
|
+
if (matchCount >= 3)
|
|
1580
|
+
break; // Only check top 3
|
|
1581
|
+
const matchScore = match.score ?? 0;
|
|
1582
|
+
if (matchScore < VECTOR_DEDUP_THRESHOLD)
|
|
1583
|
+
continue;
|
|
1584
|
+
// Extract content item ID from vector metadata
|
|
1585
|
+
const matchItemID = match.metadata?.['contentItemID'];
|
|
1586
|
+
if (!matchItemID || UUIDsEqual(matchItemID, contentItem.ID))
|
|
1587
|
+
continue;
|
|
1588
|
+
// Check if the match is from a different source
|
|
1589
|
+
const isDifferentSource = await this.isFromDifferentSource(matchItemID, contentItem.ContentSourceID, contextUser);
|
|
1590
|
+
if (!isDifferentSource)
|
|
1591
|
+
continue;
|
|
1592
|
+
await this.createDuplicateRecordIfNotExists(contentItem.ID, matchItemID, matchScore, 'Vector', contextUser);
|
|
1593
|
+
matchCount++;
|
|
1594
|
+
}
|
|
1595
|
+
if (matchCount > 0) {
|
|
1596
|
+
LogStatus(`[Dedup] Vector dedup found ${matchCount} near-duplicate(s) for item ${contentItem.ID}`);
|
|
1597
|
+
}
|
|
1598
|
+
}
|
|
1599
|
+
/**
|
|
1600
|
+
* Check if a content item belongs to a different source than the given sourceID.
|
|
1601
|
+
*/
|
|
1602
|
+
async isFromDifferentSource(itemID, excludeSourceID, contextUser) {
|
|
1603
|
+
const rv = new RunView();
|
|
1604
|
+
const result = await rv.RunView({
|
|
1605
|
+
EntityName: 'MJ: Content Items',
|
|
1606
|
+
Fields: ['ContentSourceID'],
|
|
1607
|
+
ExtraFilter: `ID = '${itemID}'`,
|
|
1608
|
+
ResultType: 'simple',
|
|
1609
|
+
MaxRows: 1,
|
|
1610
|
+
}, contextUser);
|
|
1611
|
+
if (!result.Success || result.Results.length === 0)
|
|
1612
|
+
return false;
|
|
1613
|
+
return result.Results[0].ContentSourceID !== excludeSourceID;
|
|
1614
|
+
}
|
|
1615
|
+
/**
|
|
1616
|
+
* Resolves a duplicate record by updating its Status and Resolution fields.
|
|
1617
|
+
*
|
|
1618
|
+
* @param duplicateID - The ID of the ContentItemDuplicate record.
|
|
1619
|
+
* @param resolution - The resolution choice: 'KeepA', 'KeepB', 'NotDuplicate'.
|
|
1620
|
+
* @param contextUser - The authenticated user context.
|
|
1621
|
+
*/
|
|
1622
|
+
async ResolveContentDuplicate(duplicateID, resolution, contextUser) {
|
|
1623
|
+
try {
|
|
1624
|
+
const md = new Metadata();
|
|
1625
|
+
const duplicate = await md.GetEntityObject('MJ: Content Item Duplicates', contextUser);
|
|
1626
|
+
const loaded = await duplicate.Load(duplicateID);
|
|
1627
|
+
if (!loaded) {
|
|
1628
|
+
LogError(`[Dedup] Could not load duplicate record ${duplicateID} for resolution`);
|
|
1629
|
+
return false;
|
|
1630
|
+
}
|
|
1631
|
+
this.applyDuplicateResolution(duplicate, resolution);
|
|
1632
|
+
const saved = await duplicate.Save();
|
|
1633
|
+
if (!saved) {
|
|
1634
|
+
LogError(`[Dedup] Failed to save resolution for duplicate ${duplicateID}: ${duplicate.LatestResult?.Message ?? 'Unknown error'}`);
|
|
1635
|
+
return false;
|
|
1636
|
+
}
|
|
1637
|
+
LogStatus(`[Dedup] Resolved duplicate ${duplicateID}: ${resolution}`);
|
|
1638
|
+
return true;
|
|
1639
|
+
}
|
|
1640
|
+
catch (e) {
|
|
1641
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1642
|
+
LogError(`[Dedup] Error resolving duplicate ${duplicateID}: ${msg}`);
|
|
1643
|
+
return false;
|
|
1644
|
+
}
|
|
1645
|
+
}
|
|
1646
|
+
/**
|
|
1647
|
+
* Applies the resolution to a duplicate record by setting the Status and Resolution fields.
|
|
1648
|
+
*/
|
|
1649
|
+
applyDuplicateResolution(duplicate, resolution) {
|
|
1650
|
+
if (resolution === 'NotDuplicate') {
|
|
1651
|
+
duplicate.Status = 'Dismissed';
|
|
1652
|
+
duplicate.Resolution = 'NotDuplicate';
|
|
1653
|
+
}
|
|
1654
|
+
else {
|
|
1655
|
+
// KeepA or KeepB — mark as Merged
|
|
1656
|
+
duplicate.Status = 'Merged';
|
|
1657
|
+
duplicate.Resolution = resolution;
|
|
1658
|
+
}
|
|
1659
|
+
}
|
|
1660
|
+
/**
|
|
1661
|
+
* Finds content items with the same checksum from different content sources.
|
|
1662
|
+
*
|
|
1663
|
+
* @param checksum - The SHA-256 checksum to search for.
|
|
1664
|
+
* @param excludeSourceID - The content source ID to exclude (the item's own source).
|
|
1665
|
+
* @param excludeItemID - The content item ID to exclude (the item itself).
|
|
1666
|
+
* @param contextUser - The authenticated user context.
|
|
1667
|
+
* @returns An array of matching content items (simple objects with ID field).
|
|
1668
|
+
*/
|
|
1669
|
+
async findItemsByChecksum(checksum, excludeSourceID, excludeItemID, contextUser) {
|
|
1670
|
+
const rv = new RunView();
|
|
1671
|
+
const result = await rv.RunView({
|
|
1672
|
+
EntityName: 'MJ: Content Items',
|
|
1673
|
+
Fields: ['ID'],
|
|
1674
|
+
ExtraFilter: `Checksum = '${checksum.replace(/'/g, "''")}' AND ContentSourceID <> '${excludeSourceID}' AND ID <> '${excludeItemID}'`,
|
|
1675
|
+
ResultType: 'simple'
|
|
1676
|
+
}, contextUser);
|
|
1677
|
+
if (!result.Success) {
|
|
1678
|
+
LogError(`[Dedup] RunView failed for checksum lookup: ${result.ErrorMessage}`);
|
|
1679
|
+
return [];
|
|
1680
|
+
}
|
|
1681
|
+
return result.Results;
|
|
1682
|
+
}
|
|
1683
|
+
/**
|
|
1684
|
+
* Finds content items with the same title (Name) from different content sources.
|
|
1685
|
+
*
|
|
1686
|
+
* @param title - The title to search for (exact match).
|
|
1687
|
+
* @param excludeSourceID - The content source ID to exclude (the item's own source).
|
|
1688
|
+
* @param excludeItemID - The content item ID to exclude (the item itself).
|
|
1689
|
+
* @param contextUser - The authenticated user context.
|
|
1690
|
+
* @returns An array of matching content items (simple objects with ID field).
|
|
1691
|
+
*/
|
|
1692
|
+
async findItemsByTitle(title, excludeSourceID, excludeItemID, contextUser) {
|
|
1693
|
+
const rv = new RunView();
|
|
1694
|
+
const escapedTitle = title.replace(/'/g, "''");
|
|
1695
|
+
const result = await rv.RunView({
|
|
1696
|
+
EntityName: 'MJ: Content Items',
|
|
1697
|
+
Fields: ['ID'],
|
|
1698
|
+
ExtraFilter: `Name = '${escapedTitle}' AND ContentSourceID <> '${excludeSourceID}' AND ID <> '${excludeItemID}'`,
|
|
1699
|
+
ResultType: 'simple'
|
|
1700
|
+
}, contextUser);
|
|
1701
|
+
if (!result.Success) {
|
|
1702
|
+
LogError(`[Dedup] RunView failed for title lookup: ${result.ErrorMessage}`);
|
|
1703
|
+
return [];
|
|
1704
|
+
}
|
|
1705
|
+
return result.Results;
|
|
1706
|
+
}
|
|
1707
|
+
/**
|
|
1708
|
+
* Creates a {@link MJContentItemDuplicateEntity} record for a detected duplicate pair,
|
|
1709
|
+
* but only if one does not already exist for the same pair and detection method.
|
|
1710
|
+
*
|
|
1711
|
+
* IDs are stored in canonical order: the lexicographically smaller ID is always
|
|
1712
|
+
* ContentItemAID to prevent mirror duplicates (A,B) vs (B,A).
|
|
1713
|
+
*
|
|
1714
|
+
* @param itemAID - One of the duplicate item IDs.
|
|
1715
|
+
* @param itemBID - The other duplicate item ID.
|
|
1716
|
+
* @param similarityScore - The similarity score (0.0 to 1.0).
|
|
1717
|
+
* @param detectionMethod - How the duplicate was detected.
|
|
1718
|
+
* @param contextUser - The authenticated user context.
|
|
1719
|
+
*/
|
|
1720
|
+
async createDuplicateRecordIfNotExists(itemAID, itemBID, similarityScore, detectionMethod, contextUser) {
|
|
1721
|
+
// Canonical ordering: lower normalized ID = A
|
|
1722
|
+
const normalizedA = NormalizeUUID(itemAID);
|
|
1723
|
+
const normalizedB = NormalizeUUID(itemBID);
|
|
1724
|
+
const [canonicalAID, canonicalBID] = normalizedA < normalizedB
|
|
1725
|
+
? [itemAID, itemBID]
|
|
1726
|
+
: [itemBID, itemAID];
|
|
1727
|
+
// Check if this pair already exists for the same detection method
|
|
1728
|
+
const exists = await this.duplicatePairExists(canonicalAID, canonicalBID, detectionMethod, contextUser);
|
|
1729
|
+
if (exists) {
|
|
1730
|
+
return;
|
|
1731
|
+
}
|
|
1732
|
+
const md = new Metadata();
|
|
1733
|
+
const duplicate = await md.GetEntityObject('MJ: Content Item Duplicates', contextUser);
|
|
1734
|
+
duplicate.NewRecord();
|
|
1735
|
+
duplicate.ContentItemAID = canonicalAID;
|
|
1736
|
+
duplicate.ContentItemBID = canonicalBID;
|
|
1737
|
+
duplicate.SimilarityScore = similarityScore;
|
|
1738
|
+
duplicate.DetectionMethod = detectionMethod;
|
|
1739
|
+
duplicate.Status = 'Pending';
|
|
1740
|
+
const saved = await duplicate.Save();
|
|
1741
|
+
if (!saved) {
|
|
1742
|
+
LogError(`[Dedup] Failed to save duplicate record for pair (${canonicalAID}, ${canonicalBID}) method=${detectionMethod}`);
|
|
1743
|
+
}
|
|
1744
|
+
else {
|
|
1745
|
+
LogStatus(`[Dedup] Detected ${detectionMethod} duplicate: (${canonicalAID}, ${canonicalBID}) score=${similarityScore}`);
|
|
1746
|
+
}
|
|
1747
|
+
}
|
|
1748
|
+
/**
|
|
1749
|
+
* Checks whether a duplicate record already exists for the given pair and detection method.
|
|
1750
|
+
*
|
|
1751
|
+
* @param canonicalAID - The canonical (ordered) ContentItemAID.
|
|
1752
|
+
* @param canonicalBID - The canonical (ordered) ContentItemBID.
|
|
1753
|
+
* @param detectionMethod - The detection method to check.
|
|
1754
|
+
* @param contextUser - The authenticated user context.
|
|
1755
|
+
* @returns True if a record already exists.
|
|
1756
|
+
*/
|
|
1757
|
+
async duplicatePairExists(canonicalAID, canonicalBID, detectionMethod, contextUser) {
|
|
1758
|
+
const rv = new RunView();
|
|
1759
|
+
const result = await rv.RunView({
|
|
1760
|
+
EntityName: 'MJ: Content Item Duplicates',
|
|
1761
|
+
Fields: ['ID'],
|
|
1762
|
+
ExtraFilter: `ContentItemAID = '${canonicalAID}' AND ContentItemBID = '${canonicalBID}' AND DetectionMethod = '${detectionMethod}'`,
|
|
1763
|
+
ResultType: 'simple'
|
|
1764
|
+
}, contextUser);
|
|
1765
|
+
return result.Success && result.Results.length > 0;
|
|
1766
|
+
}
|
|
533
1767
|
};
// Apply the RegisterClass decorator. AutotagBaseEngine_1 aliases the decorated
// class so static members (e.g. MAX_EMBEDDING_TOKENS) resolve to the final
// (decorated) binding inside method bodies — this is the TypeScript compiler's
// standard pattern for decorated classes with self-references.
AutotagBaseEngine = AutotagBaseEngine_1 = __decorate([
    RegisterClass(BaseEngine, 'AutotagBaseEngine')
], AutotagBaseEngine);
export { AutotagBaseEngine };
|