@memberjunction/content-autotagging 5.23.0 → 5.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/CloudStorage/generic/CloudStorageBase.js +1 -1
  2. package/dist/CloudStorage/generic/CloudStorageBase.js.map +1 -1
  3. package/dist/CloudStorage/index.d.ts +5 -0
  4. package/dist/CloudStorage/index.d.ts.map +1 -1
  5. package/dist/CloudStorage/index.js +5 -0
  6. package/dist/CloudStorage/index.js.map +1 -1
  7. package/dist/CloudStorage/providers/AutotagCloudStorage.d.ts +61 -0
  8. package/dist/CloudStorage/providers/AutotagCloudStorage.d.ts.map +1 -0
  9. package/dist/CloudStorage/providers/AutotagCloudStorage.js +256 -0
  10. package/dist/CloudStorage/providers/AutotagCloudStorage.js.map +1 -0
  11. package/dist/Core/generic/AutotagBase.d.ts +7 -1
  12. package/dist/Core/generic/AutotagBase.d.ts.map +1 -1
  13. package/dist/Core/generic/AutotagBase.js.map +1 -1
  14. package/dist/Engine/generic/AutotagBaseEngine.d.ts +318 -18
  15. package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
  16. package/dist/Engine/generic/AutotagBaseEngine.js +1024 -176
  17. package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
  18. package/dist/Engine/generic/RateLimiter.d.ts +49 -0
  19. package/dist/Engine/generic/RateLimiter.d.ts.map +1 -0
  20. package/dist/Engine/generic/RateLimiter.js +98 -0
  21. package/dist/Engine/generic/RateLimiter.js.map +1 -0
  22. package/dist/Engine/index.d.ts +1 -0
  23. package/dist/Engine/index.d.ts.map +1 -1
  24. package/dist/Engine/index.js +1 -0
  25. package/dist/Engine/index.js.map +1 -1
  26. package/dist/Entity/generic/AutotagEntity.d.ts +63 -14
  27. package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
  28. package/dist/Entity/generic/AutotagEntity.js +362 -83
  29. package/dist/Entity/generic/AutotagEntity.js.map +1 -1
  30. package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js +1 -1
  31. package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js.map +1 -1
  32. package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts +47 -16
  33. package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts.map +1 -1
  34. package/dist/RSSFeed/generic/AutotagRSSFeed.js +238 -120
  35. package/dist/RSSFeed/generic/AutotagRSSFeed.js.map +1 -1
  36. package/dist/Websites/generic/AutotagWebsite.js +1 -1
  37. package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
  38. package/dist/index.d.ts +1 -0
  39. package/dist/index.d.ts.map +1 -1
  40. package/dist/index.js +1 -0
  41. package/dist/index.js.map +1 -1
  42. package/package.json +16 -11
@@ -4,9 +4,11 @@ var __decorate = (this && this.__decorate) || function (decorators, target, key,
4
4
  else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
5
5
  return c > 3 && r && Object.defineProperty(target, key, r), r;
6
6
  };
7
+ var AutotagBaseEngine_1;
7
8
  import { BaseEngine, Metadata, RunView, LogError, LogStatus } from '@memberjunction/core';
8
9
  import { MJGlobal, UUIDsEqual, NormalizeUUID, RegisterClass } from '@memberjunction/global';
9
10
  import { ContentSourceTypeParams } from './content.types.js';
11
+ import { RateLimiter } from './RateLimiter.js';
10
12
  import pdfParse from 'pdf-parse';
11
13
  import officeparser from 'officeparser';
12
14
  import * as fs from 'fs';
@@ -17,10 +19,12 @@ import * as cheerio from 'cheerio';
17
19
  import crypto from 'crypto';
18
20
  import { BaseEmbeddings, GetAIAPIKey } from '@memberjunction/ai';
19
21
  import { AIEngine } from '@memberjunction/aiengine';
20
- import { AIPromptRunner } from '@memberjunction/ai-prompts';
22
+ import { AIPromptRunner, AIModelRunner } from '@memberjunction/ai-prompts';
21
23
  import { AIPromptParams } from '@memberjunction/ai-core-plus';
22
24
  import { TextChunker } from '@memberjunction/ai-vectors';
23
25
  import { VectorDBBase } from '@memberjunction/ai-vectordb';
26
+ import { TagEngine } from '@memberjunction/tag-engine';
27
+ import { KnowledgeHubMetadataEngine } from '@memberjunction/core-entities';
24
28
  /** Default batch size for vectorization processing */
25
29
  const DEFAULT_VECTORIZE_BATCH_SIZE = 20;
26
30
  /**
@@ -31,43 +35,58 @@ const DEFAULT_VECTORIZE_BATCH_SIZE = 20;
31
35
  let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
32
36
  constructor() {
33
37
  super(...arguments);
34
- // Cached metadata — loaded by BaseEngine.Config() via property configs
35
- this._ContentTypes = [];
36
- this._ContentSourceTypes = [];
37
- this._ContentFileTypes = [];
38
+ // Cached metadata unique to this engine — loaded by BaseEngine.Config()
38
39
  this._ContentTypeAttributes = [];
39
40
  this._ContentSourceTypeParams = [];
41
+ /**
42
+ * Optional taxonomy JSON string to inject into the autotagging prompt.
43
+ * Set by the caller (e.g., AutotagEntity) before calling ExtractTextAndProcessWithLLM.
44
+ * When set, the prompt template receives an `existingTaxonomy` variable containing
45
+ * the JSON tree of existing tags so the LLM can prefer existing tags.
46
+ */
47
+ this.TaxonomyContext = null;
48
+ /**
49
+ * When true, skip checksum comparison and reprocess all content items
50
+ * even if their content hasn't changed. Useful when changing embedding models,
51
+ * LLM models, or vector databases.
52
+ */
53
+ this.ForceReprocess = false;
54
+ /** Rate limiter for LLM (tagging) API calls */
55
+ this.LLMRateLimiter = new RateLimiter({ RequestsPerMinute: 60, TokensPerMinute: 100000, Name: 'LLM' });
56
+ /** Rate limiter for embedding API calls */
57
+ this.EmbeddingRateLimiter = new RateLimiter({ RequestsPerMinute: 300, TokensPerMinute: 500000, Name: 'Embedding' });
58
+ /** Rate limiter for vector DB API calls */
59
+ this.VectorDBRateLimiter = new RateLimiter({ RequestsPerMinute: 200, Name: 'VectorDB' });
60
+ /**
61
+ * Optional callback invoked after each ContentItemTag is saved, enabling the
62
+ * tag taxonomy bridge (ContentItemTag → Tag + TaggedItem). Set by providers
63
+ * like AutotagEntity that want to link free-text tags to formal taxonomy entries.
64
+ *
65
+ * Parameters: (contentItemTag: MJContentItemTagEntity, parentTag: string | null, contextUser: UserInfo)
66
+ */
67
+ this.OnContentItemTagSaved = null;
40
68
  }
69
+ static { AutotagBaseEngine_1 = this; }
41
70
  static get Instance() {
42
71
  return super.getInstance();
43
72
  }
44
- /** All content types, cached at startup */
45
- get ContentTypes() { return this._ContentTypes; }
46
- /** All content source types, cached at startup */
47
- get ContentSourceTypes() { return this._ContentSourceTypes; }
48
- /** All content file types, cached at startup */
49
- get ContentFileTypes() { return this._ContentFileTypes; }
73
+ /** Shortcut to KnowledgeHubMetadataEngine */
74
+ get khEngine() { return KnowledgeHubMetadataEngine.Instance; }
75
+ /** All content types delegated to KnowledgeHubMetadataEngine */
76
+ get ContentTypes() { return this.khEngine.ContentTypes; }
77
+ /** All content source types delegated to KnowledgeHubMetadataEngine */
78
+ get ContentSourceTypes() { return this.khEngine.ContentSourceTypes; }
79
+ /** All content file types — delegated to KnowledgeHubMetadataEngine */
80
+ get ContentFileTypes() { return this.khEngine.ContentFileTypes; }
50
81
  /** All content type attributes, cached at startup */
51
82
  get ContentTypeAttributes() { return this._ContentTypeAttributes; }
52
83
  /** All content source type params, cached at startup */
53
84
  get ContentSourceTypeParams() { return this._ContentSourceTypeParams; }
54
85
  async Config(forceRefresh, contextUser, provider) {
86
+ // Content Types, Content Source Types, and Content File Types are delegated to
87
+ // KnowledgeHubMetadataEngine (avoid redundant loading). Only cache entities unique to this engine.
88
+ await KnowledgeHubMetadataEngine.Instance.Config(forceRefresh, contextUser);
55
89
  const configs = [
56
- {
57
- Type: 'entity',
58
- EntityName: 'MJ: Content Types',
59
- PropertyName: '_ContentTypes',
60
- },
61
- {
62
- Type: 'entity',
63
- EntityName: 'MJ: Content Source Types',
64
- PropertyName: '_ContentSourceTypes',
65
- },
66
- {
67
- Type: 'entity',
68
- EntityName: 'MJ: Content File Types',
69
- PropertyName: '_ContentFileTypes',
70
- },
71
90
  {
72
91
  Type: 'entity',
73
92
  EntityName: 'MJ: Content Type Attributes',
@@ -86,45 +105,96 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
86
105
  * Given a list of content items, extract the text from each and process with LLM for tagging.
87
106
  * Items are processed in configurable batches with controlled concurrency within each batch.
88
107
  */
89
- async ExtractTextAndProcessWithLLM(contentItems, contextUser, batchSize = DEFAULT_VECTORIZE_BATCH_SIZE, onProgress) {
108
+ /**
109
+ * Process content items through the LLM tagging pipeline with production-grade
110
+ * batch management: cursor-based resume, pause/cancel support, rate limiting,
111
+ * and circuit breaker. Each batch checkpoints progress so interrupted runs
112
+ * can be resumed from where they left off.
113
+ *
114
+ * @param contentItems - items to process
115
+ * @param contextUser - current user for permissions/audit
116
+ * @param processRun - optional ContentProcessRun entity for checkpoint tracking
117
+ * @param config - optional pipeline configuration for rate limits, thresholds
118
+ * @param onProgress - optional callback for UI progress updates
119
+ */
120
+ async ExtractTextAndProcessWithLLM(contentItems, contextUser, processRun, config, onProgress) {
90
121
  if (!contentItems || contentItems.length === 0) {
91
- LogStatus('No content items to process');
122
+ LogStatus('[Autotag] No content items to process');
92
123
  return;
93
124
  }
94
- const processRunParams = new ProcessRunParams();
95
- processRunParams.sourceID = contentItems[0].ContentSourceID;
96
- processRunParams.startTime = new Date();
97
- processRunParams.numItemsProcessed = contentItems.length;
98
- let totalProcessed = 0;
99
- LogStatus(`ExtractTextAndProcessWithLLM: processing ${contentItems.length} items in batches of ${batchSize}`);
100
- let batchSuccesses = 0;
101
- let batchFailures = 0;
102
- for (let i = 0; i < contentItems.length; i += batchSize) {
103
- const batch = contentItems.slice(i, i + batchSize);
125
+ const batchSize = config?.Pipeline?.BatchSize ?? DEFAULT_VECTORIZE_BATCH_SIZE;
126
+ const errorThreshold = config?.Pipeline?.ErrorThresholdPercent ?? 20;
127
+ const delayMs = config?.Pipeline?.DelayBetweenBatchesMs ?? 0;
128
+ // Resume from cursor if available
129
+ const resumeOffset = processRun?.LastProcessedOffset ?? 0;
130
+ const itemsToProcess = resumeOffset > 0
131
+ ? contentItems.slice(resumeOffset)
132
+ : contentItems;
133
+ if (resumeOffset > 0) {
134
+ LogStatus(`[Autotag] Resuming from offset ${resumeOffset} (${itemsToProcess.length} remaining of ${contentItems.length})`);
135
+ }
136
+ LogStatus(`[Autotag] Processing ${itemsToProcess.length} items in batches of ${batchSize}`);
137
+ let totalSuccesses = 0;
138
+ let totalFailures = 0;
139
+ let totalProcessed = resumeOffset;
140
+ for (let i = 0; i < itemsToProcess.length; i += batchSize) {
141
+ const batch = itemsToProcess.slice(i, i + batchSize);
104
142
  const batchNum = Math.floor(i / batchSize) + 1;
105
- let batchOk = 0, batchFail = 0;
143
+ let batchOk = 0;
144
+ let batchFail = 0;
145
+ // Rate limit before each batch of parallel LLM calls
146
+ await this.LLMRateLimiter.Acquire();
106
147
  const batchPromises = batch.map(async (contentItem) => {
107
148
  try {
108
149
  const processingParams = await this.buildProcessingParams(contentItem, contextUser);
109
150
  await this.ProcessContentItemText(processingParams, contextUser);
110
- totalProcessed++;
111
151
  batchOk++;
112
- onProgress?.(totalProcessed, contentItems.length, contentItem.Name);
113
152
  }
114
153
  catch (e) {
115
- LogError(`Failed to process content item: ${contentItem.ID}`, undefined, e);
116
- totalProcessed++;
154
+ LogError(`[Autotag] Failed to process item ${contentItem.ID}: ${e instanceof Error ? e.message : String(e)}`);
117
155
  batchFail++;
118
- onProgress?.(totalProcessed, contentItems.length);
119
156
  }
120
157
  });
121
158
  await Promise.all(batchPromises);
122
- batchSuccesses += batchOk;
123
- batchFailures += batchFail;
124
- LogStatus(`Batch ${batchNum}: ${batchOk} succeeded, ${batchFail} failed (${totalProcessed}/${contentItems.length} total)`);
159
+ totalSuccesses += batchOk;
160
+ totalFailures += batchFail;
161
+ totalProcessed += batch.length;
162
+ onProgress?.(totalProcessed, contentItems.length);
163
+ LogStatus(`[Autotag] Batch ${batchNum}: ${batchOk}/${batch.length} ok (${totalProcessed}/${contentItems.length} total, ${totalFailures} errors)`);
164
+ // Checkpoint: update cursor and check for cancellation
165
+ if (processRun) {
166
+ const shouldContinue = await this.UpdateBatchCursor(processRun, totalProcessed, totalFailures);
167
+ if (!shouldContinue) {
168
+ LogStatus(`[Autotag] Pipeline paused/cancelled at offset ${totalProcessed}`);
169
+ return;
170
+ }
171
+ }
172
+ // Circuit breaker: halt if error rate exceeds threshold
173
+ if (totalProcessed > 0 && totalFailures > 0) {
174
+ const errorRate = (totalFailures / totalProcessed) * 100;
175
+ if (errorRate > errorThreshold) {
176
+ LogError(`[Autotag] Circuit breaker triggered: error rate ${errorRate.toFixed(1)}% exceeds threshold ${errorThreshold}%`);
177
+ if (processRun) {
178
+ processRun.ErrorMessage = `Auto-paused: error rate ${errorRate.toFixed(1)}% exceeded ${errorThreshold}% threshold`;
179
+ await this.CompleteBatchedProcessRun(processRun, 'Failed', processRun.ErrorMessage);
180
+ }
181
+ return;
182
+ }
183
+ }
184
+ // Optional delay between batches (throttling)
185
+ if (delayMs > 0 && i + batchSize < itemsToProcess.length) {
186
+ await new Promise(resolve => setTimeout(resolve, delayMs));
187
+ }
125
188
  }
126
- LogStatus(`ExtractTextAndProcessWithLLM complete: ${batchSuccesses} succeeded, ${batchFailures} failed out of ${contentItems.length}`);
189
+ LogStatus(`[Autotag] LLM tagging complete: ${totalSuccesses} succeeded, ${totalFailures} failed of ${contentItems.length}`);
190
+ // Post-pipeline hook: recompute tag co-occurrence if TagCoOccurrenceEngine is available
191
+ await this.recomputeCoOccurrenceIfAvailable(contextUser);
192
+ // Legacy process run tracking (for backward compatibility)
193
+ const processRunParams = new ProcessRunParams();
194
+ processRunParams.sourceID = contentItems[0].ContentSourceID;
195
+ processRunParams.startTime = processRun?.StartTime ?? new Date();
127
196
  processRunParams.endTime = new Date();
197
+ processRunParams.numItemsProcessed = contentItems.length;
128
198
  await this.saveProcessRun(processRunParams, contextUser);
129
199
  }
130
200
  /**
@@ -141,14 +211,58 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
141
211
  processingParams.minTags = minTags;
142
212
  processingParams.maxTags = maxTags;
143
213
  processingParams.contentItemID = contentItem.ID;
214
+ LogStatus(`[Autotag] Built params for "${contentItem.Name}" — text length: ${processingParams.text?.length ?? 0}, modelID: ${modelID || 'default'}, tags: ${minTags}-${maxTags}`);
144
215
  return processingParams;
145
216
  }
146
217
  /**
147
218
  * Process a content item's text with the LLM and save results.
148
219
  */
149
220
  async ProcessContentItemText(params, contextUser) {
150
- const LLMResults = await this.promptAndRetrieveResultsFromLLM(params, contextUser);
151
- await this.saveLLMResults(LLMResults, contextUser);
221
+ // A8: Update tagging status to Processing
222
+ await this.updateContentItemTaggingStatus(params.contentItemID, 'Processing', contextUser);
223
+ try {
224
+ const LLMResults = await this.promptAndRetrieveResultsFromLLM(params, contextUser);
225
+ await this.saveLLMResults(LLMResults, contextUser);
226
+ // A8: Update tagging status to Complete
227
+ await this.updateContentItemTaggingStatus(params.contentItemID, 'Complete', contextUser);
228
+ }
229
+ catch (e) {
230
+ await this.updateContentItemTaggingStatus(params.contentItemID, 'Failed', contextUser);
231
+ throw e;
232
+ }
233
+ }
234
+ /** Update embedding status for a batch of content items */
235
+ async updateEmbeddingStatusBatch(items, status, contextUser, embeddingModelID) {
236
+ for (const item of items) {
237
+ try {
238
+ item.EmbeddingStatus = status;
239
+ if (status === 'Complete') {
240
+ item.LastEmbeddedAt = new Date();
241
+ if (embeddingModelID)
242
+ item.EmbeddingModelID = embeddingModelID;
243
+ }
244
+ await item.Save();
245
+ }
246
+ catch {
247
+ // Non-critical
248
+ }
249
+ }
250
+ }
251
+ /** Update a content item's TaggingStatus and LastTaggedAt */
252
+ async updateContentItemTaggingStatus(contentItemID, status, contextUser) {
253
+ try {
254
+ const md = new Metadata();
255
+ const item = await md.GetEntityObject('MJ: Content Items', contextUser);
256
+ await item.Load(contentItemID);
257
+ item.TaggingStatus = status;
258
+ if (status === 'Complete') {
259
+ item.LastTaggedAt = new Date();
260
+ }
261
+ await item.Save();
262
+ }
263
+ catch {
264
+ // Non-critical — don't fail the pipeline for a status update
265
+ }
152
266
  }
153
267
  /**
154
268
  * Resolves the "Content Autotagging" prompt from the AIEngine cache.
@@ -164,20 +278,194 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
164
278
  }
165
279
  return prompt;
166
280
  }
281
+ /**
282
+ * Initialize the taxonomy bridge so ALL content source types (RSS, Entity, Website, etc.)
283
+ * automatically create formal Tag + TaggedItem records from LLM-generated ContentItemTags.
284
+ *
285
+ * This sets up:
286
+ * 1. TagEngine with semantic embeddings for tag matching
287
+ * 2. TaxonomyContext for prompt injection (tells LLM about existing tags)
288
+ * 3. OnContentItemTagSaved callback that bridges ContentItemTag → Tag + TaggedItem
289
+ *
290
+ * Call this ONCE before running any providers. The bridge stays active until
291
+ * CleanupTaxonomyBridge() is called.
292
+ */
293
+ async InitializeTaxonomyBridge(contextUser) {
294
+ try {
295
+ // TagEngine internally ensures AIEngine is loaded before building embeddings.
296
+ await TagEngine.Instance.Config(false, contextUser);
297
+ LogStatus(`[TaxonomyBridge] TagEngine initialized with ${TagEngine.Instance.Tags.length} existing tags`);
298
+ // Inject taxonomy into prompt context as markdown hierarchy
299
+ // Format: "# RootTag\n## ChildTag\n### GrandChild"
300
+ // LLM returns paths like "RootTag / ChildTag / GrandChild" for unambiguous matching
301
+ if (TagEngine.Instance.Tags.length > 0) {
302
+ const tree = TagEngine.Instance.GetTaxonomyTree();
303
+ this.TaxonomyContext = this.buildTaxonomyMarkdown(tree);
304
+ LogStatus(`[TaxonomyBridge] Taxonomy context injected as markdown (${tree.length} root nodes, ${TagEngine.Instance.Tags.length} total tags)`);
305
+ }
306
+ // Set up the bridge callback — fires after each ContentItemTag is saved
307
+ this.OnContentItemTagSaved = async (contentItemTag, parentTagName, ctxUser) => {
308
+ await this.BridgeContentItemTagToTaxonomy(contentItemTag, parentTagName, ctxUser);
309
+ };
310
+ LogStatus(`[TaxonomyBridge] Bridge callback installed`);
311
+ }
312
+ catch (e) {
313
+ const msg = e instanceof Error ? e.message : String(e);
314
+ LogError(`[TaxonomyBridge] Initialization failed, taxonomy features disabled: ${msg}`);
315
+ }
316
+ }
317
+ /**
318
+ * Clean up the taxonomy bridge after all providers have finished.
319
+ */
320
+ CleanupTaxonomyBridge() {
321
+ this.TaxonomyContext = null;
322
+ this.OnContentItemTagSaved = null;
323
+ }
324
+ /**
325
+ * Build a markdown-formatted taxonomy for LLM prompt injection.
326
+ * Uses heading levels for hierarchy depth (# for root, ## for child, etc.)
327
+ * so the LLM can return tag paths like "Root / Child / Grandchild".
328
+ */
329
+ buildTaxonomyMarkdown(tree) {
330
+ const lines = [];
331
+ const renderNode = (node, depth, path) => {
332
+ const prefix = '#'.repeat(Math.min(depth + 1, 6)); // Max 6 heading levels
333
+ const fullPath = path ? `${path} / ${node.Name}` : node.Name;
334
+ lines.push(`${prefix} ${node.Name}`);
335
+ if (node.Children && node.Children.length > 0) {
336
+ for (const child of node.Children) {
337
+ renderNode(child, depth + 1, fullPath);
338
+ }
339
+ }
340
+ };
341
+ for (const root of tree) {
342
+ renderNode(root, 0, '');
343
+ }
344
+ return lines.join('\n');
345
+ }
346
+ /**
347
+ * Bridge a ContentItemTag to the formal MJ Tag taxonomy.
348
+ * Uses TagEngine.ResolveTag() in auto-grow mode by default.
349
+ *
350
+ * After resolving/creating the formal Tag, also creates a TaggedItem record:
351
+ * - For Entity sources: tags the original entity record (e.g., Products row)
352
+ * - For non-Entity sources (RSS, Website, etc.): tags the ContentItem itself
353
+ */
354
+ async BridgeContentItemTagToTaxonomy(contentItemTag, parentTagName, contextUser) {
355
+ try {
356
+ // If parent tag is suggested by LLM, resolve it through the mutex too
357
+ // to prevent duplicate parent tags from concurrent batch processing
358
+ if (parentTagName) {
359
+ await TagEngine.Instance.ResolveTag(parentTagName, 0, 'auto-grow', null, 0.80, contextUser);
360
+ }
361
+ // Resolve the tag using auto-grow mode (create if no match)
362
+ const formalTag = await TagEngine.Instance.ResolveTag(contentItemTag.Tag, contentItemTag.Weight, 'auto-grow', null, // no root constraint
363
+ 0.80, // similarity threshold — lower to catch plurals/variants like "AI Agent" vs "AI Agents"
364
+ contextUser);
365
+ if (formalTag) {
366
+ // Link ContentItemTag to formal Tag
367
+ contentItemTag.TagID = formalTag.ID;
368
+ await contentItemTag.Save();
369
+ // Create TaggedItem linking the formal Tag to the appropriate entity record
370
+ await this.createTaggedItemFromContentItemTag(contentItemTag, formalTag.ID, contextUser);
371
+ }
372
+ }
373
+ catch (e) {
374
+ const msg = e instanceof Error ? e.message : String(e);
375
+ LogError(`[TaxonomyBridge] Failed for tag "${contentItemTag.Tag}": ${msg}`);
376
+ }
377
+ }
378
+ /**
379
+ * Creates a TaggedItem record linking a formal Tag to the appropriate entity record.
380
+ *
381
+ * For Entity-sourced content items: resolves the EntityRecordDocument to find the
382
+ * original entity (e.g., Products) and record ID, then tags that entity record.
383
+ *
384
+ * For non-Entity-sourced content items (RSS, Website, Cloud Storage): tags the
385
+ * ContentItem itself (EntityID = "MJ: Content Items" entity, RecordID = content item ID).
386
+ */
387
+ async createTaggedItemFromContentItemTag(contentItemTag, tagID, contextUser) {
388
+ try {
389
+ const md = new Metadata();
390
+ let entityID;
391
+ let recordID;
392
+ // Load the content item to determine source type
393
+ const rv = new RunView();
394
+ const ciResult = await rv.RunView({
395
+ EntityName: 'MJ: Content Items',
396
+ ExtraFilter: `ID='${contentItemTag.ItemID}'`,
397
+ ResultType: 'simple',
398
+ Fields: ['ID', 'EntityRecordDocumentID', 'ContentSourceID'],
399
+ MaxRows: 1,
400
+ }, contextUser);
401
+ if (!ciResult.Success || ciResult.Results.length === 0)
402
+ return;
403
+ const ci = ciResult.Results[0];
404
+ if (ci.EntityRecordDocumentID) {
405
+ // Entity source — resolve to the original entity record
406
+ const erdResult = await rv.RunView({
407
+ EntityName: 'MJ: Entity Record Documents',
408
+ ExtraFilter: `ID='${ci.EntityRecordDocumentID}'`,
409
+ ResultType: 'simple',
410
+ Fields: ['EntityID', 'RecordID'],
411
+ MaxRows: 1,
412
+ }, contextUser);
413
+ if (!erdResult.Success || erdResult.Results.length === 0)
414
+ return;
415
+ entityID = erdResult.Results[0].EntityID;
416
+ recordID = erdResult.Results[0].RecordID;
417
+ }
418
+ else {
419
+ // Non-entity source — tag the ContentItem itself
420
+ const contentItemsEntity = md.Entities.find(e => e.Name === 'MJ: Content Items');
421
+ if (!contentItemsEntity)
422
+ return;
423
+ entityID = contentItemsEntity.ID;
424
+ recordID = contentItemTag.ItemID;
425
+ }
426
+ // Check if this TaggedItem already exists (avoid duplicates)
427
+ const existingResult = await rv.RunView({
428
+ EntityName: 'MJ: Tagged Items',
429
+ ExtraFilter: `TagID='${tagID}' AND EntityID='${entityID}' AND RecordID='${recordID}'`,
430
+ ResultType: 'simple',
431
+ Fields: ['ID'],
432
+ MaxRows: 1,
433
+ }, contextUser);
434
+ if (existingResult.Success && existingResult.Results.length > 0)
435
+ return; // Already exists
436
+ // Create the TaggedItem
437
+ const taggedItem = await md.GetEntityObject('MJ: Tagged Items', contextUser);
438
+ taggedItem.NewRecord();
439
+ taggedItem.TagID = tagID;
440
+ taggedItem.EntityID = entityID;
441
+ taggedItem.RecordID = recordID;
442
+ taggedItem.Weight = contentItemTag.Weight;
443
+ await taggedItem.Save();
444
+ }
445
+ catch (e) {
446
+ // Non-critical — the ContentItemTag is already saved, TaggedItem is supplemental
447
+ const msg = e instanceof Error ? e.message : String(e);
448
+ LogError(`[TaxonomyBridge] Failed to create TaggedItem for tag "${contentItemTag.Tag}": ${msg}`);
449
+ }
450
+ }
167
451
  /**
168
452
  * Builds template data for the autotagging prompt from processing params and chunk context.
169
453
  */
170
454
  buildPromptData(params, chunk, previousResults) {
171
- const contentType = this.GetContentTypeName(params.contentTypeID);
172
455
  const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
173
456
  const additionalAttributePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
174
457
  const hasPreviousResults = Object.keys(previousResults).length > 0;
458
+ // Check if this source type requires content type validation in the prompt
459
+ const sourceType = this.ContentSourceTypes.find(st => UUIDsEqual(st.ID, params.contentSourceTypeID));
460
+ const sourceConfig = sourceType?.ConfigurationObject;
461
+ const requiresContentType = sourceConfig?.RequiresContentType !== false;
175
462
  return {
176
- contentType,
463
+ contentType: requiresContentType ? this.GetContentTypeName(params.contentTypeID) : undefined,
177
464
  contentSourceType,
178
465
  minTags: params.minTags,
179
466
  maxTags: params.maxTags,
180
467
  additionalAttributePrompts,
468
+ existingTaxonomy: this.TaxonomyContext ?? undefined,
181
469
  contentText: chunk,
182
470
  previousResults: hasPreviousResults ? JSON.stringify(previousResults) : undefined,
183
471
  };
@@ -185,13 +473,21 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
185
473
  async promptAndRetrieveResultsFromLLM(params, contextUser) {
186
474
  await AIEngine.Instance.Config(false, contextUser);
187
475
  const prompt = this.getAutotagPrompt();
188
- // Determine token limit for chunking: use override model if set, else first prompt-model, else a default
189
476
  const tokenLimit = this.resolveTokenLimit(params.modelID);
190
477
  const chunks = this.chunkExtractedText(params.text, tokenLimit);
478
+ if (chunks.length === 0 || (chunks.length === 1 && (!chunks[0] || chunks[0].trim().length === 0))) {
479
+ LogError(`[Autotag] No text to process for item ${params.contentItemID}`);
480
+ return {};
481
+ }
191
482
  let LLMResults = {};
192
483
  const startTime = new Date();
193
- for (const chunk of chunks) {
194
- LLMResults = await this.processChunkWithPromptRunner(prompt, params, chunk, LLMResults, contextUser);
484
+ for (let ci = 0; ci < chunks.length; ci++) {
485
+ try {
486
+ LLMResults = await this.processChunkWithPromptRunner(prompt, params, chunks[ci], LLMResults, contextUser);
487
+ }
488
+ catch (chunkError) {
489
+ LogError(`[Autotag] Chunk ${ci + 1}/${chunks.length} failed for item ${params.contentItemID}: ${chunkError instanceof Error ? chunkError.message : String(chunkError)}`);
490
+ }
195
491
  }
196
492
  LLMResults.processStartTime = startTime;
197
493
  LLMResults.processEndTime = new Date();
@@ -230,10 +526,9 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
230
526
  promptParams.override = { modelId: params.modelID };
231
527
  }
232
528
  const runner = new AIPromptRunner();
233
- // Per-item logging removed for cleanliness — batch-level logging in ExtractTextAndProcessWithLLM
234
529
  const result = await runner.ExecutePrompt(promptParams);
235
530
  if (!result.success) {
236
- LogError(`AIPromptRunner FAILED for content item ${params.contentItemID}: ${result.errorMessage ?? 'no error message'}`, undefined, result);
531
+ LogError(`[Autotag] LLM failed for item ${params.contentItemID}: ${result.errorMessage ?? 'unknown error'}`);
237
532
  return LLMResults;
238
533
  }
239
534
  // Parse the result — AIPromptRunner may return a raw JSON string or a parsed object
@@ -266,9 +561,12 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
266
561
  await this.saveResultsToContentItemAttribute(LLMResults, contextUser);
267
562
  await this.saveContentItemTags(LLMResults.contentItemID, LLMResults, contextUser);
268
563
  }
269
- else {
564
+ else if (LLMResults.isValidContent === false) {
270
565
  await this.deleteInvalidContentItem(LLMResults.contentItemID, contextUser);
271
566
  }
567
+ else {
568
+ LogError(`[Autotag] Unexpected LLM format for item ${LLMResults.contentItemID} — isValidContent missing. Keys: ${Object.keys(LLMResults).join(', ')}`);
569
+ }
272
570
  }
273
571
  async deleteInvalidContentItem(contentItemID, contextUser) {
274
572
  const md = new Metadata();
@@ -322,6 +620,8 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
322
620
  /**
323
621
  * Saves keyword tags from LLM results as Content Item Tags.
324
622
  * Uses batched saves for better performance.
623
+ * After each tag is saved, invokes the OnContentItemTagSaved callback (if set)
624
+ * for taxonomy bridge processing.
325
625
  */
326
626
  async saveContentItemTags(contentItemID, LLMResults, contextUser) {
327
627
  const md = new Metadata();
@@ -331,14 +631,16 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
331
631
  // Normalize keywords — support both formats:
332
632
  // Old: ["keyword1", "keyword2"]
333
633
  // New: [{ tag: "keyword1", weight: 0.95 }, { tag: "keyword2", weight: 0.7 }]
634
+ // New with parentTag: [{ tag: "keyword1", weight: 0.95, parentTag: "parent" }]
334
635
  const normalizedTags = keywords.map((kw) => {
335
636
  if (typeof kw === 'string') {
336
- return { tag: kw, weight: 1.0 };
637
+ return { tag: kw, weight: 1.0, parentTag: null };
337
638
  }
338
639
  const obj = kw;
339
640
  return {
340
641
  tag: obj.tag || obj.keyword || String(kw),
341
642
  weight: typeof obj.weight === 'number' ? Math.max(0, Math.min(1, obj.weight)) : 0.5,
643
+ parentTag: obj.parentTag ?? null,
342
644
  };
343
645
  });
344
646
  const BATCH_SIZE = 10;
@@ -350,7 +652,17 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
350
652
  contentItemTag.ItemID = contentItemID;
351
653
  contentItemTag.Tag = item.tag;
352
654
  contentItemTag.Set('Weight', item.weight);
353
- await contentItemTag.Save();
655
+ const saved = await contentItemTag.Save();
656
+ // Invoke taxonomy bridge callback if set
657
+ if (saved && this.OnContentItemTagSaved) {
658
+ try {
659
+ await this.OnContentItemTagSaved(contentItemTag, item.parentTag, contextUser);
660
+ }
661
+ catch (bridgeError) {
662
+ const msg = bridgeError instanceof Error ? bridgeError.message : String(bridgeError);
663
+ LogError(`Tag taxonomy bridge failed for tag "${item.tag}": ${msg}`);
664
+ }
665
+ }
354
666
  }));
355
667
  }
356
668
  }
@@ -362,12 +674,17 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
362
674
  const md = new Metadata();
363
675
  const contentItemID = LLMResults.contentItemID;
364
676
  const skipKeys = new Set(['keywords', 'processStartTime', 'processEndTime', 'contentItemID', 'isValidContent']);
365
- // Update title and description on the content item
677
+ // Update title and description on the content item.
678
+ // For entity-sourced items (EntityRecordDocumentID is set), preserve the
679
+ // original entity record name — it's more meaningful to users than the
680
+ // AI-generated title. Only update description.
366
681
  if (LLMResults.title || LLMResults.description) {
367
682
  const contentItem = await md.GetEntityObject('MJ: Content Items', contextUser);
368
683
  await contentItem.Load(contentItemID);
369
- if (LLMResults.title)
684
+ const isEntitySourced = contentItem.EntityRecordDocumentID != null;
685
+ if (LLMResults.title && !isEntitySourced) {
370
686
  contentItem.Name = LLMResults.title;
687
+ }
371
688
  if (LLMResults.description)
372
689
  contentItem.Description = LLMResults.description;
373
690
  await contentItem.Save();
@@ -389,21 +706,24 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
389
706
  }
390
707
  /**
391
708
  * Retrieves all content sources for a given content source type.
709
+ * Throws if no sources are found.
392
710
  */
393
711
  async getAllContentSources(contextUser, contentSourceTypeID) {
394
- const rv = new RunView();
395
- const result = await rv.RunView({
396
- EntityName: 'MJ: Content Sources',
397
- ResultType: 'entity_object',
398
- ExtraFilter: `ContentSourceTypeID='${contentSourceTypeID}'`
399
- }, contextUser);
400
- if (result.Success && result.Results.length) {
401
- return result.Results;
712
+ const sources = await this.GetAllContentSourcesSafe(contextUser, contentSourceTypeID);
713
+ if (sources.length === 0) {
714
+ throw new Error(`No content sources found for content source type with ID '${contentSourceTypeID}'`);
402
715
  }
403
- throw new Error(`No content sources found for content source type with ID '${contentSourceTypeID}'`);
716
+ return sources;
717
+ }
718
+ /**
719
+ * Retrieves all content sources for a given content source type.
720
+ * Returns an empty array (instead of throwing) when no sources are configured.
721
+ */
722
+ async GetAllContentSourcesSafe(_contextUser, contentSourceTypeID) {
723
+ return this.khEngine.ContentSources.filter(s => UUIDsEqual(s.ContentSourceTypeID, contentSourceTypeID));
404
724
  }
405
725
  SetSubclassContentSourceType(subclass) {
406
- const sourceType = this._ContentSourceTypes.find(st => st.Name === subclass);
726
+ const sourceType = this.ContentSourceTypes.find(st => st.Name === subclass);
407
727
  if (!sourceType) {
408
728
  throw new Error(`Content Source Type with name '${subclass}' not found in cached metadata`);
409
729
  }
@@ -493,7 +813,7 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
493
813
  throw new Error(`Failed to retrieve last run date for content source with ID ${contentSourceID}`);
494
814
  }
495
815
  GetContentItemParams(contentTypeID) {
496
- const contentType = this._ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
816
+ const contentType = this.ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
497
817
  if (!contentType) {
498
818
  throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
499
819
  }
@@ -504,21 +824,21 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
504
824
  };
505
825
  }
506
826
  GetContentSourceTypeName(contentSourceTypeID) {
507
- const sourceType = this._ContentSourceTypes.find(st => UUIDsEqual(st.ID, contentSourceTypeID));
827
+ const sourceType = this.ContentSourceTypes.find(st => UUIDsEqual(st.ID, contentSourceTypeID));
508
828
  if (!sourceType) {
509
829
  throw new Error(`Content Source Type with ID ${contentSourceTypeID} not found in cached metadata`);
510
830
  }
511
831
  return sourceType.Name;
512
832
  }
513
833
  GetContentTypeName(contentTypeID) {
514
- const contentType = this._ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
834
+ const contentType = this.ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
515
835
  if (!contentType) {
516
836
  throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
517
837
  }
518
838
  return contentType.Name;
519
839
  }
520
840
  GetContentFileTypeName(contentFileTypeID) {
521
- const fileType = this._ContentFileTypes.find(ft => UUIDsEqual(ft.ID, contentFileTypeID));
841
+ const fileType = this.ContentFileTypes.find(ft => UUIDsEqual(ft.ID, contentFileTypeID));
522
842
  if (!fileType) {
523
843
  throw new Error(`Content File Type with ID ${contentFileTypeID} not found in cached metadata`);
524
844
  }
@@ -558,7 +878,7 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
558
878
  throw new Error(`Content item with URL ${url} not found`);
559
879
  }
560
880
  /**
561
- * Saves process run metadata to the database.
881
+ * Saves process run metadata to the database (backward-compatible simple version).
562
882
  */
563
883
  async saveProcessRun(processRunParams, contextUser) {
564
884
  const md = new Metadata();
@@ -567,10 +887,92 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
567
887
  processRun.SourceID = processRunParams.sourceID;
568
888
  processRun.StartTime = processRunParams.startTime;
569
889
  processRun.EndTime = processRunParams.endTime;
570
- processRun.Status = 'Complete';
890
+ processRun.Status = 'Completed';
571
891
  processRun.ProcessedItems = processRunParams.numItemsProcessed;
892
+ processRun.StartedByUserID = contextUser.ID;
572
893
  await processRun.Save();
573
894
  }
895
+ /**
896
+ * Create a new ContentProcessRun record for batched pipeline execution.
897
+ * Returns the entity so the caller can update cursor/status as batches complete.
898
+ * Uses the JSONType ConfigurationObject for strongly-typed configuration.
899
+ */
900
+ async CreateBatchedProcessRun(sourceID, totalItemCount, batchSize, contextUser, config) {
901
+ const md = new Metadata();
902
+ const processRun = await md.GetEntityObject('MJ: Content Process Runs', contextUser);
903
+ processRun.NewRecord();
904
+ processRun.SourceID = sourceID;
905
+ processRun.StartTime = new Date();
906
+ processRun.Status = 'Running';
907
+ processRun.ProcessedItems = 0;
908
+ processRun.TotalItemCount = totalItemCount;
909
+ processRun.BatchSize = batchSize;
910
+ processRun.LastProcessedOffset = 0;
911
+ processRun.ErrorCount = 0;
912
+ processRun.CancellationRequested = false;
913
+ processRun.StartedByUserID = contextUser.ID;
914
+ if (config) {
915
+ processRun.ConfigurationObject = config;
916
+ }
917
+ const saved = await processRun.Save();
918
+ if (!saved) {
919
+ throw new Error('Failed to create ContentProcessRun record');
920
+ }
921
+ return processRun;
922
+ }
923
+ /**
924
+ * Update a batched process run's cursor position after a batch completes.
925
+ * Checks CancellationRequested to support pause/cancel.
926
+ * @returns true if processing should continue, false if cancelled/paused
927
+ */
928
+ async UpdateBatchCursor(processRun, processedCount, errorCount) {
929
+ processRun.ProcessedItems = processedCount;
930
+ processRun.LastProcessedOffset = processedCount;
931
+ processRun.ErrorCount = errorCount;
932
+ await processRun.Save();
933
+ // Reload to check if cancellation was requested externally
934
+ await processRun.Load(processRun.ID);
935
+ if (processRun.CancellationRequested) {
936
+ processRun.Status = 'Paused';
937
+ processRun.EndTime = new Date();
938
+ await processRun.Save();
939
+ LogStatus(`[Pipeline] Cancellation requested — pausing at offset ${processedCount}`);
940
+ return false;
941
+ }
942
+ return true;
943
+ }
944
+ /**
945
+ * Complete a batched process run (success or failure).
946
+ */
947
+ async CompleteBatchedProcessRun(processRun, status, errorMessage) {
948
+ processRun.Status = status;
949
+ processRun.EndTime = new Date();
950
+ if (errorMessage) {
951
+ processRun.ErrorMessage = errorMessage;
952
+ }
953
+ await processRun.Save();
954
+ }
955
+ /**
956
+ * Create rate limiters from the pipeline configuration.
957
+ */
958
+ CreateRateLimiters(config) {
959
+ return {
960
+ llm: new RateLimiter({
961
+ RequestsPerMinute: config?.RateLimits?.LLM?.RequestsPerMinute ?? 60,
962
+ TokensPerMinute: config?.RateLimits?.LLM?.TokensPerMinute ?? 1000000,
963
+ Name: 'LLM',
964
+ }),
965
+ embedding: new RateLimiter({
966
+ RequestsPerMinute: config?.RateLimits?.Embedding?.RequestsPerMinute ?? 300,
967
+ TokensPerMinute: config?.RateLimits?.Embedding?.TokensPerMinute ?? 1000000,
968
+ Name: 'Embedding',
969
+ }),
970
+ vectorDB: new RateLimiter({
971
+ RequestsPerMinute: config?.RateLimits?.VectorDB?.RequestsPerMinute ?? 200,
972
+ Name: 'VectorDB',
973
+ }),
974
+ };
975
+ }
574
976
  async parsePDF(dataBuffer) {
575
977
  const dataPDF = await pdfParse(dataBuffer);
576
978
  return dataPDF.text;
@@ -609,12 +1011,21 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
609
1011
  * from per-ContentSource overrides, per-ContentType defaults, or the global fallback
610
1012
  * (first active VectorIndex). Each group is processed in configurable batches with
611
1013
  * parallel upserts within each batch.
1014
+ *
1015
+ * Uses AIModelRunner to create AIPromptRun records for each embedding batch,
1016
+ * enabling token/cost tracking and linking to ContentProcessRunDetail records.
1017
+ *
1018
+ * @param items - content items to vectorize
1019
+ * @param contextUser - current user for permissions/audit
1020
+ * @param onProgress - optional callback for progress updates
1021
+ * @param batchSize - number of items per embedding batch
1022
+ * @returns counts of vectorized/skipped items and collected AIPromptRun IDs
612
1023
  */
613
1024
  async VectorizeContentItems(items, contextUser, onProgress, batchSize = DEFAULT_VECTORIZE_BATCH_SIZE) {
614
1025
  const eligible = items.filter(i => i.Text && i.Text.trim().length > 0);
615
1026
  if (eligible.length === 0) {
616
1027
  LogStatus('VectorizeContentItems: no items with text to vectorize');
617
- return { vectorized: 0, skipped: items.length };
1028
+ return { vectorized: 0, skipped: items.length, promptRunIDs: [] };
618
1029
  }
619
1030
  // Ensure AIEngine is loaded so we can resolve the embedding model
620
1031
  await AIEngine.Instance.Config(false, contextUser);
@@ -626,91 +1037,151 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
626
1037
  const tagMap = await this.loadTagsForItems(eligible, contextUser);
627
1038
  let vectorized = 0;
628
1039
  let processed = 0;
1040
+ const allPromptRunIDs = [];
629
1041
  for (const [groupKey, groupItems] of groups) {
630
1042
  const infra = await this.resolveGroupInfrastructure(groupKey, contextUser);
631
- const groupVectorized = await this.vectorizeGroup(groupItems, infra, tagMap, batchSize, (batchProcessed) => {
1043
+ const groupResult = await this.vectorizeGroup(groupItems, infra, tagMap, batchSize, contextUser, (batchProcessed) => {
632
1044
  processed += batchProcessed;
633
1045
  onProgress?.(Math.min(processed, eligible.length), eligible.length);
634
1046
  });
635
- vectorized += groupVectorized;
1047
+ vectorized += groupResult.vectorized;
1048
+ allPromptRunIDs.push(...groupResult.promptRunIDs);
636
1049
  }
637
- LogStatus(`VectorizeContentItems: ${vectorized} vectorized, ${items.length - eligible.length} skipped (empty text)`);
638
- return { vectorized, skipped: items.length - eligible.length };
1050
+ LogStatus(`VectorizeContentItems: ${vectorized} vectorized, ${items.length - eligible.length} skipped (empty text), ${allPromptRunIDs.length} prompt runs created`);
1051
+ return { vectorized, skipped: items.length - eligible.length, promptRunIDs: allPromptRunIDs };
639
1052
  }
640
1053
  /**
641
1054
  * Process a single infrastructure group: embed texts in batches and upsert to vector DB.
642
- * Upserts within each batch run in parallel for throughput.
1055
+ * Uses AIModelRunner for each embedding batch to create AIPromptRun records with
1056
+ * token/cost tracking. Upserts within each batch run in parallel for throughput.
1057
+ *
1058
+ * @param items - content items in this infrastructure group
1059
+ * @param infra - resolved embedding + vector DB infrastructure
1060
+ * @param tagMap - pre-loaded tags for metadata enrichment
1061
+ * @param batchSize - number of items per embedding batch
1062
+ * @param contextUser - current user for AIModelRunner tracking
1063
+ * @param onBatchComplete - callback invoked after each batch with item count
1064
+ * @returns count of vectorized items and collected AIPromptRun IDs
643
1065
  */
644
- async vectorizeGroup(items, infra, tagMap, batchSize, onBatchComplete) {
1066
+ async vectorizeGroup(items, infra, tagMap, batchSize, contextUser, onBatchComplete) {
645
1067
  let vectorized = 0;
1068
+ const promptRunIDs = [];
1069
+ const modelRunner = new AIModelRunner();
1070
+ // Resolve the "Content Embedding" prompt ID for tracking
1071
+ const embeddingPromptID = this.resolveEmbeddingPromptID();
646
1072
  for (let i = 0; i < items.length; i += batchSize) {
647
1073
  const batch = items.slice(i, i + batchSize);
648
- const texts = batch.map(item => this.buildEmbeddingText(item));
649
- const embedResult = await infra.embedding.EmbedTexts({ texts, model: infra.embeddingModelName });
650
- if (!embedResult.vectors || embedResult.vectors.length !== batch.length) {
651
- LogError(`VectorizeContentItems: embedding returned ${embedResult.vectors?.length ?? 0} vectors for ${batch.length} texts`);
1074
+ // Build chunks for each item items with long text produce multiple chunks
1075
+ const allChunks = this.buildChunksForBatch(batch);
1076
+ const texts = allChunks.map(c => c.text);
1077
+ // Rate limit embedding API call
1078
+ await this.EmbeddingRateLimiter.Acquire(texts.reduce((sum, t) => sum + Math.ceil(t.length / 4), 0));
1079
+ // Use AIModelRunner to embed texts with AIPromptRun tracking
1080
+ const runResult = await modelRunner.RunEmbedding({
1081
+ Texts: texts,
1082
+ ModelID: infra.embeddingModelID,
1083
+ PromptID: embeddingPromptID,
1084
+ ContextUser: contextUser,
1085
+ Description: `Content vectorization batch: ${batch.length} items, ${allChunks.length} chunks`,
1086
+ });
1087
+ if (!runResult.Success || runResult.Vectors.length !== allChunks.length) {
1088
+ LogError(`VectorizeContentItems: embedding returned ${runResult.Vectors.length} vectors for ${allChunks.length} texts — ${runResult.ErrorMessage ?? 'unknown error'}`);
652
1089
  onBatchComplete(batch.length);
653
1090
  continue;
654
1091
  }
655
- const records = batch.map((item, idx) => ({
656
- id: this.contentItemVectorId(item.ID),
657
- values: embedResult.vectors[idx],
658
- metadata: this.buildVectorMetadata(item, tagMap.get(item.ID))
659
- }));
660
- // Upsert records in parallel sub-batches for throughput
661
- const UPSERT_CHUNK = 50;
662
- const upsertPromises = [];
663
- for (let j = 0; j < records.length; j += UPSERT_CHUNK) {
664
- const chunk = records.slice(j, j + UPSERT_CHUNK);
665
- upsertPromises.push(Promise.resolve(infra.vectorDB.CreateRecords(chunk, infra.indexName)));
666
- }
667
- const responses = await Promise.all(upsertPromises);
668
- let batchSuccess = true;
669
- for (const response of responses) {
670
- if (!response.success) {
671
- LogError(`VectorizeContentItems: upsert failed: ${response.message}`);
672
- batchSuccess = false;
673
- }
1092
+ // Track the AIPromptRun ID for junction table linking
1093
+ if (runResult.PromptRunID) {
1094
+ promptRunIDs.push(runResult.PromptRunID);
674
1095
  }
1096
+ const records = this.buildVectorRecords(allChunks, runResult.Vectors, tagMap);
1097
+ const batchSuccess = await this.upsertVectorRecords(records, infra);
675
1098
  if (batchSuccess) {
676
1099
  vectorized += batch.length;
677
1100
  }
678
1101
  onBatchComplete(batch.length);
679
1102
  }
680
- return vectorized;
1103
+ return { vectorized, promptRunIDs };
1104
+ }
1105
+ /**
1106
+ * Resolve the "Content Embedding" prompt ID from AIEngine for AIModelRunner tracking.
1107
+ * Returns undefined if the prompt is not found (AIModelRunner will fall back to
1108
+ * the first active Embedding-type prompt).
1109
+ */
1110
+ resolveEmbeddingPromptID() {
1111
+ const prompt = AIEngine.Instance.Prompts.find(p => p.Name === 'Content Embedding' && p.Status === 'Active');
1112
+ if (prompt) {
1113
+ return prompt.ID;
1114
+ }
1115
+ // Fall back: let AIModelRunner find the first active Embedding prompt
1116
+ LogStatus('[Autotag] "Content Embedding" prompt not found — AIModelRunner will use default embedding prompt');
1117
+ return undefined;
1118
+ }
1119
+ /**
1120
+ * Build text chunks for a batch of content items. Items with long text
1121
+ * produce multiple chunks via TextChunker.
1122
+ */
1123
+ buildChunksForBatch(batch) {
1124
+ const allChunks = [];
1125
+ for (const item of batch) {
1126
+ const chunks = this.buildEmbeddingChunks(item);
1127
+ for (let ci = 0; ci < chunks.length; ci++) {
1128
+ allChunks.push({ item, chunkIndex: ci, text: chunks[ci] });
1129
+ }
1130
+ }
1131
+ return allChunks;
1132
+ }
1133
+ /**
1134
+ * Build VectorRecord objects from embedding chunks and their corresponding vectors.
1135
+ */
1136
+ buildVectorRecords(allChunks, vectors, tagMap) {
1137
+ return allChunks.map((chunk, idx) => ({
1138
+ id: chunk.chunkIndex === 0
1139
+ ? this.contentItemVectorId(chunk.item.ID)
1140
+ : this.contentItemVectorId(chunk.item.ID) + `_chunk${chunk.chunkIndex}`,
1141
+ values: vectors[idx],
1142
+ metadata: this.buildVectorMetadata(chunk.item, tagMap.get(chunk.item.ID))
1143
+ }));
1144
+ }
1145
+ /**
1146
+ * Upsert vector records to the vector database in sub-batches with rate limiting.
1147
+ * Returns true if all sub-batches succeeded.
1148
+ */
1149
+ async upsertVectorRecords(records, infra) {
1150
+ const UPSERT_CHUNK = 50;
1151
+ const upsertPromises = [];
1152
+ for (let j = 0; j < records.length; j += UPSERT_CHUNK) {
1153
+ const chunk = records.slice(j, j + UPSERT_CHUNK);
1154
+ await this.VectorDBRateLimiter.Acquire();
1155
+ upsertPromises.push(Promise.resolve(infra.vectorDB.CreateRecords(chunk, infra.indexName)));
1156
+ }
1157
+ const responses = await Promise.all(upsertPromises);
1158
+ let allSuccess = true;
1159
+ for (const response of responses) {
1160
+ if (!response.success) {
1161
+ LogError(`VectorizeContentItems: upsert failed: ${response.message}`);
1162
+ allSuccess = false;
1163
+ }
1164
+ }
1165
+ return allSuccess;
681
1166
  }
682
1167
  /**
683
1168
  * Load content source and content type records for all unique source/type IDs
684
1169
  * referenced by the given items. Returns maps keyed by normalized ID.
685
1170
  */
686
- async loadContentSourceAndTypeMaps(items, contextUser) {
687
- const sourceIds = [...new Set(items.map(i => i.ContentSourceID))];
688
- const typeIds = [...new Set(items.map(i => i.ContentTypeID))];
689
- const rv = new RunView();
690
- const [sourceResult, typeResult] = await rv.RunViews([
691
- {
692
- EntityName: 'MJ: Content Sources',
693
- ExtraFilter: `ID IN (${sourceIds.map(id => `'${id}'`).join(',')})`,
694
- ResultType: 'simple'
695
- },
696
- {
697
- EntityName: 'MJ: Content Types',
698
- ExtraFilter: `ID IN (${typeIds.map(id => `'${id}'`).join(',')})`,
699
- ResultType: 'simple'
700
- }
701
- ], contextUser);
1171
+ async loadContentSourceAndTypeMaps(items, _contextUser) {
1172
+ const sourceIdSet = new Set(items.map(i => NormalizeUUID(i.ContentSourceID)));
1173
+ const typeIdSet = new Set(items.map(i => NormalizeUUID(i.ContentTypeID)));
1174
+ // Use KH engine cached data instead of RunView calls
702
1175
  const sourceMap = new Map();
703
- if (sourceResult.Success) {
704
- for (const row of sourceResult.Results) {
705
- const rec = row;
706
- sourceMap.set(NormalizeUUID(rec['ID']), rec);
1176
+ for (const src of this.khEngine.ContentSources) {
1177
+ if (sourceIdSet.has(NormalizeUUID(src.ID))) {
1178
+ sourceMap.set(NormalizeUUID(src.ID), src.GetAll());
707
1179
  }
708
1180
  }
709
1181
  const typeMap = new Map();
710
- if (typeResult.Success) {
711
- for (const row of typeResult.Results) {
712
- const rec = row;
713
- typeMap.set(NormalizeUUID(rec['ID']), rec);
1182
+ for (const ct of this.ContentTypes) {
1183
+ if (typeIdSet.has(NormalizeUUID(ct.ID))) {
1184
+ typeMap.set(NormalizeUUID(ct.ID), ct.GetAll());
714
1185
  }
715
1186
  }
716
1187
  return { sourceMap, typeMap };
@@ -776,62 +1247,41 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
776
1247
  * Build infrastructure from explicit embeddingModelID and vectorIndexID.
777
1248
  * Looks up the vector index by ID and the embedding model from AIEngine.
778
1249
  */
779
- async buildVectorInfrastructure(embeddingModelID, vectorIndexID, contextUser) {
780
- const rv = new RunView();
781
- const indexResult = await rv.RunView({
782
- EntityName: 'MJ: Vector Indexes',
783
- ExtraFilter: `ID='${vectorIndexID}'`,
784
- ResultType: 'simple',
785
- MaxRows: 1
786
- }, contextUser);
787
- if (!indexResult.Success || indexResult.Results.length === 0) {
788
- throw new Error(`Vector index ${vectorIndexID} not found`);
1250
+ async buildVectorInfrastructure(embeddingModelID, vectorIndexID, _contextUser) {
1251
+ const vectorIndex = this.khEngine.GetVectorIndexById(vectorIndexID);
1252
+ if (!vectorIndex) {
1253
+ throw new Error(`Vector index ${vectorIndexID} not found in KnowledgeHubMetadataEngine cache`);
789
1254
  }
790
- const vectorIndex = indexResult.Results[0];
791
- return this.createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser);
1255
+ return this.createInfrastructureFromIndex(vectorIndex.Name, vectorIndex.VectorDatabaseID, embeddingModelID);
792
1256
  }
793
1257
  /**
794
- * Fallback: resolve infrastructure from the first active VectorIndex (original behavior).
1258
+ * Fallback: resolve infrastructure from the first available VectorIndex (original behavior).
795
1259
  */
796
- async getDefaultVectorInfrastructure(contextUser) {
797
- const rv = new RunView();
798
- const indexResult = await rv.RunView({
799
- EntityName: 'MJ: Vector Indexes',
800
- ResultType: 'simple',
801
- MaxRows: 1
802
- }, contextUser);
803
- if (!indexResult.Success || indexResult.Results.length === 0) {
1260
+ async getDefaultVectorInfrastructure(_contextUser) {
1261
+ const vectorIndexes = this.khEngine.VectorIndexes;
1262
+ if (vectorIndexes.length === 0) {
804
1263
  throw new Error('No vector indexes found — create one in the Configuration tab first');
805
1264
  }
806
- const vectorIndex = indexResult.Results[0];
807
- const embeddingModelID = vectorIndex['EmbeddingModelID'];
808
- return this.createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser);
1265
+ const vectorIndex = vectorIndexes[0];
1266
+ return this.createInfrastructureFromIndex(vectorIndex.Name, vectorIndex.VectorDatabaseID, vectorIndex.EmbeddingModelID);
809
1267
  }
810
1268
  /**
811
- * Shared helper: given a vector index record and embedding model ID, resolve all
812
- * driver instances needed for embedding + upsert.
1269
+ * Shared helper: given vector index details and embedding model ID, resolve all
1270
+ * driver instances needed for embedding + upsert. Uses AIEngine for Vector Databases.
813
1271
  */
814
- async createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser) {
815
- const indexName = vectorIndex['Name'];
816
- const vectorDatabaseID = vectorIndex['VectorDatabaseID'];
817
- const rv = new RunView();
818
- const dbResult = await rv.RunView({
819
- EntityName: 'MJ: Vector Databases',
820
- ExtraFilter: `ID='${vectorDatabaseID}'`,
821
- ResultType: 'simple',
822
- MaxRows: 1
823
- }, contextUser);
824
- if (!dbResult.Success || dbResult.Results.length === 0) {
825
- throw new Error(`Vector database ${vectorDatabaseID} not found`);
1272
+ async createInfrastructureFromIndex(indexName, vectorDatabaseID, embeddingModelID) {
1273
+ const vectorDBEntity = AIEngine.Instance.VectorDatabases.find(db => UUIDsEqual(db.ID, vectorDatabaseID));
1274
+ if (!vectorDBEntity || !vectorDBEntity.ClassKey) {
1275
+ throw new Error(`Vector database ${vectorDatabaseID} not found in AIEngine cache`);
826
1276
  }
827
- const vectorDBClassKey = dbResult.Results[0]['ClassKey'];
1277
+ const vectorDBClassKey = vectorDBEntity.ClassKey;
828
1278
  const aiModel = this.findEmbeddingModel(embeddingModelID);
829
1279
  const driverClass = aiModel.DriverClass;
830
1280
  const embeddingModelName = aiModel.APIName ?? aiModel.Name;
831
1281
  LogStatus(`VectorizeContentItems: USING embedding model "${aiModel.Name}" (${driverClass}), vector DB "${vectorDBClassKey}", index "${indexName}"`);
832
1282
  const embedding = this.createEmbeddingInstance(driverClass);
833
1283
  const vectorDB = this.createVectorDBInstance(vectorDBClassKey);
834
- return { embedding, vectorDB, indexName, embeddingModelName };
1284
+ return { embedding, vectorDB, indexName, embeddingModelName, embeddingModelID };
835
1285
  }
836
1286
  /** Find an embedding model by ID in AIEngine, with helpful error reporting */
837
1287
  findEmbeddingModel(embeddingModelID) {
@@ -870,7 +1320,16 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
870
1320
  return crypto.createHash('sha1').update(`content-item_${contentItemId}`).digest('hex');
871
1321
  }
872
1322
  /** Build the text that gets embedded: Title + Description + full Text */
873
- buildEmbeddingText(item) {
1323
+ /**
1324
+ * Max tokens per embedding chunk. text-embedding-3-small supports 8,191 tokens.
1325
+ * We use a conservative limit to avoid hitting the boundary.
1326
+ */
1327
+ static { this.MAX_EMBEDDING_TOKENS = 7500; }
1328
+ /**
1329
+ * Build the text to embed for a content item, and chunk it if it exceeds
1330
+ * the embedding model's token limit. Returns one or more text chunks.
1331
+ */
1332
+ buildEmbeddingChunks(item) {
874
1333
  const parts = [];
875
1334
  if (item.Name)
876
1335
  parts.push(item.Name);
@@ -878,7 +1337,32 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
878
1337
  parts.push(item.Description);
879
1338
  if (item.Text)
880
1339
  parts.push(item.Text);
881
- return parts.join('\n');
1340
+ const full = parts.join('\n');
1341
+ // Rough char estimate: 1 token ≈ 4 chars
1342
+ const charLimit = AutotagBaseEngine_1.MAX_EMBEDDING_TOKENS * 4;
1343
+ if (full.length <= charLimit) {
1344
+ return [full];
1345
+ }
1346
+ // Chunk using TextChunker for token-aware splitting
1347
+ LogStatus(`[Autotag] Chunking embedding text for "${item.Name}" (${full.length} chars, ~${Math.ceil(full.length / 4)} tokens)`);
1348
+ try {
1349
+ const chunkParams = {
1350
+ Text: full,
1351
+ MaxChunkTokens: AutotagBaseEngine_1.MAX_EMBEDDING_TOKENS,
1352
+ OverlapTokens: 100,
1353
+ };
1354
+ const chunks = TextChunker.ChunkText(chunkParams);
1355
+ LogStatus(`[Autotag] Split into ${chunks.length} chunks for embedding`);
1356
+ return chunks.map(c => c.Text);
1357
+ }
1358
+ catch {
1359
+ // Fallback: simple character-based splitting
1360
+ const result = [];
1361
+ for (let i = 0; i < full.length; i += charLimit) {
1362
+ result.push(full.substring(i, i + charLimit));
1363
+ }
1364
+ return result;
1365
+ }
882
1366
  }
883
1367
  /** Build metadata stored alongside the vector — truncate large text fields */
884
1368
  buildVectorMetadata(item, tags) {
@@ -886,6 +1370,8 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
886
1370
  const meta = {
887
1371
  RecordID: item.ID,
888
1372
  Entity: 'MJ: Content Items',
1373
+ ContentSourceID: item.ContentSourceID,
1374
+ ContentSourceTypeID: item.ContentSourceTypeID,
889
1375
  };
890
1376
  if (item.Name)
891
1377
  meta['Title'] = item.Name.substring(0, META_TEXT_LIMIT);
@@ -916,8 +1402,370 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
916
1402
  }
917
1403
  return tagMap;
918
1404
  }
1405
+ // ---- Content Deduplication ----
1406
+ /**
1407
+ * Attempts to recompute tag co-occurrence data after the LLM tagging pipeline completes.
1408
+ * Uses dynamic import to avoid a hard dependency on the tag-engine package.
1409
+ * If TagCoOccurrenceEngine is not available or fails, it logs a warning and continues.
1410
+ */
1411
+ async recomputeCoOccurrenceIfAvailable(contextUser) {
1412
+ try {
1413
+ // Dynamic check: TagCoOccurrenceEngine is registered via class factory
1414
+ const { TagCoOccurrenceEngine } = await import('@memberjunction/tag-engine');
1415
+ const engine = TagCoOccurrenceEngine.Instance;
1416
+ if (engine && typeof engine.RecomputeCoOccurrence === 'function') {
1417
+ LogStatus('[Autotag] Recomputing tag co-occurrence after pipeline completion...');
1418
+ const result = await engine.RecomputeCoOccurrence(contextUser);
1419
+ LogStatus(`[Autotag] Co-occurrence recompute complete: ${result.PairsUpdated} pairs updated, ${result.PairsDeleted} deleted`);
1420
+ }
1421
+ }
1422
+ catch (e) {
1423
+ const msg = e instanceof Error ? e.message : String(e);
1424
+ LogStatus(`[Autotag] Co-occurrence recompute skipped (not available): ${msg}`);
1425
+ }
1426
+ }
1427
+ /**
1428
+ * Detects duplicate content items by matching the given item's checksum against
1429
+ * other content items from **different** content sources. When an exact checksum
1430
+ * match is found, a {@link MJContentItemDuplicateEntity} record is created with
1431
+ * `DetectionMethod = 'Checksum'` and `SimilarityScore = 1.0`.
1432
+ *
1433
+ * Duplicate pairs are stored in canonical order (lower ID = ContentItemAID) to
1434
+ * prevent mirror duplicates. If a duplicate pair already exists for the same
1435
+ * detection method, no new record is created.
1436
+ *
1437
+ * @param contentItem - The content item whose checksum should be checked for duplicates.
1438
+ * Must already be saved (i.e., have a valid ID and Checksum).
1439
+ * @param contextUser - The authenticated user context for data access and audit.
1440
+ * @returns A promise that resolves when detection is complete. Does not throw on
1441
+ * failure — errors are logged and swallowed to avoid disrupting the pipeline.
1442
+ */
1443
+ async DetectChecksumDuplicates(contentItem, contextUser) {
1444
+ if (!contentItem.Checksum) {
1445
+ return; // No checksum to compare
1446
+ }
1447
+ try {
1448
+ const matches = await this.findItemsByChecksum(contentItem.Checksum, contentItem.ContentSourceID, contentItem.ID, contextUser);
1449
+ for (const match of matches) {
1450
+ await this.createDuplicateRecordIfNotExists(contentItem.ID, match.ID, 1.0, 'Checksum', contextUser);
1451
+ }
1452
+ }
1453
+ catch (e) {
1454
+ const msg = e instanceof Error ? e.message : String(e);
1455
+ LogError(`[Dedup] Checksum detection failed for item ${contentItem.ID}: ${msg}`);
1456
+ }
1457
+ }
1458
+ /**
1459
+ * Detects duplicate content items by matching the given item's title (Name field)
1460
+ * against other content items from **different** content sources. When an exact
1461
+ * title match is found, a {@link MJContentItemDuplicateEntity} record is created
1462
+ * with `DetectionMethod = 'Title'` and `SimilarityScore = 1.0`.
1463
+ *
1464
+ * Duplicate pairs are stored in canonical order (lower ID = ContentItemAID) to
1465
+ * prevent mirror duplicates. If a duplicate pair already exists for the same
1466
+ * detection method, no new record is created.
1467
+ *
1468
+ * @param contentItem - The content item whose title should be checked for duplicates.
1469
+ * Must already be saved (i.e., have a valid ID and Name).
1470
+ * @param contextUser - The authenticated user context for data access and audit.
1471
+ * @returns A promise that resolves when detection is complete. Does not throw on
1472
+ * failure — errors are logged and swallowed to avoid disrupting the pipeline.
1473
+ */
1474
+ async DetectTitleDuplicates(contentItem, contextUser) {
1475
+ if (!contentItem.Name || contentItem.Name.trim().length === 0) {
1476
+ return; // No title to compare
1477
+ }
1478
+ try {
1479
+ const matches = await this.findItemsByTitle(contentItem.Name, contentItem.ContentSourceID, contentItem.ID, contextUser);
1480
+ for (const match of matches) {
1481
+ await this.createDuplicateRecordIfNotExists(contentItem.ID, match.ID, 1.0, 'Title', contextUser);
1482
+ }
1483
+ }
1484
+ catch (e) {
1485
+ const msg = e instanceof Error ? e.message : String(e);
1486
+ LogError(`[Dedup] Title detection failed for item ${contentItem.ID}: ${msg}`);
1487
+ }
1488
+ }
1489
+ /**
1490
+ * Runs all non-vector deduplication checks (checksum and title) for a content item.
1491
+ * This is a convenience method intended to be called after saving/updating a content item.
1492
+ *
1493
+ * @param contentItem - The saved content item to check for duplicates.
1494
+ * @param contextUser - The authenticated user context for data access and audit.
1495
+ */
1496
+ async DetectDuplicates(contentItem, contextUser) {
1497
+ await Promise.all([
1498
+ this.DetectChecksumDuplicates(contentItem, contextUser),
1499
+ this.DetectTitleDuplicates(contentItem, contextUser),
1500
+ ]);
1501
+ }
1502
+ /**
1503
+ * Detects near-duplicate content items by querying the vector index for items
1504
+ * with high cosine similarity (> 0.95 threshold). Only creates duplicate records
1505
+ * for matches from DIFFERENT content sources to avoid self-matches.
1506
+ *
1507
+ * This is expensive so it only checks the top 3 most similar results.
1508
+ * Controlled by the `enableVectorDedup` flag.
1509
+ *
1510
+ * @param contentItem - The content item to check (must have text and be vectorized).
1511
+ * @param contextUser - The authenticated user context for data access and audit.
1512
+ * @param enableVectorDedup - Whether to run vector-based dedup (default false).
1513
+ */
1514
+ async DetectVectorDuplicates(contentItem, contextUser, enableVectorDedup = false) {
1515
+ if (!enableVectorDedup)
1516
+ return;
1517
+ if (!contentItem.Text || contentItem.Text.trim().length === 0)
1518
+ return;
1519
+ try {
1520
+ await this.performVectorDedupCheck(contentItem, contextUser);
1521
+ }
1522
+ catch (e) {
1523
+ const msg = e instanceof Error ? e.message : String(e);
1524
+ LogError(`[Dedup] Vector detection failed for item ${contentItem.ID}: ${msg}`);
1525
+ }
1526
+ }
1527
/**
 * Internal implementation of vector-based dedup. Resolves the vector infrastructure
 * for the item, embeds its text, queries for similar vectors, and creates duplicate
 * records for high-similarity matches from different sources.
 *
 * @param contentItem - Content item to check; caller guarantees non-empty Text.
 * @param contextUser - The authenticated user context for data access and audit.
 * @throws Propagates any error from engine config, embedding, or the vector query;
 *         the public caller (DetectVectorDuplicates) catches and logs them.
 */
async performVectorDedupCheck(contentItem, contextUser) {
    // Need AIEngine loaded to resolve embedding model
    await AIEngine.Instance.Config(false, contextUser);
    // Load the content source + type maps for this single item
    const { sourceMap, typeMap } = await this.loadContentSourceAndTypeMaps([contentItem], contextUser);
    // Resolve infrastructure for this item
    const groups = this.groupItemsByInfrastructure([contentItem], sourceMap, typeMap);
    if (groups.size === 0) {
        LogStatus(`[Dedup] No vector infrastructure found for item ${contentItem.ID}, skipping vector dedup`);
        return;
    }
    // Only one item was grouped, so take the first (and only) group's key.
    const [groupKey] = groups.entries().next().value;
    const infra = await this.resolveGroupInfrastructure(groupKey, contextUser);
    // Embed the item's text (capped at 8000 chars to bound embedding cost)
    const text = contentItem.Text.trim();
    const truncated = text.length > 8000 ? text.substring(0, 8000) : text;
    // Rate-limit by estimated token count (1 token ≈ 4 chars)
    await this.EmbeddingRateLimiter.Acquire(Math.ceil(truncated.length / 4));
    const modelRunner = new AIModelRunner();
    const embeddingPromptID = this.resolveEmbeddingPromptID();
    const runResult = await modelRunner.RunEmbedding({
        ModelID: infra.embeddingModelID,
        Texts: [truncated],
        PromptID: embeddingPromptID ?? undefined,
        ContextUser: contextUser,
    });
    if (!runResult?.Vectors || runResult.Vectors.length === 0) {
        LogStatus(`[Dedup] Embedding failed for item ${contentItem.ID}, skipping vector dedup`);
        return;
    }
    const queryVector = runResult.Vectors[0];
    // Query vector DB for top 4 most similar (top 3 useful + 1 for self-match)
    const queryResponse = await infra.vectorDB.QueryIndex({
        vector: queryVector,
        topK: 4,
        includeMetadata: true,
    });
    const responseData = queryResponse;
    if (!responseData.success || !responseData.data)
        return;
    // The data property contains matches (QueryResponse shape)
    const matches = responseData.data.matches;
    if (!matches || matches.length === 0)
        return;
    // Filter: different source, similarity > 0.95, not self
    const VECTOR_DEDUP_THRESHOLD = 0.95;
    let matchCount = 0;
    for (const match of matches) {
        if (matchCount >= 3)
            break; // Only check top 3
        // NOTE(review): missing scores are treated as 0 (i.e. non-matches)
        const matchScore = match.score ?? 0;
        if (matchScore < VECTOR_DEDUP_THRESHOLD)
            continue;
        // Extract content item ID from vector metadata
        const matchItemID = match.metadata?.['contentItemID'];
        if (!matchItemID || UUIDsEqual(matchItemID, contentItem.ID))
            continue;
        // Check if the match is from a different source
        const isDifferentSource = await this.isFromDifferentSource(matchItemID, contentItem.ContentSourceID, contextUser);
        if (!isDifferentSource)
            continue;
        await this.createDuplicateRecordIfNotExists(contentItem.ID, matchItemID, matchScore, 'Vector', contextUser);
        matchCount++;
    }
    if (matchCount > 0) {
        LogStatus(`[Dedup] Vector dedup found ${matchCount} near-duplicate(s) for item ${contentItem.ID}`);
    }
}
1599
+ /**
1600
+ * Check if a content item belongs to a different source than the given sourceID.
1601
+ */
1602
+ async isFromDifferentSource(itemID, excludeSourceID, contextUser) {
1603
+ const rv = new RunView();
1604
+ const result = await rv.RunView({
1605
+ EntityName: 'MJ: Content Items',
1606
+ Fields: ['ContentSourceID'],
1607
+ ExtraFilter: `ID = '${itemID}'`,
1608
+ ResultType: 'simple',
1609
+ MaxRows: 1,
1610
+ }, contextUser);
1611
+ if (!result.Success || result.Results.length === 0)
1612
+ return false;
1613
+ return result.Results[0].ContentSourceID !== excludeSourceID;
1614
+ }
1615
+ /**
1616
+ * Resolves a duplicate record by updating its Status and Resolution fields.
1617
+ *
1618
+ * @param duplicateID - The ID of the ContentItemDuplicate record.
1619
+ * @param resolution - The resolution choice: 'KeepA', 'KeepB', 'NotDuplicate'.
1620
+ * @param contextUser - The authenticated user context.
1621
+ */
1622
+ async ResolveContentDuplicate(duplicateID, resolution, contextUser) {
1623
+ try {
1624
+ const md = new Metadata();
1625
+ const duplicate = await md.GetEntityObject('MJ: Content Item Duplicates', contextUser);
1626
+ const loaded = await duplicate.Load(duplicateID);
1627
+ if (!loaded) {
1628
+ LogError(`[Dedup] Could not load duplicate record ${duplicateID} for resolution`);
1629
+ return false;
1630
+ }
1631
+ this.applyDuplicateResolution(duplicate, resolution);
1632
+ const saved = await duplicate.Save();
1633
+ if (!saved) {
1634
+ LogError(`[Dedup] Failed to save resolution for duplicate ${duplicateID}: ${duplicate.LatestResult?.Message ?? 'Unknown error'}`);
1635
+ return false;
1636
+ }
1637
+ LogStatus(`[Dedup] Resolved duplicate ${duplicateID}: ${resolution}`);
1638
+ return true;
1639
+ }
1640
+ catch (e) {
1641
+ const msg = e instanceof Error ? e.message : String(e);
1642
+ LogError(`[Dedup] Error resolving duplicate ${duplicateID}: ${msg}`);
1643
+ return false;
1644
+ }
1645
+ }
1646
+ /**
1647
+ * Applies the resolution to a duplicate record by setting the Status and Resolution fields.
1648
+ */
1649
+ applyDuplicateResolution(duplicate, resolution) {
1650
+ if (resolution === 'NotDuplicate') {
1651
+ duplicate.Status = 'Dismissed';
1652
+ duplicate.Resolution = 'NotDuplicate';
1653
+ }
1654
+ else {
1655
+ // KeepA or KeepB — mark as Merged
1656
+ duplicate.Status = 'Merged';
1657
+ duplicate.Resolution = resolution;
1658
+ }
1659
+ }
1660
+ /**
1661
+ * Finds content items with the same checksum from different content sources.
1662
+ *
1663
+ * @param checksum - The SHA-256 checksum to search for.
1664
+ * @param excludeSourceID - The content source ID to exclude (the item's own source).
1665
+ * @param excludeItemID - The content item ID to exclude (the item itself).
1666
+ * @param contextUser - The authenticated user context.
1667
+ * @returns An array of matching content items (simple objects with ID field).
1668
+ */
1669
+ async findItemsByChecksum(checksum, excludeSourceID, excludeItemID, contextUser) {
1670
+ const rv = new RunView();
1671
+ const result = await rv.RunView({
1672
+ EntityName: 'MJ: Content Items',
1673
+ Fields: ['ID'],
1674
+ ExtraFilter: `Checksum = '${checksum.replace(/'/g, "''")}' AND ContentSourceID <> '${excludeSourceID}' AND ID <> '${excludeItemID}'`,
1675
+ ResultType: 'simple'
1676
+ }, contextUser);
1677
+ if (!result.Success) {
1678
+ LogError(`[Dedup] RunView failed for checksum lookup: ${result.ErrorMessage}`);
1679
+ return [];
1680
+ }
1681
+ return result.Results;
1682
+ }
1683
+ /**
1684
+ * Finds content items with the same title (Name) from different content sources.
1685
+ *
1686
+ * @param title - The title to search for (exact match).
1687
+ * @param excludeSourceID - The content source ID to exclude (the item's own source).
1688
+ * @param excludeItemID - The content item ID to exclude (the item itself).
1689
+ * @param contextUser - The authenticated user context.
1690
+ * @returns An array of matching content items (simple objects with ID field).
1691
+ */
1692
+ async findItemsByTitle(title, excludeSourceID, excludeItemID, contextUser) {
1693
+ const rv = new RunView();
1694
+ const escapedTitle = title.replace(/'/g, "''");
1695
+ const result = await rv.RunView({
1696
+ EntityName: 'MJ: Content Items',
1697
+ Fields: ['ID'],
1698
+ ExtraFilter: `Name = '${escapedTitle}' AND ContentSourceID <> '${excludeSourceID}' AND ID <> '${excludeItemID}'`,
1699
+ ResultType: 'simple'
1700
+ }, contextUser);
1701
+ if (!result.Success) {
1702
+ LogError(`[Dedup] RunView failed for title lookup: ${result.ErrorMessage}`);
1703
+ return [];
1704
+ }
1705
+ return result.Results;
1706
+ }
1707
+ /**
1708
+ * Creates a {@link MJContentItemDuplicateEntity} record for a detected duplicate pair,
1709
+ * but only if one does not already exist for the same pair and detection method.
1710
+ *
1711
+ * IDs are stored in canonical order: the lexicographically smaller ID is always
1712
+ * ContentItemAID to prevent mirror duplicates (A,B) vs (B,A).
1713
+ *
1714
+ * @param itemAID - One of the duplicate item IDs.
1715
+ * @param itemBID - The other duplicate item ID.
1716
+ * @param similarityScore - The similarity score (0.0 to 1.0).
1717
+ * @param detectionMethod - How the duplicate was detected.
1718
+ * @param contextUser - The authenticated user context.
1719
+ */
1720
+ async createDuplicateRecordIfNotExists(itemAID, itemBID, similarityScore, detectionMethod, contextUser) {
1721
+ // Canonical ordering: lower normalized ID = A
1722
+ const normalizedA = NormalizeUUID(itemAID);
1723
+ const normalizedB = NormalizeUUID(itemBID);
1724
+ const [canonicalAID, canonicalBID] = normalizedA < normalizedB
1725
+ ? [itemAID, itemBID]
1726
+ : [itemBID, itemAID];
1727
+ // Check if this pair already exists for the same detection method
1728
+ const exists = await this.duplicatePairExists(canonicalAID, canonicalBID, detectionMethod, contextUser);
1729
+ if (exists) {
1730
+ return;
1731
+ }
1732
+ const md = new Metadata();
1733
+ const duplicate = await md.GetEntityObject('MJ: Content Item Duplicates', contextUser);
1734
+ duplicate.NewRecord();
1735
+ duplicate.ContentItemAID = canonicalAID;
1736
+ duplicate.ContentItemBID = canonicalBID;
1737
+ duplicate.SimilarityScore = similarityScore;
1738
+ duplicate.DetectionMethod = detectionMethod;
1739
+ duplicate.Status = 'Pending';
1740
+ const saved = await duplicate.Save();
1741
+ if (!saved) {
1742
+ LogError(`[Dedup] Failed to save duplicate record for pair (${canonicalAID}, ${canonicalBID}) method=${detectionMethod}`);
1743
+ }
1744
+ else {
1745
+ LogStatus(`[Dedup] Detected ${detectionMethod} duplicate: (${canonicalAID}, ${canonicalBID}) score=${similarityScore}`);
1746
+ }
1747
+ }
1748
+ /**
1749
+ * Checks whether a duplicate record already exists for the given pair and detection method.
1750
+ *
1751
+ * @param canonicalAID - The canonical (ordered) ContentItemAID.
1752
+ * @param canonicalBID - The canonical (ordered) ContentItemBID.
1753
+ * @param detectionMethod - The detection method to check.
1754
+ * @param contextUser - The authenticated user context.
1755
+ * @returns True if a record already exists.
1756
+ */
1757
+ async duplicatePairExists(canonicalAID, canonicalBID, detectionMethod, contextUser) {
1758
+ const rv = new RunView();
1759
+ const result = await rv.RunView({
1760
+ EntityName: 'MJ: Content Item Duplicates',
1761
+ Fields: ['ID'],
1762
+ ExtraFilter: `ContentItemAID = '${canonicalAID}' AND ContentItemBID = '${canonicalBID}' AND DetectionMethod = '${detectionMethod}'`,
1763
+ ResultType: 'simple'
1764
+ }, contextUser);
1765
+ return result.Success && result.Results.length > 0;
1766
+ }
919
1767
  };
920
- AutotagBaseEngine = __decorate([
1768
+ AutotagBaseEngine = AutotagBaseEngine_1 = __decorate([
921
1769
  RegisterClass(BaseEngine, 'AutotagBaseEngine')
922
1770
  ], AutotagBaseEngine);
923
1771
  export { AutotagBaseEngine };