@memberjunction/content-autotagging 5.22.0 → 5.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. package/README.md +78 -18
  2. package/dist/CloudStorage/generic/CloudStorageBase.d.ts +2 -2
  3. package/dist/CloudStorage/generic/CloudStorageBase.d.ts.map +1 -1
  4. package/dist/CloudStorage/generic/CloudStorageBase.js +2 -2
  5. package/dist/CloudStorage/generic/CloudStorageBase.js.map +1 -1
  6. package/dist/Core/generic/AutotagBase.d.ts +3 -1
  7. package/dist/Core/generic/AutotagBase.d.ts.map +1 -1
  8. package/dist/Core/generic/AutotagBase.js.map +1 -1
  9. package/dist/Engine/generic/AutotagBaseEngine.d.ts +89 -7
  10. package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
  11. package/dist/Engine/generic/AutotagBaseEngine.js +462 -76
  12. package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
  13. package/dist/Entity/generic/AutotagEntity.d.ts +2 -2
  14. package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
  15. package/dist/Entity/generic/AutotagEntity.js +2 -2
  16. package/dist/Entity/generic/AutotagEntity.js.map +1 -1
  17. package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts +2 -2
  18. package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts.map +1 -1
  19. package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js +2 -2
  20. package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js.map +1 -1
  21. package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts +2 -2
  22. package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts.map +1 -1
  23. package/dist/RSSFeed/generic/AutotagRSSFeed.js +2 -2
  24. package/dist/RSSFeed/generic/AutotagRSSFeed.js.map +1 -1
  25. package/dist/Websites/generic/AutotagWebsite.d.ts +2 -2
  26. package/dist/Websites/generic/AutotagWebsite.d.ts.map +1 -1
  27. package/dist/Websites/generic/AutotagWebsite.js +2 -2
  28. package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
  29. package/package.json +11 -8
@@ -5,7 +5,7 @@ var __decorate = (this && this.__decorate) || function (decorators, target, key,
5
5
  return c > 3 && r && Object.defineProperty(target, key, r), r;
6
6
  };
7
7
  import { BaseEngine, Metadata, RunView, LogError, LogStatus } from '@memberjunction/core';
8
- import { MJGlobal, UUIDsEqual, RegisterClass } from '@memberjunction/global';
8
+ import { MJGlobal, UUIDsEqual, NormalizeUUID, RegisterClass } from '@memberjunction/global';
9
9
  import { ContentSourceTypeParams } from './content.types.js';
10
10
  import pdfParse from 'pdf-parse';
11
11
  import officeparser from 'officeparser';
@@ -15,9 +15,14 @@ import { toZonedTime } from 'date-fns-tz';
15
15
  import axios from 'axios';
16
16
  import * as cheerio from 'cheerio';
17
17
  import crypto from 'crypto';
18
- import { BaseLLM, GetAIAPIKey } from '@memberjunction/ai';
18
+ import { BaseEmbeddings, GetAIAPIKey } from '@memberjunction/ai';
19
19
  import { AIEngine } from '@memberjunction/aiengine';
20
+ import { AIPromptRunner } from '@memberjunction/ai-prompts';
21
+ import { AIPromptParams } from '@memberjunction/ai-core-plus';
20
22
  import { TextChunker } from '@memberjunction/ai-vectors';
23
+ import { VectorDBBase } from '@memberjunction/ai-vectordb';
24
+ /** Default batch size for vectorization processing */
25
+ const DEFAULT_VECTORIZE_BATCH_SIZE = 20;
21
26
  /**
22
27
  * Core engine for content autotagging. Extends BaseEngine to cache content metadata
23
28
  * (types, source types, file types, attributes) at startup. Uses AIEngine via composition
@@ -79,8 +84,9 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
79
84
  }
80
85
  /**
81
86
  * Given a list of content items, extract the text from each and process with LLM for tagging.
87
+ * Items are processed in configurable batches with controlled concurrency within each batch.
82
88
  */
83
- async ExtractTextAndProcessWithLLM(contentItems, contextUser) {
89
+ async ExtractTextAndProcessWithLLM(contentItems, contextUser, batchSize = DEFAULT_VECTORIZE_BATCH_SIZE, onProgress) {
84
90
  if (!contentItems || contentItems.length === 0) {
85
91
  LogStatus('No content items to process');
86
92
  return;
@@ -89,16 +95,35 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
89
95
  processRunParams.sourceID = contentItems[0].ContentSourceID;
90
96
  processRunParams.startTime = new Date();
91
97
  processRunParams.numItemsProcessed = contentItems.length;
92
- for (const contentItem of contentItems) {
93
- try {
94
- const processingParams = await this.buildProcessingParams(contentItem, contextUser);
95
- await this.ProcessContentItemText(processingParams, contextUser);
96
- }
97
- catch (e) {
98
- LogError(`Failed to process content item: ${contentItem.ID}`, undefined, e);
99
- throw e;
100
- }
98
+ let totalProcessed = 0;
99
+ LogStatus(`ExtractTextAndProcessWithLLM: processing ${contentItems.length} items in batches of ${batchSize}`);
100
+ let batchSuccesses = 0;
101
+ let batchFailures = 0;
102
+ for (let i = 0; i < contentItems.length; i += batchSize) {
103
+ const batch = contentItems.slice(i, i + batchSize);
104
+ const batchNum = Math.floor(i / batchSize) + 1;
105
+ let batchOk = 0, batchFail = 0;
106
+ const batchPromises = batch.map(async (contentItem) => {
107
+ try {
108
+ const processingParams = await this.buildProcessingParams(contentItem, contextUser);
109
+ await this.ProcessContentItemText(processingParams, contextUser);
110
+ totalProcessed++;
111
+ batchOk++;
112
+ onProgress?.(totalProcessed, contentItems.length, contentItem.Name);
113
+ }
114
+ catch (e) {
115
+ LogError(`Failed to process content item: ${contentItem.ID}`, undefined, e);
116
+ totalProcessed++;
117
+ batchFail++;
118
+ onProgress?.(totalProcessed, contentItems.length);
119
+ }
120
+ });
121
+ await Promise.all(batchPromises);
122
+ batchSuccesses += batchOk;
123
+ batchFailures += batchFail;
124
+ LogStatus(`Batch ${batchNum}: ${batchOk} succeeded, ${batchFail} failed (${totalProcessed}/${contentItems.length} total)`);
101
125
  }
126
+ LogStatus(`ExtractTextAndProcessWithLLM complete: ${batchSuccesses} succeeded, ${batchFailures} failed out of ${contentItems.length}`);
102
127
  processRunParams.endTime = new Date();
103
128
  await this.saveProcessRun(processRunParams, contextUser);
104
129
  }
@@ -125,88 +150,121 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
125
150
  const LLMResults = await this.promptAndRetrieveResultsFromLLM(params, contextUser);
126
151
  await this.saveLLMResults(LLMResults, contextUser);
127
152
  }
128
- async promptAndRetrieveResultsFromLLM(params, contextUser) {
129
- const model = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, params.modelID));
130
- if (!model) {
131
- throw new Error(`AI Model with ID ${params.modelID} not found`);
153
+ /**
154
+ * Resolves the "Content Autotagging" prompt from the AIEngine cache.
155
+ * Throws if the prompt is not found or not active.
156
+ */
157
+ getAutotagPrompt() {
158
+ const prompt = AIEngine.Instance.Prompts.find(p => p.Name === 'Content Autotagging');
159
+ if (!prompt) {
160
+ throw new Error('AI Prompt "Content Autotagging" not found. Ensure the prompt metadata has been synced to the database.');
132
161
  }
133
- const llm = MJGlobal.Instance.ClassFactory.CreateInstance(BaseLLM, model.DriverClass, GetAIAPIKey(model.DriverClass));
134
- if (!llm) {
135
- throw new Error(`Failed to create LLM instance for driver ${model.DriverClass}`);
162
+ if (prompt.Status !== 'Active') {
163
+ throw new Error(`AI Prompt "Content Autotagging" is not active (Status: ${prompt.Status})`);
136
164
  }
137
- const chunks = this.chunkExtractedText(params.text, model.InputTokenLimit);
165
+ return prompt;
166
+ }
167
+ /**
168
+ * Builds template data for the autotagging prompt from processing params and chunk context.
169
+ */
170
+ buildPromptData(params, chunk, previousResults) {
171
+ const contentType = this.GetContentTypeName(params.contentTypeID);
172
+ const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
173
+ const additionalAttributePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
174
+ const hasPreviousResults = Object.keys(previousResults).length > 0;
175
+ return {
176
+ contentType,
177
+ contentSourceType,
178
+ minTags: params.minTags,
179
+ maxTags: params.maxTags,
180
+ additionalAttributePrompts,
181
+ contentText: chunk,
182
+ previousResults: hasPreviousResults ? JSON.stringify(previousResults) : undefined,
183
+ };
184
+ }
185
+ async promptAndRetrieveResultsFromLLM(params, contextUser) {
186
+ await AIEngine.Instance.Config(false, contextUser);
187
+ const prompt = this.getAutotagPrompt();
188
+ // Determine token limit for chunking: use override model if set, else first prompt-model, else a default
189
+ const tokenLimit = this.resolveTokenLimit(params.modelID);
190
+ const chunks = this.chunkExtractedText(params.text, tokenLimit);
138
191
  let LLMResults = {};
139
192
  const startTime = new Date();
140
193
  for (const chunk of chunks) {
141
- const { systemPrompt, userPrompt } = await this.getLLMPrompts(params, chunk, LLMResults, contextUser);
142
- LLMResults = await this.processChunkWithLLM(llm, systemPrompt, userPrompt, LLMResults, model.APIName);
194
+ LLMResults = await this.processChunkWithPromptRunner(prompt, params, chunk, LLMResults, contextUser);
143
195
  }
144
196
  LLMResults.processStartTime = startTime;
145
197
  LLMResults.processEndTime = new Date();
146
198
  LLMResults.contentItemID = params.contentItemID;
147
199
  return LLMResults;
148
200
  }
149
- async processChunkWithLLM(llm, systemPrompt, userPrompt, LLMResults, modelAPIName) {
150
- const response = await llm.ChatCompletion({
151
- messages: [
152
- { role: 'system', content: systemPrompt },
153
- { role: 'user', content: userPrompt }
154
- ],
155
- model: modelAPIName,
156
- temperature: 0.0,
157
- });
158
- const queryResponse = response.data.choices[0]?.message?.content?.trim() || '';
159
- let JSONQueryResponse;
160
- try {
161
- JSONQueryResponse = JSON.parse(queryResponse);
201
+ /**
202
+ * Resolves the input token limit for chunking. Uses the model specified by modelID if available,
203
+ * otherwise falls back to a conservative default.
204
+ */
205
+ resolveTokenLimit(modelID) {
206
+ const DEFAULT_TOKEN_LIMIT = 100000;
207
+ if (modelID) {
208
+ const model = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, modelID));
209
+ if (model) {
210
+ return model.InputTokenLimit;
211
+ }
162
212
  }
163
- catch (parseError) {
164
- LogError('Failed to parse LLM response as JSON', undefined, queryResponse);
213
+ return DEFAULT_TOKEN_LIMIT;
214
+ }
215
+ /**
216
+ * Processes a single text chunk using AIPromptRunner and merges results.
217
+ * Uses the prompt's configured model by default. If ContentType.AIModelID is set,
218
+ * it is passed as a runtime model override via AIPromptParams.override.
219
+ */
220
+ async processChunkWithPromptRunner(prompt, params, chunk, LLMResults, contextUser) {
221
+ const promptParams = new AIPromptParams();
222
+ promptParams.prompt = prompt;
223
+ promptParams.contextUser = contextUser;
224
+ promptParams.data = this.buildPromptData(params, chunk, LLMResults);
225
+ promptParams.skipValidation = false;
226
+ promptParams.attemptJSONRepair = true;
227
+ promptParams.additionalParameters = { temperature: 0.0 };
228
+ // If the ContentType specifies a preferred AI model, use it as a runtime override
229
+ if (params.modelID) {
230
+ promptParams.override = { modelId: params.modelID };
231
+ }
232
+ const runner = new AIPromptRunner();
233
+ // Per-item logging removed for cleanliness — batch-level logging in ExtractTextAndProcessWithLLM
234
+ const result = await runner.ExecutePrompt(promptParams);
235
+ if (!result.success) {
236
+ LogError(`AIPromptRunner FAILED for content item ${params.contentItemID}: ${result.errorMessage ?? 'no error message'}`, undefined, result);
165
237
  return LLMResults;
166
238
  }
167
- for (const key in JSONQueryResponse) {
168
- const value = JSONQueryResponse[key];
169
- if (value !== null) {
170
- LLMResults[key] = value;
239
+ // Parse the result AIPromptRunner may return a raw JSON string or a parsed object
240
+ let chunkResult = null;
241
+ if (typeof result.result === 'string') {
242
+ try {
243
+ chunkResult = JSON.parse(result.result);
244
+ }
245
+ catch {
246
+ LogError(`Failed to parse LLM result as JSON for item ${params.contentItemID}: ${String(result.result).substring(0, 200)}`);
247
+ return LLMResults;
248
+ }
249
+ }
250
+ else {
251
+ chunkResult = result.result;
252
+ }
253
+ // Merge results from this chunk into the accumulated results
254
+ if (chunkResult) {
255
+ for (const key in chunkResult) {
256
+ const value = chunkResult[key];
257
+ if (value !== null) {
258
+ LLMResults[key] = value;
259
+ }
171
260
  }
172
261
  }
173
262
  return LLMResults;
174
263
  }
175
- async getLLMPrompts(params, chunk, LLMResults, contextUser) {
176
- const contentType = this.GetContentTypeName(params.contentTypeID);
177
- const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
178
- const additionalContentTypePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
179
- const systemPrompt = `You are a highly skilled text analysis assistant. You have decades of experience and pride yourself on your attention to detail and ability to capture both accurate information, as well as tone and subtext.
180
- Your task is to accurately extract key information from a provided piece of text based on a series of prompts. You are provided with text that should be a ${contentType}, that has been extracted from a ${contentSourceType}.
181
- The text MUST be of the type ${contentType} for the subsequent processing.`;
182
- const userPrompt = `
183
- If the provided text does not actually appear to be of the type ${contentType}, please disregard everything in the instructions after this and return this exact JSON response: { isValidContent: false (as a boolean) }.
184
- Assuming the type of the text is in fact from a ${contentType}, please extract the title of the provided text, a short summary of the provided documents, as well as between ${params.minTags} and ${params.maxTags} topical key words that are most relevant to the text.
185
- If there is no title explicitly provided in the text, please provide a title that you think best represents the text.
186
- Please provide the keywords in a list format.
187
- Make sure the response is just the json file without and formatting or code blocks, and strictly following the format below. Please don't include a greeting in the response, only output the json file:
188
-
189
- {
190
- "title": (title here),
191
- "description": (description here),
192
- "keywords": (list keywords here),
193
- "isValidContent": true (as a boolean)
194
- }
195
-
196
- ${additionalContentTypePrompts}
197
-
198
- Please make sure the response in is valid JSON format.
199
-
200
- You are also provided with the results so far as additional context, please use them to formulate the best results given the provided text: ${JSON.stringify(LLMResults)}
201
- The supplied text is: ${chunk}
202
- `;
203
- return { systemPrompt, userPrompt };
204
- }
205
264
  async saveLLMResults(LLMResults, contextUser) {
206
265
  if (LLMResults.isValidContent === true) {
207
266
  await this.saveResultsToContentItemAttribute(LLMResults, contextUser);
208
267
  await this.saveContentItemTags(LLMResults.contentItemID, LLMResults, contextUser);
209
- LogStatus(`Results for content item ${LLMResults.contentItemID} saved successfully`);
210
268
  }
211
269
  else {
212
270
  await this.deleteInvalidContentItem(LLMResults.contentItemID, contextUser);
@@ -270,14 +328,28 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
270
328
  const keywords = LLMResults.keywords;
271
329
  if (!keywords || !Array.isArray(keywords))
272
330
  return;
331
+ // Normalize keywords — support both formats:
332
+ // Old: ["keyword1", "keyword2"]
333
+ // New: [{ tag: "keyword1", weight: 0.95 }, { tag: "keyword2", weight: 0.7 }]
334
+ const normalizedTags = keywords.map((kw) => {
335
+ if (typeof kw === 'string') {
336
+ return { tag: kw, weight: 1.0 };
337
+ }
338
+ const obj = kw;
339
+ return {
340
+ tag: obj.tag || obj.keyword || String(kw),
341
+ weight: typeof obj.weight === 'number' ? Math.max(0, Math.min(1, obj.weight)) : 0.5,
342
+ };
343
+ });
273
344
  const BATCH_SIZE = 10;
274
- for (let i = 0; i < keywords.length; i += BATCH_SIZE) {
275
- const batch = keywords.slice(i, i + BATCH_SIZE);
276
- await Promise.all(batch.map(async (keyword) => {
345
+ for (let i = 0; i < normalizedTags.length; i += BATCH_SIZE) {
346
+ const batch = normalizedTags.slice(i, i + BATCH_SIZE);
347
+ await Promise.all(batch.map(async (item) => {
277
348
  const contentItemTag = await md.GetEntityObject('MJ: Content Item Tags', contextUser);
278
349
  contentItemTag.NewRecord();
279
350
  contentItemTag.ItemID = contentItemID;
280
- contentItemTag.Tag = keyword;
351
+ contentItemTag.Tag = item.tag;
352
+ contentItemTag.Set('Weight', item.weight);
281
353
  await contentItemTag.Save();
282
354
  }));
283
355
  }
@@ -530,6 +602,320 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
530
602
  throw new Error(`File type '${fileExtension}' not supported`);
531
603
  }
532
604
  }
605
+ // ---- Direct Vectorization ----
606
+ /**
607
+ * Embeds content items and upserts them to the appropriate vector index.
608
+ * Items are grouped by their resolved (embeddingModel + vectorIndex) pair — derived
609
+ * from per-ContentSource overrides, per-ContentType defaults, or the global fallback
610
+ * (first active VectorIndex). Each group is processed in configurable batches with
611
+ * parallel upserts within each batch.
612
+ */
613
+ async VectorizeContentItems(items, contextUser, onProgress, batchSize = DEFAULT_VECTORIZE_BATCH_SIZE) {
614
+ const eligible = items.filter(i => i.Text && i.Text.trim().length > 0);
615
+ if (eligible.length === 0) {
616
+ LogStatus('VectorizeContentItems: no items with text to vectorize');
617
+ return { vectorized: 0, skipped: items.length };
618
+ }
619
+ // Ensure AIEngine is loaded so we can resolve the embedding model
620
+ await AIEngine.Instance.Config(false, contextUser);
621
+ // Load content sources + types for per-item infrastructure resolution
622
+ const { sourceMap, typeMap } = await this.loadContentSourceAndTypeMaps(eligible, contextUser);
623
+ // Group items by their resolved (embeddingModelID + vectorIndexID) pair
624
+ const groups = this.groupItemsByInfrastructure(eligible, sourceMap, typeMap);
625
+ // Load tags for all items in one query
626
+ const tagMap = await this.loadTagsForItems(eligible, contextUser);
627
+ let vectorized = 0;
628
+ let processed = 0;
629
+ for (const [groupKey, groupItems] of groups) {
630
+ const infra = await this.resolveGroupInfrastructure(groupKey, contextUser);
631
+ const groupVectorized = await this.vectorizeGroup(groupItems, infra, tagMap, batchSize, (batchProcessed) => {
632
+ processed += batchProcessed;
633
+ onProgress?.(Math.min(processed, eligible.length), eligible.length);
634
+ });
635
+ vectorized += groupVectorized;
636
+ }
637
+ LogStatus(`VectorizeContentItems: ${vectorized} vectorized, ${items.length - eligible.length} skipped (empty text)`);
638
+ return { vectorized, skipped: items.length - eligible.length };
639
+ }
640
+ /**
641
+ * Process a single infrastructure group: embed texts in batches and upsert to vector DB.
642
+ * Upserts within each batch run in parallel for throughput.
643
+ */
644
+ async vectorizeGroup(items, infra, tagMap, batchSize, onBatchComplete) {
645
+ let vectorized = 0;
646
+ for (let i = 0; i < items.length; i += batchSize) {
647
+ const batch = items.slice(i, i + batchSize);
648
+ const texts = batch.map(item => this.buildEmbeddingText(item));
649
+ const embedResult = await infra.embedding.EmbedTexts({ texts, model: infra.embeddingModelName });
650
+ if (!embedResult.vectors || embedResult.vectors.length !== batch.length) {
651
+ LogError(`VectorizeContentItems: embedding returned ${embedResult.vectors?.length ?? 0} vectors for ${batch.length} texts`);
652
+ onBatchComplete(batch.length);
653
+ continue;
654
+ }
655
+ const records = batch.map((item, idx) => ({
656
+ id: this.contentItemVectorId(item.ID),
657
+ values: embedResult.vectors[idx],
658
+ metadata: this.buildVectorMetadata(item, tagMap.get(item.ID))
659
+ }));
660
+ // Upsert records in parallel sub-batches for throughput
661
+ const UPSERT_CHUNK = 50;
662
+ const upsertPromises = [];
663
+ for (let j = 0; j < records.length; j += UPSERT_CHUNK) {
664
+ const chunk = records.slice(j, j + UPSERT_CHUNK);
665
+ upsertPromises.push(Promise.resolve(infra.vectorDB.CreateRecords(chunk, infra.indexName)));
666
+ }
667
+ const responses = await Promise.all(upsertPromises);
668
+ let batchSuccess = true;
669
+ for (const response of responses) {
670
+ if (!response.success) {
671
+ LogError(`VectorizeContentItems: upsert failed: ${response.message}`);
672
+ batchSuccess = false;
673
+ }
674
+ }
675
+ if (batchSuccess) {
676
+ vectorized += batch.length;
677
+ }
678
+ onBatchComplete(batch.length);
679
+ }
680
+ return vectorized;
681
+ }
682
+ /**
683
+ * Load content source and content type records for all unique source/type IDs
684
+ * referenced by the given items. Returns maps keyed by normalized ID.
685
+ */
686
+ async loadContentSourceAndTypeMaps(items, contextUser) {
687
+ const sourceIds = [...new Set(items.map(i => i.ContentSourceID))];
688
+ const typeIds = [...new Set(items.map(i => i.ContentTypeID))];
689
+ const rv = new RunView();
690
+ const [sourceResult, typeResult] = await rv.RunViews([
691
+ {
692
+ EntityName: 'MJ: Content Sources',
693
+ ExtraFilter: `ID IN (${sourceIds.map(id => `'${id}'`).join(',')})`,
694
+ ResultType: 'simple'
695
+ },
696
+ {
697
+ EntityName: 'MJ: Content Types',
698
+ ExtraFilter: `ID IN (${typeIds.map(id => `'${id}'`).join(',')})`,
699
+ ResultType: 'simple'
700
+ }
701
+ ], contextUser);
702
+ const sourceMap = new Map();
703
+ if (sourceResult.Success) {
704
+ for (const row of sourceResult.Results) {
705
+ const rec = row;
706
+ sourceMap.set(NormalizeUUID(rec['ID']), rec);
707
+ }
708
+ }
709
+ const typeMap = new Map();
710
+ if (typeResult.Success) {
711
+ for (const row of typeResult.Results) {
712
+ const rec = row;
713
+ typeMap.set(NormalizeUUID(rec['ID']), rec);
714
+ }
715
+ }
716
+ return { sourceMap, typeMap };
717
+ }
718
+ /**
719
+ * Resolve the (embeddingModelID, vectorIndexID) pair for a content item using
720
+ * the cascade: ContentSource override -> ContentType default -> null (global fallback).
721
+ */
722
+ resolveItemInfrastructureIds(item, sourceMap, typeMap) {
723
+ const source = sourceMap.get(NormalizeUUID(item.ContentSourceID));
724
+ if (source) {
725
+ const srcEmbedding = source['EmbeddingModelID'];
726
+ const srcVector = source['VectorIndexID'];
727
+ if (srcEmbedding && srcVector) {
728
+ return { embeddingModelID: srcEmbedding, vectorIndexID: srcVector };
729
+ }
730
+ }
731
+ const contentType = typeMap.get(NormalizeUUID(item.ContentTypeID));
732
+ if (contentType) {
733
+ const typeEmbedding = contentType['EmbeddingModelID'];
734
+ const typeVector = contentType['VectorIndexID'];
735
+ if (typeEmbedding && typeVector) {
736
+ return { embeddingModelID: typeEmbedding, vectorIndexID: typeVector };
737
+ }
738
+ }
739
+ // Global fallback — will be resolved in resolveGroupInfrastructure
740
+ return { embeddingModelID: null, vectorIndexID: null };
741
+ }
742
+ /**
743
+ * Group items by their resolved (embeddingModelID + vectorIndexID) key.
744
+ * Items with the same pair share infrastructure and can be batched together.
745
+ */
746
+ groupItemsByInfrastructure(items, sourceMap, typeMap) {
747
+ const groups = new Map();
748
+ for (const item of items) {
749
+ const { embeddingModelID, vectorIndexID } = this.resolveItemInfrastructureIds(item, sourceMap, typeMap);
750
+ const key = this.infraGroupKey(embeddingModelID, vectorIndexID);
751
+ const group = groups.get(key) ?? [];
752
+ group.push(item);
753
+ groups.set(key, group);
754
+ }
755
+ return groups;
756
+ }
757
+ /** Create a stable cache key for an (embeddingModelID, vectorIndexID) pair */
758
+ infraGroupKey(embeddingModelID, vectorIndexID) {
759
+ const e = embeddingModelID ? NormalizeUUID(embeddingModelID) : 'default';
760
+ const v = vectorIndexID ? NormalizeUUID(vectorIndexID) : 'default';
761
+ return `${e}|${v}`;
762
+ }
763
+ /**
764
+ * Resolve a group key into concrete infrastructure instances. For the 'default|default'
765
+ * key, falls back to the first active VectorIndex (original behavior).
766
+ */
767
+ async resolveGroupInfrastructure(groupKey, contextUser) {
768
+ const [embeddingPart, vectorPart] = groupKey.split('|');
769
+ const isDefault = embeddingPart === 'default' || vectorPart === 'default';
770
+ if (isDefault) {
771
+ return this.getDefaultVectorInfrastructure(contextUser);
772
+ }
773
+ return this.buildVectorInfrastructure(embeddingPart, vectorPart, contextUser);
774
+ }
775
+ /**
776
+ * Build infrastructure from explicit embeddingModelID and vectorIndexID.
777
+ * Looks up the vector index by ID and the embedding model from AIEngine.
778
+ */
779
+ async buildVectorInfrastructure(embeddingModelID, vectorIndexID, contextUser) {
780
+ const rv = new RunView();
781
+ const indexResult = await rv.RunView({
782
+ EntityName: 'MJ: Vector Indexes',
783
+ ExtraFilter: `ID='${vectorIndexID}'`,
784
+ ResultType: 'simple',
785
+ MaxRows: 1
786
+ }, contextUser);
787
+ if (!indexResult.Success || indexResult.Results.length === 0) {
788
+ throw new Error(`Vector index ${vectorIndexID} not found`);
789
+ }
790
+ const vectorIndex = indexResult.Results[0];
791
+ return this.createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser);
792
+ }
793
+ /**
794
+ * Fallback: resolve infrastructure from the first active VectorIndex (original behavior).
795
+ */
796
+ async getDefaultVectorInfrastructure(contextUser) {
797
+ const rv = new RunView();
798
+ const indexResult = await rv.RunView({
799
+ EntityName: 'MJ: Vector Indexes',
800
+ ResultType: 'simple',
801
+ MaxRows: 1
802
+ }, contextUser);
803
+ if (!indexResult.Success || indexResult.Results.length === 0) {
804
+ throw new Error('No vector indexes found — create one in the Configuration tab first');
805
+ }
806
+ const vectorIndex = indexResult.Results[0];
807
+ const embeddingModelID = vectorIndex['EmbeddingModelID'];
808
+ return this.createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser);
809
+ }
810
+ /**
811
+ * Shared helper: given a vector index record and embedding model ID, resolve all
812
+ * driver instances needed for embedding + upsert.
813
+ */
814
+ async createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser) {
815
+ const indexName = vectorIndex['Name'];
816
+ const vectorDatabaseID = vectorIndex['VectorDatabaseID'];
817
+ const rv = new RunView();
818
+ const dbResult = await rv.RunView({
819
+ EntityName: 'MJ: Vector Databases',
820
+ ExtraFilter: `ID='${vectorDatabaseID}'`,
821
+ ResultType: 'simple',
822
+ MaxRows: 1
823
+ }, contextUser);
824
+ if (!dbResult.Success || dbResult.Results.length === 0) {
825
+ throw new Error(`Vector database ${vectorDatabaseID} not found`);
826
+ }
827
+ const vectorDBClassKey = dbResult.Results[0]['ClassKey'];
828
+ const aiModel = this.findEmbeddingModel(embeddingModelID);
829
+ const driverClass = aiModel.DriverClass;
830
+ const embeddingModelName = aiModel.APIName ?? aiModel.Name;
831
+ LogStatus(`VectorizeContentItems: USING embedding model "${aiModel.Name}" (${driverClass}), vector DB "${vectorDBClassKey}", index "${indexName}"`);
832
+ const embedding = this.createEmbeddingInstance(driverClass);
833
+ const vectorDB = this.createVectorDBInstance(vectorDBClassKey);
834
+ return { embedding, vectorDB, indexName, embeddingModelName };
835
+ }
836
+ /** Find an embedding model by ID in AIEngine, with helpful error reporting */
837
+ findEmbeddingModel(embeddingModelID) {
838
+ const aiModel = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, embeddingModelID));
839
+ if (!aiModel) {
840
+ const embModels = AIEngine.Instance.Models.filter(m => m.DriverClass?.includes('Embed') || m.Name?.includes('embed'));
841
+ LogError(`VectorizeContentItems: embeddingModelID ${embeddingModelID} NOT FOUND. Available: ${JSON.stringify(embModels.map(m => ({ id: m.ID, name: m.Name, driver: m.DriverClass })))}`);
842
+ throw new Error(`Embedding model ${embeddingModelID} not found in AIEngine — ensure AIEngine is configured`);
843
+ }
844
+ return aiModel;
845
+ }
846
+ /** Create a BaseEmbeddings instance for a given driver class */
847
+ createEmbeddingInstance(driverClass) {
848
+ const apiKey = GetAIAPIKey(driverClass);
849
+ if (!apiKey) {
850
+ throw new Error(`No API key found for embedding driver ${driverClass} — set AI_VENDOR_API_KEY__${driverClass} in .env`);
851
+ }
852
+ const instance = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, driverClass, apiKey);
853
+ if (!instance)
854
+ throw new Error(`Failed to create embedding instance for ${driverClass}`);
855
+ return instance;
856
+ }
857
+ /** Create a VectorDBBase instance for a given class key */
858
+ createVectorDBInstance(classKey) {
859
+ const apiKey = GetAIAPIKey(classKey);
860
+ if (!apiKey) {
861
+ throw new Error(`No API key found for vector DB ${classKey} — set AI_VENDOR_API_KEY__${classKey} in .env`);
862
+ }
863
+ const instance = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, classKey, apiKey);
864
+ if (!instance)
865
+ throw new Error(`Failed to create vector DB instance for ${classKey}`);
866
+ return instance;
867
+ }
868
+ /** SHA-1 deterministic vector ID for a content item */
869
+ contentItemVectorId(contentItemId) {
870
+ return crypto.createHash('sha1').update(`content-item_${contentItemId}`).digest('hex');
871
+ }
872
+ /** Build the text that gets embedded: Title + Description + full Text */
873
+ buildEmbeddingText(item) {
874
+ const parts = [];
875
+ if (item.Name)
876
+ parts.push(item.Name);
877
+ if (item.Description)
878
+ parts.push(item.Description);
879
+ if (item.Text)
880
+ parts.push(item.Text);
881
+ return parts.join('\n');
882
+ }
883
+ /** Build metadata stored alongside the vector — truncate large text fields */
884
+ buildVectorMetadata(item, tags) {
885
+ const META_TEXT_LIMIT = 1000;
886
+ const meta = {
887
+ RecordID: item.ID,
888
+ Entity: 'MJ: Content Items',
889
+ };
890
+ if (item.Name)
891
+ meta['Title'] = item.Name.substring(0, META_TEXT_LIMIT);
892
+ if (item.Description)
893
+ meta['Description'] = item.Description.substring(0, META_TEXT_LIMIT);
894
+ if (item.URL)
895
+ meta['URL'] = item.URL;
896
+ if (tags && tags.length > 0)
897
+ meta['Tags'] = tags;
898
+ return meta;
899
+ }
900
+ /** Load all tags for the given items in a single RunView call */
901
+ async loadTagsForItems(items, contextUser) {
902
+ const tagMap = new Map();
903
+ const rv = new RunView();
904
+ const ids = items.map(i => `'${i.ID}'`).join(',');
905
+ const result = await rv.RunView({
906
+ EntityName: 'MJ: Content Item Tags',
907
+ ExtraFilter: `ItemID IN (${ids})`,
908
+ ResultType: 'entity_object'
909
+ }, contextUser);
910
+ if (result.Success) {
911
+ for (const tag of result.Results) {
912
+ const existing = tagMap.get(tag.ItemID) ?? [];
913
+ existing.push(tag.Tag);
914
+ tagMap.set(tag.ItemID, existing);
915
+ }
916
+ }
917
+ return tagMap;
918
+ }
533
919
  };
534
920
  AutotagBaseEngine = __decorate([
535
921
  RegisterClass(BaseEngine, 'AutotagBaseEngine')