@memberjunction/content-autotagging 5.22.0 → 5.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -18
- package/dist/CloudStorage/generic/CloudStorageBase.d.ts +2 -2
- package/dist/CloudStorage/generic/CloudStorageBase.d.ts.map +1 -1
- package/dist/CloudStorage/generic/CloudStorageBase.js +2 -2
- package/dist/CloudStorage/generic/CloudStorageBase.js.map +1 -1
- package/dist/Core/generic/AutotagBase.d.ts +3 -1
- package/dist/Core/generic/AutotagBase.d.ts.map +1 -1
- package/dist/Core/generic/AutotagBase.js.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.d.ts +89 -7
- package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.js +462 -76
- package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
- package/dist/Entity/generic/AutotagEntity.d.ts +2 -2
- package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
- package/dist/Entity/generic/AutotagEntity.js +2 -2
- package/dist/Entity/generic/AutotagEntity.js.map +1 -1
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts +2 -2
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts.map +1 -1
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js +2 -2
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js.map +1 -1
- package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts +2 -2
- package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts.map +1 -1
- package/dist/RSSFeed/generic/AutotagRSSFeed.js +2 -2
- package/dist/RSSFeed/generic/AutotagRSSFeed.js.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.d.ts +2 -2
- package/dist/Websites/generic/AutotagWebsite.d.ts.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.js +2 -2
- package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
- package/package.json +11 -8
|
@@ -5,7 +5,7 @@ var __decorate = (this && this.__decorate) || function (decorators, target, key,
|
|
|
5
5
|
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
6
6
|
};
|
|
7
7
|
import { BaseEngine, Metadata, RunView, LogError, LogStatus } from '@memberjunction/core';
|
|
8
|
-
import { MJGlobal, UUIDsEqual, RegisterClass } from '@memberjunction/global';
|
|
8
|
+
import { MJGlobal, UUIDsEqual, NormalizeUUID, RegisterClass } from '@memberjunction/global';
|
|
9
9
|
import { ContentSourceTypeParams } from './content.types.js';
|
|
10
10
|
import pdfParse from 'pdf-parse';
|
|
11
11
|
import officeparser from 'officeparser';
|
|
@@ -15,9 +15,14 @@ import { toZonedTime } from 'date-fns-tz';
|
|
|
15
15
|
import axios from 'axios';
|
|
16
16
|
import * as cheerio from 'cheerio';
|
|
17
17
|
import crypto from 'crypto';
|
|
18
|
-
import {
|
|
18
|
+
import { BaseEmbeddings, GetAIAPIKey } from '@memberjunction/ai';
|
|
19
19
|
import { AIEngine } from '@memberjunction/aiengine';
|
|
20
|
+
import { AIPromptRunner } from '@memberjunction/ai-prompts';
|
|
21
|
+
import { AIPromptParams } from '@memberjunction/ai-core-plus';
|
|
20
22
|
import { TextChunker } from '@memberjunction/ai-vectors';
|
|
23
|
+
import { VectorDBBase } from '@memberjunction/ai-vectordb';
|
|
24
|
+
/** Default batch size for vectorization processing */
|
|
25
|
+
const DEFAULT_VECTORIZE_BATCH_SIZE = 20;
|
|
21
26
|
/**
|
|
22
27
|
* Core engine for content autotagging. Extends BaseEngine to cache content metadata
|
|
23
28
|
* (types, source types, file types, attributes) at startup. Uses AIEngine via composition
|
|
@@ -79,8 +84,9 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
79
84
|
}
|
|
80
85
|
/**
|
|
81
86
|
* Given a list of content items, extract the text from each and process with LLM for tagging.
|
|
87
|
+
* Items are processed in configurable batches with controlled concurrency within each batch.
|
|
82
88
|
*/
|
|
83
|
-
async ExtractTextAndProcessWithLLM(contentItems, contextUser) {
|
|
89
|
+
async ExtractTextAndProcessWithLLM(contentItems, contextUser, batchSize = DEFAULT_VECTORIZE_BATCH_SIZE, onProgress) {
|
|
84
90
|
if (!contentItems || contentItems.length === 0) {
|
|
85
91
|
LogStatus('No content items to process');
|
|
86
92
|
return;
|
|
@@ -89,16 +95,35 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
89
95
|
processRunParams.sourceID = contentItems[0].ContentSourceID;
|
|
90
96
|
processRunParams.startTime = new Date();
|
|
91
97
|
processRunParams.numItemsProcessed = contentItems.length;
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
98
|
+
let totalProcessed = 0;
|
|
99
|
+
LogStatus(`ExtractTextAndProcessWithLLM: processing ${contentItems.length} items in batches of ${batchSize}`);
|
|
100
|
+
let batchSuccesses = 0;
|
|
101
|
+
let batchFailures = 0;
|
|
102
|
+
for (let i = 0; i < contentItems.length; i += batchSize) {
|
|
103
|
+
const batch = contentItems.slice(i, i + batchSize);
|
|
104
|
+
const batchNum = Math.floor(i / batchSize) + 1;
|
|
105
|
+
let batchOk = 0, batchFail = 0;
|
|
106
|
+
const batchPromises = batch.map(async (contentItem) => {
|
|
107
|
+
try {
|
|
108
|
+
const processingParams = await this.buildProcessingParams(contentItem, contextUser);
|
|
109
|
+
await this.ProcessContentItemText(processingParams, contextUser);
|
|
110
|
+
totalProcessed++;
|
|
111
|
+
batchOk++;
|
|
112
|
+
onProgress?.(totalProcessed, contentItems.length, contentItem.Name);
|
|
113
|
+
}
|
|
114
|
+
catch (e) {
|
|
115
|
+
LogError(`Failed to process content item: ${contentItem.ID}`, undefined, e);
|
|
116
|
+
totalProcessed++;
|
|
117
|
+
batchFail++;
|
|
118
|
+
onProgress?.(totalProcessed, contentItems.length);
|
|
119
|
+
}
|
|
120
|
+
});
|
|
121
|
+
await Promise.all(batchPromises);
|
|
122
|
+
batchSuccesses += batchOk;
|
|
123
|
+
batchFailures += batchFail;
|
|
124
|
+
LogStatus(`Batch ${batchNum}: ${batchOk} succeeded, ${batchFail} failed (${totalProcessed}/${contentItems.length} total)`);
|
|
101
125
|
}
|
|
126
|
+
LogStatus(`ExtractTextAndProcessWithLLM complete: ${batchSuccesses} succeeded, ${batchFailures} failed out of ${contentItems.length}`);
|
|
102
127
|
processRunParams.endTime = new Date();
|
|
103
128
|
await this.saveProcessRun(processRunParams, contextUser);
|
|
104
129
|
}
|
|
@@ -125,88 +150,121 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
125
150
|
const LLMResults = await this.promptAndRetrieveResultsFromLLM(params, contextUser);
|
|
126
151
|
await this.saveLLMResults(LLMResults, contextUser);
|
|
127
152
|
}
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
153
|
+
/**
|
|
154
|
+
* Resolves the "Content Autotagging" prompt from the AIEngine cache.
|
|
155
|
+
* Throws if the prompt is not found or not active.
|
|
156
|
+
*/
|
|
157
|
+
getAutotagPrompt() {
|
|
158
|
+
const prompt = AIEngine.Instance.Prompts.find(p => p.Name === 'Content Autotagging');
|
|
159
|
+
if (!prompt) {
|
|
160
|
+
throw new Error('AI Prompt "Content Autotagging" not found. Ensure the prompt metadata has been synced to the database.');
|
|
132
161
|
}
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
throw new Error(`Failed to create LLM instance for driver ${model.DriverClass}`);
|
|
162
|
+
if (prompt.Status !== 'Active') {
|
|
163
|
+
throw new Error(`AI Prompt "Content Autotagging" is not active (Status: ${prompt.Status})`);
|
|
136
164
|
}
|
|
137
|
-
|
|
165
|
+
return prompt;
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Builds template data for the autotagging prompt from processing params and chunk context.
|
|
169
|
+
*/
|
|
170
|
+
buildPromptData(params, chunk, previousResults) {
|
|
171
|
+
const contentType = this.GetContentTypeName(params.contentTypeID);
|
|
172
|
+
const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
|
|
173
|
+
const additionalAttributePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
|
|
174
|
+
const hasPreviousResults = Object.keys(previousResults).length > 0;
|
|
175
|
+
return {
|
|
176
|
+
contentType,
|
|
177
|
+
contentSourceType,
|
|
178
|
+
minTags: params.minTags,
|
|
179
|
+
maxTags: params.maxTags,
|
|
180
|
+
additionalAttributePrompts,
|
|
181
|
+
contentText: chunk,
|
|
182
|
+
previousResults: hasPreviousResults ? JSON.stringify(previousResults) : undefined,
|
|
183
|
+
};
|
|
184
|
+
}
|
|
185
|
+
async promptAndRetrieveResultsFromLLM(params, contextUser) {
|
|
186
|
+
await AIEngine.Instance.Config(false, contextUser);
|
|
187
|
+
const prompt = this.getAutotagPrompt();
|
|
188
|
+
// Determine token limit for chunking: use override model if set, else first prompt-model, else a default
|
|
189
|
+
const tokenLimit = this.resolveTokenLimit(params.modelID);
|
|
190
|
+
const chunks = this.chunkExtractedText(params.text, tokenLimit);
|
|
138
191
|
let LLMResults = {};
|
|
139
192
|
const startTime = new Date();
|
|
140
193
|
for (const chunk of chunks) {
|
|
141
|
-
|
|
142
|
-
LLMResults = await this.processChunkWithLLM(llm, systemPrompt, userPrompt, LLMResults, model.APIName);
|
|
194
|
+
LLMResults = await this.processChunkWithPromptRunner(prompt, params, chunk, LLMResults, contextUser);
|
|
143
195
|
}
|
|
144
196
|
LLMResults.processStartTime = startTime;
|
|
145
197
|
LLMResults.processEndTime = new Date();
|
|
146
198
|
LLMResults.contentItemID = params.contentItemID;
|
|
147
199
|
return LLMResults;
|
|
148
200
|
}
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
try {
|
|
161
|
-
JSONQueryResponse = JSON.parse(queryResponse);
|
|
201
|
+
/**
|
|
202
|
+
* Resolves the input token limit for chunking. Uses the model specified by modelID if available,
|
|
203
|
+
* otherwise falls back to a conservative default.
|
|
204
|
+
*/
|
|
205
|
+
resolveTokenLimit(modelID) {
|
|
206
|
+
const DEFAULT_TOKEN_LIMIT = 100000;
|
|
207
|
+
if (modelID) {
|
|
208
|
+
const model = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, modelID));
|
|
209
|
+
if (model) {
|
|
210
|
+
return model.InputTokenLimit;
|
|
211
|
+
}
|
|
162
212
|
}
|
|
163
|
-
|
|
164
|
-
|
|
213
|
+
return DEFAULT_TOKEN_LIMIT;
|
|
214
|
+
}
|
|
215
|
+
/**
|
|
216
|
+
* Processes a single text chunk using AIPromptRunner and merges results.
|
|
217
|
+
* Uses the prompt's configured model by default. If ContentType.AIModelID is set,
|
|
218
|
+
* it is passed as a runtime model override via AIPromptParams.override.
|
|
219
|
+
*/
|
|
220
|
+
async processChunkWithPromptRunner(prompt, params, chunk, LLMResults, contextUser) {
|
|
221
|
+
const promptParams = new AIPromptParams();
|
|
222
|
+
promptParams.prompt = prompt;
|
|
223
|
+
promptParams.contextUser = contextUser;
|
|
224
|
+
promptParams.data = this.buildPromptData(params, chunk, LLMResults);
|
|
225
|
+
promptParams.skipValidation = false;
|
|
226
|
+
promptParams.attemptJSONRepair = true;
|
|
227
|
+
promptParams.additionalParameters = { temperature: 0.0 };
|
|
228
|
+
// If the ContentType specifies a preferred AI model, use it as a runtime override
|
|
229
|
+
if (params.modelID) {
|
|
230
|
+
promptParams.override = { modelId: params.modelID };
|
|
231
|
+
}
|
|
232
|
+
const runner = new AIPromptRunner();
|
|
233
|
+
// Per-item logging removed for cleanliness — batch-level logging in ExtractTextAndProcessWithLLM
|
|
234
|
+
const result = await runner.ExecutePrompt(promptParams);
|
|
235
|
+
if (!result.success) {
|
|
236
|
+
LogError(`AIPromptRunner FAILED for content item ${params.contentItemID}: ${result.errorMessage ?? 'no error message'}`, undefined, result);
|
|
165
237
|
return LLMResults;
|
|
166
238
|
}
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
239
|
+
// Parse the result — AIPromptRunner may return a raw JSON string or a parsed object
|
|
240
|
+
let chunkResult = null;
|
|
241
|
+
if (typeof result.result === 'string') {
|
|
242
|
+
try {
|
|
243
|
+
chunkResult = JSON.parse(result.result);
|
|
244
|
+
}
|
|
245
|
+
catch {
|
|
246
|
+
LogError(`Failed to parse LLM result as JSON for item ${params.contentItemID}: ${String(result.result).substring(0, 200)}`);
|
|
247
|
+
return LLMResults;
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
else {
|
|
251
|
+
chunkResult = result.result;
|
|
252
|
+
}
|
|
253
|
+
// Merge results from this chunk into the accumulated results
|
|
254
|
+
if (chunkResult) {
|
|
255
|
+
for (const key in chunkResult) {
|
|
256
|
+
const value = chunkResult[key];
|
|
257
|
+
if (value !== null) {
|
|
258
|
+
LLMResults[key] = value;
|
|
259
|
+
}
|
|
171
260
|
}
|
|
172
261
|
}
|
|
173
262
|
return LLMResults;
|
|
174
263
|
}
|
|
175
|
-
async getLLMPrompts(params, chunk, LLMResults, contextUser) {
|
|
176
|
-
const contentType = this.GetContentTypeName(params.contentTypeID);
|
|
177
|
-
const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
|
|
178
|
-
const additionalContentTypePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
|
|
179
|
-
const systemPrompt = `You are a highly skilled text analysis assistant. You have decades of experience and pride yourself on your attention to detail and ability to capture both accurate information, as well as tone and subtext.
|
|
180
|
-
Your task is to accurately extract key information from a provided piece of text based on a series of prompts. You are provided with text that should be a ${contentType}, that has been extracted from a ${contentSourceType}.
|
|
181
|
-
The text MUST be of the type ${contentType} for the subsequent processing.`;
|
|
182
|
-
const userPrompt = `
|
|
183
|
-
If the provided text does not actually appear to be of the type ${contentType}, please disregard everything in the instructions after this and return this exact JSON response: { isValidContent: false (as a boolean) }.
|
|
184
|
-
Assuming the type of the text is in fact from a ${contentType}, please extract the title of the provided text, a short summary of the provided documents, as well as between ${params.minTags} and ${params.maxTags} topical key words that are most relevant to the text.
|
|
185
|
-
If there is no title explicitly provided in the text, please provide a title that you think best represents the text.
|
|
186
|
-
Please provide the keywords in a list format.
|
|
187
|
-
Make sure the response is just the json file without and formatting or code blocks, and strictly following the format below. Please don't include a greeting in the response, only output the json file:
|
|
188
|
-
|
|
189
|
-
{
|
|
190
|
-
"title": (title here),
|
|
191
|
-
"description": (description here),
|
|
192
|
-
"keywords": (list keywords here),
|
|
193
|
-
"isValidContent": true (as a boolean)
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
${additionalContentTypePrompts}
|
|
197
|
-
|
|
198
|
-
Please make sure the response in is valid JSON format.
|
|
199
|
-
|
|
200
|
-
You are also provided with the results so far as additional context, please use them to formulate the best results given the provided text: ${JSON.stringify(LLMResults)}
|
|
201
|
-
The supplied text is: ${chunk}
|
|
202
|
-
`;
|
|
203
|
-
return { systemPrompt, userPrompt };
|
|
204
|
-
}
|
|
205
264
|
async saveLLMResults(LLMResults, contextUser) {
|
|
206
265
|
if (LLMResults.isValidContent === true) {
|
|
207
266
|
await this.saveResultsToContentItemAttribute(LLMResults, contextUser);
|
|
208
267
|
await this.saveContentItemTags(LLMResults.contentItemID, LLMResults, contextUser);
|
|
209
|
-
LogStatus(`Results for content item ${LLMResults.contentItemID} saved successfully`);
|
|
210
268
|
}
|
|
211
269
|
else {
|
|
212
270
|
await this.deleteInvalidContentItem(LLMResults.contentItemID, contextUser);
|
|
@@ -270,14 +328,28 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
270
328
|
const keywords = LLMResults.keywords;
|
|
271
329
|
if (!keywords || !Array.isArray(keywords))
|
|
272
330
|
return;
|
|
331
|
+
// Normalize keywords — support both formats:
|
|
332
|
+
// Old: ["keyword1", "keyword2"]
|
|
333
|
+
// New: [{ tag: "keyword1", weight: 0.95 }, { tag: "keyword2", weight: 0.7 }]
|
|
334
|
+
const normalizedTags = keywords.map((kw) => {
|
|
335
|
+
if (typeof kw === 'string') {
|
|
336
|
+
return { tag: kw, weight: 1.0 };
|
|
337
|
+
}
|
|
338
|
+
const obj = kw;
|
|
339
|
+
return {
|
|
340
|
+
tag: obj.tag || obj.keyword || String(kw),
|
|
341
|
+
weight: typeof obj.weight === 'number' ? Math.max(0, Math.min(1, obj.weight)) : 0.5,
|
|
342
|
+
};
|
|
343
|
+
});
|
|
273
344
|
const BATCH_SIZE = 10;
|
|
274
|
-
for (let i = 0; i <
|
|
275
|
-
const batch =
|
|
276
|
-
await Promise.all(batch.map(async (
|
|
345
|
+
for (let i = 0; i < normalizedTags.length; i += BATCH_SIZE) {
|
|
346
|
+
const batch = normalizedTags.slice(i, i + BATCH_SIZE);
|
|
347
|
+
await Promise.all(batch.map(async (item) => {
|
|
277
348
|
const contentItemTag = await md.GetEntityObject('MJ: Content Item Tags', contextUser);
|
|
278
349
|
contentItemTag.NewRecord();
|
|
279
350
|
contentItemTag.ItemID = contentItemID;
|
|
280
|
-
contentItemTag.Tag =
|
|
351
|
+
contentItemTag.Tag = item.tag;
|
|
352
|
+
contentItemTag.Set('Weight', item.weight);
|
|
281
353
|
await contentItemTag.Save();
|
|
282
354
|
}));
|
|
283
355
|
}
|
|
@@ -530,6 +602,320 @@ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
|
530
602
|
throw new Error(`File type '${fileExtension}' not supported`);
|
|
531
603
|
}
|
|
532
604
|
}
|
|
605
|
+
// ---- Direct Vectorization ----
|
|
606
|
+
/**
|
|
607
|
+
* Embeds content items and upserts them to the appropriate vector index.
|
|
608
|
+
* Items are grouped by their resolved (embeddingModel + vectorIndex) pair — derived
|
|
609
|
+
* from per-ContentSource overrides, per-ContentType defaults, or the global fallback
|
|
610
|
+
* (first active VectorIndex). Each group is processed in configurable batches with
|
|
611
|
+
* parallel upserts within each batch.
|
|
612
|
+
*/
|
|
613
|
+
async VectorizeContentItems(items, contextUser, onProgress, batchSize = DEFAULT_VECTORIZE_BATCH_SIZE) {
|
|
614
|
+
const eligible = items.filter(i => i.Text && i.Text.trim().length > 0);
|
|
615
|
+
if (eligible.length === 0) {
|
|
616
|
+
LogStatus('VectorizeContentItems: no items with text to vectorize');
|
|
617
|
+
return { vectorized: 0, skipped: items.length };
|
|
618
|
+
}
|
|
619
|
+
// Ensure AIEngine is loaded so we can resolve the embedding model
|
|
620
|
+
await AIEngine.Instance.Config(false, contextUser);
|
|
621
|
+
// Load content sources + types for per-item infrastructure resolution
|
|
622
|
+
const { sourceMap, typeMap } = await this.loadContentSourceAndTypeMaps(eligible, contextUser);
|
|
623
|
+
// Group items by their resolved (embeddingModelID + vectorIndexID) pair
|
|
624
|
+
const groups = this.groupItemsByInfrastructure(eligible, sourceMap, typeMap);
|
|
625
|
+
// Load tags for all items in one query
|
|
626
|
+
const tagMap = await this.loadTagsForItems(eligible, contextUser);
|
|
627
|
+
let vectorized = 0;
|
|
628
|
+
let processed = 0;
|
|
629
|
+
for (const [groupKey, groupItems] of groups) {
|
|
630
|
+
const infra = await this.resolveGroupInfrastructure(groupKey, contextUser);
|
|
631
|
+
const groupVectorized = await this.vectorizeGroup(groupItems, infra, tagMap, batchSize, (batchProcessed) => {
|
|
632
|
+
processed += batchProcessed;
|
|
633
|
+
onProgress?.(Math.min(processed, eligible.length), eligible.length);
|
|
634
|
+
});
|
|
635
|
+
vectorized += groupVectorized;
|
|
636
|
+
}
|
|
637
|
+
LogStatus(`VectorizeContentItems: ${vectorized} vectorized, ${items.length - eligible.length} skipped (empty text)`);
|
|
638
|
+
return { vectorized, skipped: items.length - eligible.length };
|
|
639
|
+
}
|
|
640
|
+
/**
|
|
641
|
+
* Process a single infrastructure group: embed texts in batches and upsert to vector DB.
|
|
642
|
+
* Upserts within each batch run in parallel for throughput.
|
|
643
|
+
*/
|
|
644
|
+
async vectorizeGroup(items, infra, tagMap, batchSize, onBatchComplete) {
|
|
645
|
+
let vectorized = 0;
|
|
646
|
+
for (let i = 0; i < items.length; i += batchSize) {
|
|
647
|
+
const batch = items.slice(i, i + batchSize);
|
|
648
|
+
const texts = batch.map(item => this.buildEmbeddingText(item));
|
|
649
|
+
const embedResult = await infra.embedding.EmbedTexts({ texts, model: infra.embeddingModelName });
|
|
650
|
+
if (!embedResult.vectors || embedResult.vectors.length !== batch.length) {
|
|
651
|
+
LogError(`VectorizeContentItems: embedding returned ${embedResult.vectors?.length ?? 0} vectors for ${batch.length} texts`);
|
|
652
|
+
onBatchComplete(batch.length);
|
|
653
|
+
continue;
|
|
654
|
+
}
|
|
655
|
+
const records = batch.map((item, idx) => ({
|
|
656
|
+
id: this.contentItemVectorId(item.ID),
|
|
657
|
+
values: embedResult.vectors[idx],
|
|
658
|
+
metadata: this.buildVectorMetadata(item, tagMap.get(item.ID))
|
|
659
|
+
}));
|
|
660
|
+
// Upsert records in parallel sub-batches for throughput
|
|
661
|
+
const UPSERT_CHUNK = 50;
|
|
662
|
+
const upsertPromises = [];
|
|
663
|
+
for (let j = 0; j < records.length; j += UPSERT_CHUNK) {
|
|
664
|
+
const chunk = records.slice(j, j + UPSERT_CHUNK);
|
|
665
|
+
upsertPromises.push(Promise.resolve(infra.vectorDB.CreateRecords(chunk, infra.indexName)));
|
|
666
|
+
}
|
|
667
|
+
const responses = await Promise.all(upsertPromises);
|
|
668
|
+
let batchSuccess = true;
|
|
669
|
+
for (const response of responses) {
|
|
670
|
+
if (!response.success) {
|
|
671
|
+
LogError(`VectorizeContentItems: upsert failed: ${response.message}`);
|
|
672
|
+
batchSuccess = false;
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
if (batchSuccess) {
|
|
676
|
+
vectorized += batch.length;
|
|
677
|
+
}
|
|
678
|
+
onBatchComplete(batch.length);
|
|
679
|
+
}
|
|
680
|
+
return vectorized;
|
|
681
|
+
}
|
|
682
|
+
/**
|
|
683
|
+
* Load content source and content type records for all unique source/type IDs
|
|
684
|
+
* referenced by the given items. Returns maps keyed by normalized ID.
|
|
685
|
+
*/
|
|
686
|
+
async loadContentSourceAndTypeMaps(items, contextUser) {
|
|
687
|
+
const sourceIds = [...new Set(items.map(i => i.ContentSourceID))];
|
|
688
|
+
const typeIds = [...new Set(items.map(i => i.ContentTypeID))];
|
|
689
|
+
const rv = new RunView();
|
|
690
|
+
const [sourceResult, typeResult] = await rv.RunViews([
|
|
691
|
+
{
|
|
692
|
+
EntityName: 'MJ: Content Sources',
|
|
693
|
+
ExtraFilter: `ID IN (${sourceIds.map(id => `'${id}'`).join(',')})`,
|
|
694
|
+
ResultType: 'simple'
|
|
695
|
+
},
|
|
696
|
+
{
|
|
697
|
+
EntityName: 'MJ: Content Types',
|
|
698
|
+
ExtraFilter: `ID IN (${typeIds.map(id => `'${id}'`).join(',')})`,
|
|
699
|
+
ResultType: 'simple'
|
|
700
|
+
}
|
|
701
|
+
], contextUser);
|
|
702
|
+
const sourceMap = new Map();
|
|
703
|
+
if (sourceResult.Success) {
|
|
704
|
+
for (const row of sourceResult.Results) {
|
|
705
|
+
const rec = row;
|
|
706
|
+
sourceMap.set(NormalizeUUID(rec['ID']), rec);
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
const typeMap = new Map();
|
|
710
|
+
if (typeResult.Success) {
|
|
711
|
+
for (const row of typeResult.Results) {
|
|
712
|
+
const rec = row;
|
|
713
|
+
typeMap.set(NormalizeUUID(rec['ID']), rec);
|
|
714
|
+
}
|
|
715
|
+
}
|
|
716
|
+
return { sourceMap, typeMap };
|
|
717
|
+
}
|
|
718
|
+
/**
|
|
719
|
+
* Resolve the (embeddingModelID, vectorIndexID) pair for a content item using
|
|
720
|
+
* the cascade: ContentSource override -> ContentType default -> null (global fallback).
|
|
721
|
+
*/
|
|
722
|
+
resolveItemInfrastructureIds(item, sourceMap, typeMap) {
|
|
723
|
+
const source = sourceMap.get(NormalizeUUID(item.ContentSourceID));
|
|
724
|
+
if (source) {
|
|
725
|
+
const srcEmbedding = source['EmbeddingModelID'];
|
|
726
|
+
const srcVector = source['VectorIndexID'];
|
|
727
|
+
if (srcEmbedding && srcVector) {
|
|
728
|
+
return { embeddingModelID: srcEmbedding, vectorIndexID: srcVector };
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
const contentType = typeMap.get(NormalizeUUID(item.ContentTypeID));
|
|
732
|
+
if (contentType) {
|
|
733
|
+
const typeEmbedding = contentType['EmbeddingModelID'];
|
|
734
|
+
const typeVector = contentType['VectorIndexID'];
|
|
735
|
+
if (typeEmbedding && typeVector) {
|
|
736
|
+
return { embeddingModelID: typeEmbedding, vectorIndexID: typeVector };
|
|
737
|
+
}
|
|
738
|
+
}
|
|
739
|
+
// Global fallback — will be resolved in resolveGroupInfrastructure
|
|
740
|
+
return { embeddingModelID: null, vectorIndexID: null };
|
|
741
|
+
}
|
|
742
|
+
/**
|
|
743
|
+
* Group items by their resolved (embeddingModelID + vectorIndexID) key.
|
|
744
|
+
* Items with the same pair share infrastructure and can be batched together.
|
|
745
|
+
*/
|
|
746
|
+
groupItemsByInfrastructure(items, sourceMap, typeMap) {
|
|
747
|
+
const groups = new Map();
|
|
748
|
+
for (const item of items) {
|
|
749
|
+
const { embeddingModelID, vectorIndexID } = this.resolveItemInfrastructureIds(item, sourceMap, typeMap);
|
|
750
|
+
const key = this.infraGroupKey(embeddingModelID, vectorIndexID);
|
|
751
|
+
const group = groups.get(key) ?? [];
|
|
752
|
+
group.push(item);
|
|
753
|
+
groups.set(key, group);
|
|
754
|
+
}
|
|
755
|
+
return groups;
|
|
756
|
+
}
|
|
757
|
+
/** Create a stable cache key for an (embeddingModelID, vectorIndexID) pair */
|
|
758
|
+
infraGroupKey(embeddingModelID, vectorIndexID) {
|
|
759
|
+
const e = embeddingModelID ? NormalizeUUID(embeddingModelID) : 'default';
|
|
760
|
+
const v = vectorIndexID ? NormalizeUUID(vectorIndexID) : 'default';
|
|
761
|
+
return `${e}|${v}`;
|
|
762
|
+
}
|
|
763
|
+
/**
|
|
764
|
+
* Resolve a group key into concrete infrastructure instances. For the 'default|default'
|
|
765
|
+
* key, falls back to the first active VectorIndex (original behavior).
|
|
766
|
+
*/
|
|
767
|
+
async resolveGroupInfrastructure(groupKey, contextUser) {
|
|
768
|
+
const [embeddingPart, vectorPart] = groupKey.split('|');
|
|
769
|
+
const isDefault = embeddingPart === 'default' || vectorPart === 'default';
|
|
770
|
+
if (isDefault) {
|
|
771
|
+
return this.getDefaultVectorInfrastructure(contextUser);
|
|
772
|
+
}
|
|
773
|
+
return this.buildVectorInfrastructure(embeddingPart, vectorPart, contextUser);
|
|
774
|
+
}
|
|
775
|
+
/**
|
|
776
|
+
* Build infrastructure from explicit embeddingModelID and vectorIndexID.
|
|
777
|
+
* Looks up the vector index by ID and the embedding model from AIEngine.
|
|
778
|
+
*/
|
|
779
|
+
async buildVectorInfrastructure(embeddingModelID, vectorIndexID, contextUser) {
|
|
780
|
+
const rv = new RunView();
|
|
781
|
+
const indexResult = await rv.RunView({
|
|
782
|
+
EntityName: 'MJ: Vector Indexes',
|
|
783
|
+
ExtraFilter: `ID='${vectorIndexID}'`,
|
|
784
|
+
ResultType: 'simple',
|
|
785
|
+
MaxRows: 1
|
|
786
|
+
}, contextUser);
|
|
787
|
+
if (!indexResult.Success || indexResult.Results.length === 0) {
|
|
788
|
+
throw new Error(`Vector index ${vectorIndexID} not found`);
|
|
789
|
+
}
|
|
790
|
+
const vectorIndex = indexResult.Results[0];
|
|
791
|
+
return this.createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser);
|
|
792
|
+
}
|
|
793
|
+
/**
|
|
794
|
+
* Fallback: resolve infrastructure from the first active VectorIndex (original behavior).
|
|
795
|
+
*/
|
|
796
|
+
async getDefaultVectorInfrastructure(contextUser) {
|
|
797
|
+
const rv = new RunView();
|
|
798
|
+
const indexResult = await rv.RunView({
|
|
799
|
+
EntityName: 'MJ: Vector Indexes',
|
|
800
|
+
ResultType: 'simple',
|
|
801
|
+
MaxRows: 1
|
|
802
|
+
}, contextUser);
|
|
803
|
+
if (!indexResult.Success || indexResult.Results.length === 0) {
|
|
804
|
+
throw new Error('No vector indexes found — create one in the Configuration tab first');
|
|
805
|
+
}
|
|
806
|
+
const vectorIndex = indexResult.Results[0];
|
|
807
|
+
const embeddingModelID = vectorIndex['EmbeddingModelID'];
|
|
808
|
+
return this.createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser);
|
|
809
|
+
}
|
|
810
|
+
/**
|
|
811
|
+
* Shared helper: given a vector index record and embedding model ID, resolve all
|
|
812
|
+
* driver instances needed for embedding + upsert.
|
|
813
|
+
*/
|
|
814
|
+
async createInfrastructureFromIndex(vectorIndex, embeddingModelID, contextUser) {
|
|
815
|
+
const indexName = vectorIndex['Name'];
|
|
816
|
+
const vectorDatabaseID = vectorIndex['VectorDatabaseID'];
|
|
817
|
+
const rv = new RunView();
|
|
818
|
+
const dbResult = await rv.RunView({
|
|
819
|
+
EntityName: 'MJ: Vector Databases',
|
|
820
|
+
ExtraFilter: `ID='${vectorDatabaseID}'`,
|
|
821
|
+
ResultType: 'simple',
|
|
822
|
+
MaxRows: 1
|
|
823
|
+
}, contextUser);
|
|
824
|
+
if (!dbResult.Success || dbResult.Results.length === 0) {
|
|
825
|
+
throw new Error(`Vector database ${vectorDatabaseID} not found`);
|
|
826
|
+
}
|
|
827
|
+
const vectorDBClassKey = dbResult.Results[0]['ClassKey'];
|
|
828
|
+
const aiModel = this.findEmbeddingModel(embeddingModelID);
|
|
829
|
+
const driverClass = aiModel.DriverClass;
|
|
830
|
+
const embeddingModelName = aiModel.APIName ?? aiModel.Name;
|
|
831
|
+
LogStatus(`VectorizeContentItems: USING embedding model "${aiModel.Name}" (${driverClass}), vector DB "${vectorDBClassKey}", index "${indexName}"`);
|
|
832
|
+
const embedding = this.createEmbeddingInstance(driverClass);
|
|
833
|
+
const vectorDB = this.createVectorDBInstance(vectorDBClassKey);
|
|
834
|
+
return { embedding, vectorDB, indexName, embeddingModelName };
|
|
835
|
+
}
|
|
836
|
+
/** Find an embedding model by ID in AIEngine, with helpful error reporting */
|
|
837
|
+
findEmbeddingModel(embeddingModelID) {
|
|
838
|
+
const aiModel = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, embeddingModelID));
|
|
839
|
+
if (!aiModel) {
|
|
840
|
+
const embModels = AIEngine.Instance.Models.filter(m => m.DriverClass?.includes('Embed') || m.Name?.includes('embed'));
|
|
841
|
+
LogError(`VectorizeContentItems: embeddingModelID ${embeddingModelID} NOT FOUND. Available: ${JSON.stringify(embModels.map(m => ({ id: m.ID, name: m.Name, driver: m.DriverClass })))}`);
|
|
842
|
+
throw new Error(`Embedding model ${embeddingModelID} not found in AIEngine — ensure AIEngine is configured`);
|
|
843
|
+
}
|
|
844
|
+
return aiModel;
|
|
845
|
+
}
|
|
846
|
+
/** Create a BaseEmbeddings instance for a given driver class */
|
|
847
|
+
createEmbeddingInstance(driverClass) {
|
|
848
|
+
const apiKey = GetAIAPIKey(driverClass);
|
|
849
|
+
if (!apiKey) {
|
|
850
|
+
throw new Error(`No API key found for embedding driver ${driverClass} — set AI_VENDOR_API_KEY__${driverClass} in .env`);
|
|
851
|
+
}
|
|
852
|
+
const instance = MJGlobal.Instance.ClassFactory.CreateInstance(BaseEmbeddings, driverClass, apiKey);
|
|
853
|
+
if (!instance)
|
|
854
|
+
throw new Error(`Failed to create embedding instance for ${driverClass}`);
|
|
855
|
+
return instance;
|
|
856
|
+
}
|
|
857
|
+
/** Create a VectorDBBase instance for a given class key */
|
|
858
|
+
createVectorDBInstance(classKey) {
|
|
859
|
+
const apiKey = GetAIAPIKey(classKey);
|
|
860
|
+
if (!apiKey) {
|
|
861
|
+
throw new Error(`No API key found for vector DB ${classKey} — set AI_VENDOR_API_KEY__${classKey} in .env`);
|
|
862
|
+
}
|
|
863
|
+
const instance = MJGlobal.Instance.ClassFactory.CreateInstance(VectorDBBase, classKey, apiKey);
|
|
864
|
+
if (!instance)
|
|
865
|
+
throw new Error(`Failed to create vector DB instance for ${classKey}`);
|
|
866
|
+
return instance;
|
|
867
|
+
}
|
|
868
|
+
/** SHA-1 deterministic vector ID for a content item */
|
|
869
|
+
contentItemVectorId(contentItemId) {
|
|
870
|
+
return crypto.createHash('sha1').update(`content-item_${contentItemId}`).digest('hex');
|
|
871
|
+
}
|
|
872
|
+
/** Build the text that gets embedded: Title + Description + full Text */
|
|
873
|
+
buildEmbeddingText(item) {
|
|
874
|
+
const parts = [];
|
|
875
|
+
if (item.Name)
|
|
876
|
+
parts.push(item.Name);
|
|
877
|
+
if (item.Description)
|
|
878
|
+
parts.push(item.Description);
|
|
879
|
+
if (item.Text)
|
|
880
|
+
parts.push(item.Text);
|
|
881
|
+
return parts.join('\n');
|
|
882
|
+
}
|
|
883
|
+
/** Build metadata stored alongside the vector — truncate large text fields */
|
|
884
|
+
buildVectorMetadata(item, tags) {
|
|
885
|
+
const META_TEXT_LIMIT = 1000;
|
|
886
|
+
const meta = {
|
|
887
|
+
RecordID: item.ID,
|
|
888
|
+
Entity: 'MJ: Content Items',
|
|
889
|
+
};
|
|
890
|
+
if (item.Name)
|
|
891
|
+
meta['Title'] = item.Name.substring(0, META_TEXT_LIMIT);
|
|
892
|
+
if (item.Description)
|
|
893
|
+
meta['Description'] = item.Description.substring(0, META_TEXT_LIMIT);
|
|
894
|
+
if (item.URL)
|
|
895
|
+
meta['URL'] = item.URL;
|
|
896
|
+
if (tags && tags.length > 0)
|
|
897
|
+
meta['Tags'] = tags;
|
|
898
|
+
return meta;
|
|
899
|
+
}
|
|
900
|
+
/** Load all tags for the given items in a single RunView call */
|
|
901
|
+
async loadTagsForItems(items, contextUser) {
|
|
902
|
+
const tagMap = new Map();
|
|
903
|
+
const rv = new RunView();
|
|
904
|
+
const ids = items.map(i => `'${i.ID}'`).join(',');
|
|
905
|
+
const result = await rv.RunView({
|
|
906
|
+
EntityName: 'MJ: Content Item Tags',
|
|
907
|
+
ExtraFilter: `ItemID IN (${ids})`,
|
|
908
|
+
ResultType: 'entity_object'
|
|
909
|
+
}, contextUser);
|
|
910
|
+
if (result.Success) {
|
|
911
|
+
for (const tag of result.Results) {
|
|
912
|
+
const existing = tagMap.get(tag.ItemID) ?? [];
|
|
913
|
+
existing.push(tag.Tag);
|
|
914
|
+
tagMap.set(tag.ItemID, existing);
|
|
915
|
+
}
|
|
916
|
+
}
|
|
917
|
+
return tagMap;
|
|
918
|
+
}
|
|
533
919
|
};
|
|
534
920
|
AutotagBaseEngine = __decorate([
|
|
535
921
|
RegisterClass(BaseEngine, 'AutotagBaseEngine')
|