@memberjunction/content-autotagging 5.20.0 → 5.22.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,8 @@ var __decorate = (this && this.__decorate) || function (decorators, target, key,
4
4
  else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
5
5
  return c > 3 && r && Object.defineProperty(target, key, r), r;
6
6
  };
7
- var __metadata = (this && this.__metadata) || function (k, v) {
8
- if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
9
- };
10
- import { Metadata, RunView } from '@memberjunction/core';
11
- import { RegisterClass, MJGlobal, UUIDsEqual } from '@memberjunction/global';
7
+ import { BaseEngine, Metadata, RunView, LogError, LogStatus } from '@memberjunction/core';
8
+ import { MJGlobal, UUIDsEqual, RegisterClass } from '@memberjunction/global';
12
9
  import { ContentSourceTypeParams } from './content.types.js';
13
10
  import pdfParse from 'pdf-parse';
14
11
  import officeparser from 'officeparser';
@@ -20,21 +17,72 @@ import * as cheerio from 'cheerio';
20
17
  import crypto from 'crypto';
21
18
  import { BaseLLM, GetAIAPIKey } from '@memberjunction/ai';
22
19
  import { AIEngine } from '@memberjunction/aiengine';
23
- let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
20
+ import { TextChunker } from '@memberjunction/ai-vectors';
21
+ /**
22
+ * Core engine for content autotagging. Extends BaseEngine to cache content metadata
23
+ * (types, source types, file types, attributes) at startup. Uses AIEngine via composition
24
+ * for AI model access, then delegates to LLM for text analysis and tagging.
25
+ */
26
+ let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
24
27
  constructor() {
25
- super();
28
+ super(...arguments);
29
+ // Cached metadata — loaded by BaseEngine.Config() via property configs
30
+ this._ContentTypes = [];
31
+ this._ContentSourceTypes = [];
32
+ this._ContentFileTypes = [];
33
+ this._ContentTypeAttributes = [];
34
+ this._ContentSourceTypeParams = [];
26
35
  }
27
36
  static get Instance() {
28
37
  return super.getInstance();
29
38
  }
39
+ /** All content types, cached at startup */
40
+ get ContentTypes() { return this._ContentTypes; }
41
+ /** All content source types, cached at startup */
42
+ get ContentSourceTypes() { return this._ContentSourceTypes; }
43
+ /** All content file types, cached at startup */
44
+ get ContentFileTypes() { return this._ContentFileTypes; }
45
+ /** All content type attributes, cached at startup */
46
+ get ContentTypeAttributes() { return this._ContentTypeAttributes; }
47
+ /** All content source type params, cached at startup */
48
+ get ContentSourceTypeParams() { return this._ContentSourceTypeParams; }
49
+ async Config(forceRefresh, contextUser, provider) {
50
+ const configs = [
51
+ {
52
+ Type: 'entity',
53
+ EntityName: 'MJ: Content Types',
54
+ PropertyName: '_ContentTypes',
55
+ },
56
+ {
57
+ Type: 'entity',
58
+ EntityName: 'MJ: Content Source Types',
59
+ PropertyName: '_ContentSourceTypes',
60
+ },
61
+ {
62
+ Type: 'entity',
63
+ EntityName: 'MJ: Content File Types',
64
+ PropertyName: '_ContentFileTypes',
65
+ },
66
+ {
67
+ Type: 'entity',
68
+ EntityName: 'MJ: Content Type Attributes',
69
+ PropertyName: '_ContentTypeAttributes',
70
+ },
71
+ {
72
+ Type: 'entity',
73
+ EntityName: 'MJ: Content Source Type Params',
74
+ PropertyName: '_ContentSourceTypeParams',
75
+ },
76
+ ];
77
+ await this.Load(configs, provider, forceRefresh, contextUser);
78
+ return this;
79
+ }
30
80
  /**
31
- * Given a list of content items, extract the text from each content item with the LLM and send off the required parameters to the LLM for tagging.
32
- * @param contentItems
33
- * @returns
81
+ * Given a list of content items, extract the text from each and process with LLM for tagging.
34
82
  */
35
83
  async ExtractTextAndProcessWithLLM(contentItems, contextUser) {
36
84
  if (!contentItems || contentItems.length === 0) {
37
- console.log('No content items to process');
85
+ LogStatus('No content items to process');
38
86
  return;
39
87
  }
40
88
  const processRunParams = new ProcessRunParams();
@@ -43,22 +91,11 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
43
91
  processRunParams.numItemsProcessed = contentItems.length;
44
92
  for (const contentItem of contentItems) {
45
93
  try {
46
- const processingParams = new ContentItemProcessParams();
47
- // Parameters that depend on the content item
48
- processingParams.text = contentItem.Text;
49
- processingParams.contentSourceTypeID = contentItem.ContentSourceTypeID;
50
- processingParams.contentFileTypeID = contentItem.ContentFileTypeID;
51
- processingParams.contentTypeID = contentItem.ContentTypeID;
52
- // Parameters that depend on the content type
53
- const { modelID, minTags, maxTags } = await this.getContentItemParams(processingParams.contentTypeID, contextUser);
54
- processingParams.modelID = modelID;
55
- processingParams.minTags = minTags;
56
- processingParams.maxTags = maxTags;
57
- processingParams.contentItemID = contentItem.ID;
94
+ const processingParams = await this.buildProcessingParams(contentItem, contextUser);
58
95
  await this.ProcessContentItemText(processingParams, contextUser);
59
96
  }
60
97
  catch (e) {
61
- console.error(`Failed to process content source item: ${contentItem.Get('contentItemID')}`);
98
+ LogError(`Failed to process content item: ${contentItem.ID}`, undefined, e);
62
99
  throw e;
63
100
  }
64
101
  }
@@ -66,10 +103,23 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
66
103
  await this.saveProcessRun(processRunParams, contextUser);
67
104
  }
68
105
  /**
69
- * Given processing parameters that include the text from our content item, process the text with the LLM and extract the
70
- * information related to that content type.
71
- * @param params
72
- * @returns
106
+ * Builds processing parameters for a single content item
107
+ */
108
+ async buildProcessingParams(contentItem, contextUser) {
109
+ const processingParams = new ContentItemProcessParams();
110
+ processingParams.text = contentItem.Text;
111
+ processingParams.contentSourceTypeID = contentItem.ContentSourceTypeID;
112
+ processingParams.contentFileTypeID = contentItem.ContentFileTypeID;
113
+ processingParams.contentTypeID = contentItem.ContentTypeID;
114
+ const { modelID, minTags, maxTags } = this.GetContentItemParams(processingParams.contentTypeID);
115
+ processingParams.modelID = modelID;
116
+ processingParams.minTags = minTags;
117
+ processingParams.maxTags = maxTags;
118
+ processingParams.contentItemID = contentItem.ID;
119
+ return processingParams;
120
+ }
121
+ /**
122
+ * Process a content item's text with the LLM and save results.
73
123
  */
74
124
  async ProcessContentItemText(params, contextUser) {
75
125
  const LLMResults = await this.promptAndRetrieveResultsFromLLM(params, contextUser);
@@ -77,39 +127,43 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
77
127
  }
78
128
  async promptAndRetrieveResultsFromLLM(params, contextUser) {
79
129
  const model = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, params.modelID));
130
+ if (!model) {
131
+ throw new Error(`AI Model with ID ${params.modelID} not found`);
132
+ }
80
133
  const llm = MJGlobal.Instance.ClassFactory.CreateInstance(BaseLLM, model.DriverClass, GetAIAPIKey(model.DriverClass));
81
- const tokenLimit = model.InputTokenLimit;
82
- const text = this.chunkExtractedText(params.text, model.InputTokenLimit);
134
+ if (!llm) {
135
+ throw new Error(`Failed to create LLM instance for driver ${model.DriverClass}`);
136
+ }
137
+ const chunks = this.chunkExtractedText(params.text, model.InputTokenLimit);
83
138
  let LLMResults = {};
84
139
  const startTime = new Date();
85
- for (const chunk of text) {
140
+ for (const chunk of chunks) {
86
141
  const { systemPrompt, userPrompt } = await this.getLLMPrompts(params, chunk, LLMResults, contextUser);
87
142
  LLMResults = await this.processChunkWithLLM(llm, systemPrompt, userPrompt, LLMResults, model.APIName);
88
143
  }
89
- const endTime = new Date();
90
144
  LLMResults.processStartTime = startTime;
91
- LLMResults.processEndTime = endTime;
145
+ LLMResults.processEndTime = new Date();
92
146
  LLMResults.contentItemID = params.contentItemID;
93
147
  return LLMResults;
94
148
  }
95
149
  async processChunkWithLLM(llm, systemPrompt, userPrompt, LLMResults, modelAPIName) {
96
150
  const response = await llm.ChatCompletion({
97
151
  messages: [
98
- {
99
- role: 'system',
100
- content: systemPrompt,
101
- },
102
- {
103
- role: 'user',
104
- content: userPrompt,
105
- }
152
+ { role: 'system', content: systemPrompt },
153
+ { role: 'user', content: userPrompt }
106
154
  ],
107
155
  model: modelAPIName,
108
156
  temperature: 0.0,
109
157
  });
110
158
  const queryResponse = response.data.choices[0]?.message?.content?.trim() || '';
111
- const JSONQueryResponse = JSON.parse(queryResponse);
112
- // check if the response has info to add to LLMResults
159
+ let JSONQueryResponse;
160
+ try {
161
+ JSONQueryResponse = JSON.parse(queryResponse);
162
+ }
163
+ catch (parseError) {
164
+ LogError('Failed to parse LLM response as JSON', undefined, queryResponse);
165
+ return LLMResults;
166
+ }
113
167
  for (const key in JSONQueryResponse) {
114
168
  const value = JSONQueryResponse[key];
115
169
  if (value !== null) {
@@ -119,14 +173,14 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
119
173
  return LLMResults;
120
174
  }
121
175
  async getLLMPrompts(params, chunk, LLMResults, contextUser) {
122
- const contentType = await this.getContentTypeName(params.contentTypeID, contextUser);
123
- const contentSourceType = await this.getContentSourceTypeName(params.contentSourceTypeID, contextUser);
124
- const additionalContentTypePrompts = await this.getAdditionalContentTypePrompt(params.contentTypeID, contextUser);
125
- const systemPrompt = `You are a highly skilled text analysis assistant. You have decades of experience and pride yourself on your attention to detail and ability to capture both accurate information, as well as tone and subtext.
126
- Your task is to accurately extract key information from a provided piece of text based on a series of prompts. You are provided with text that should be a ${contentType}, that has been extracted from a ${contentSourceType}.
176
+ const contentType = this.GetContentTypeName(params.contentTypeID);
177
+ const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
178
+ const additionalContentTypePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
179
+ const systemPrompt = `You are a highly skilled text analysis assistant. You have decades of experience and pride yourself on your attention to detail and ability to capture both accurate information, as well as tone and subtext.
180
+ Your task is to accurately extract key information from a provided piece of text based on a series of prompts. You are provided with text that should be a ${contentType}, that has been extracted from a ${contentSourceType}.
127
181
  The text MUST be of the type ${contentType} for the subsequent processing.`;
128
182
  const userPrompt = `
129
- If the provided text does not actually appear to be of the type ${contentType}, please disregard everything in the instructions after this and return this exact JSON response: { isValidContent: false (as a boolean) }.
183
+ If the provided text does not actually appear to be of the type ${contentType}, please disregard everything in the instructions after this and return this exact JSON response: { isValidContent: false (as a boolean) }.
130
184
  Assuming the type of the text is in fact from a ${contentType}, please extract the title of the provided text, a short summary of the provided documents, as well as between ${params.minTags} and ${params.maxTags} topical key words that are most relevant to the text.
131
185
  If there is no title explicitly provided in the text, please provide a title that you think best represents the text.
132
186
  Please provide the keywords in a list format.
@@ -135,13 +189,13 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
135
189
  {
136
190
  "title": (title here),
137
191
  "description": (description here),
138
- "keywords": (list keywords here),
192
+ "keywords": (list keywords here),
139
193
  "isValidContent": true (as a boolean)
140
194
  }
141
195
 
142
196
  ${additionalContentTypePrompts}
143
197
 
144
- Please make sure the response in is valid JSON format.
198
+ Please make sure the response in is valid JSON format.
145
199
 
146
200
  You are also provided with the results so far as additional context, please use them to formulate the best results given the provided text: ${JSON.stringify(LLMResults)}
147
201
  The supplied text is: ${chunk}
@@ -150,10 +204,9 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
150
204
  }
151
205
  async saveLLMResults(LLMResults, contextUser) {
152
206
  if (LLMResults.isValidContent === true) {
153
- // Only save results if the content is of the type that we expected.
154
207
  await this.saveResultsToContentItemAttribute(LLMResults, contextUser);
155
208
  await this.saveContentItemTags(LLMResults.contentItemID, LLMResults, contextUser);
156
- console.log(`Results for content item ${LLMResults.contentItemID} saved successfully`);
209
+ LogStatus(`Results for content item ${LLMResults.contentItemID} saved successfully`);
157
210
  }
158
211
  else {
159
212
  await this.deleteInvalidContentItem(LLMResults.contentItemID, contextUser);
@@ -165,116 +218,124 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
165
218
  await contentItem.Load(contentItemID);
166
219
  await contentItem.Delete();
167
220
  }
221
+ /**
222
+ * Chunks text using the shared TextChunker utility for token-aware splitting.
223
+ * Falls back to simple character-based splitting when TextChunker is not available.
224
+ */
168
225
  chunkExtractedText(text, tokenLimit) {
169
226
  try {
170
- const textLimit = Math.ceil(tokenLimit / 1.5); // bit of a conservatice estimate to ensure there is room for the additional prompts
171
- if (text.length <= textLimit) {
172
- // No need to chunk the text
227
+ const maxChunkTokens = Math.ceil(tokenLimit / 1.5);
228
+ if (text.length <= maxChunkTokens * 4) {
173
229
  return [text];
174
230
  }
175
- const numChunks = Math.ceil(text.length / textLimit);
176
- const chunkSize = Math.ceil(text.length / numChunks);
177
- const chunks = [];
178
- for (let i = 0; i < numChunks; i++) {
179
- const start = i * chunkSize;
180
- const end = (i + 1) * chunkSize;
181
- chunks.push(text.slice(start, end));
231
+ try {
232
+ const chunkParams = {
233
+ Text: text,
234
+ MaxChunkTokens: maxChunkTokens,
235
+ OverlapTokens: Math.ceil(maxChunkTokens * 0.1),
236
+ Strategy: 'sentence',
237
+ };
238
+ const chunks = TextChunker.ChunkText(chunkParams);
239
+ return chunks.map(c => c.Text);
240
+ }
241
+ catch {
242
+ return this.fallbackChunkText(text, maxChunkTokens);
182
243
  }
183
- return chunks;
184
244
  }
185
- catch (e) {
186
- console.log('Could not chunk the text');
245
+ catch {
246
+ LogError('Could not chunk the text');
187
247
  return [text];
188
248
  }
189
249
  }
190
250
  /**
191
- * Given the processing results from the LLM and the Content Element Item that was saved to the database, this function saves the tags as Content Element Tags in the database.
192
- * @param md: The metadata object to save the tags
193
- * @param contentElementItem: The content element item that was saved to the database
194
- * @param results: The results of the processing from the LLM
195
- * @param contextUser: The user context to save the tags
196
- * @returns
251
+ * Simple character-based chunking as fallback
252
+ */
253
+ fallbackChunkText(text, textLimit) {
254
+ const numChunks = Math.ceil(text.length / textLimit);
255
+ const chunkSize = Math.ceil(text.length / numChunks);
256
+ const chunks = [];
257
+ for (let i = 0; i < numChunks; i++) {
258
+ const start = i * chunkSize;
259
+ const end = (i + 1) * chunkSize;
260
+ chunks.push(text.slice(start, end));
261
+ }
262
+ return chunks;
263
+ }
264
+ /**
265
+ * Saves keyword tags from LLM results as Content Item Tags.
266
+ * Uses batched saves for better performance.
197
267
  */
198
268
  async saveContentItemTags(contentItemID, LLMResults, contextUser) {
199
269
  const md = new Metadata();
200
- for (const keyword of LLMResults.keywords) {
201
- const contentItemTags = await md.GetEntityObject('MJ: Content Item Tags', contextUser);
202
- contentItemTags.NewRecord();
203
- contentItemTags.ItemID = contentItemID;
204
- contentItemTags.Tag = keyword;
205
- await contentItemTags.Save();
270
+ const keywords = LLMResults.keywords;
271
+ if (!keywords || !Array.isArray(keywords))
272
+ return;
273
+ const BATCH_SIZE = 10;
274
+ for (let i = 0; i < keywords.length; i += BATCH_SIZE) {
275
+ const batch = keywords.slice(i, i + BATCH_SIZE);
276
+ await Promise.all(batch.map(async (keyword) => {
277
+ const contentItemTag = await md.GetEntityObject('MJ: Content Item Tags', contextUser);
278
+ contentItemTag.NewRecord();
279
+ contentItemTag.ItemID = contentItemID;
280
+ contentItemTag.Tag = keyword;
281
+ await contentItemTag.Save();
282
+ }));
206
283
  }
207
284
  }
285
+ /**
286
+ * Saves LLM-extracted attributes to the database.
287
+ * Updates content item name/description, then creates attribute records for other fields.
288
+ */
208
289
  async saveResultsToContentItemAttribute(LLMResults, contextUser) {
209
290
  const md = new Metadata();
210
- for (const key in LLMResults) {
211
- // Overwrite name of content item with title if it exists
212
- if (key === 'title') {
213
- const ID = LLMResults.contentItemID;
214
- const contentItem = await md.GetEntityObject('MJ: Content Items', contextUser);
215
- await contentItem.Load(ID);
291
+ const contentItemID = LLMResults.contentItemID;
292
+ const skipKeys = new Set(['keywords', 'processStartTime', 'processEndTime', 'contentItemID', 'isValidContent']);
293
+ // Update title and description on the content item
294
+ if (LLMResults.title || LLMResults.description) {
295
+ const contentItem = await md.GetEntityObject('MJ: Content Items', contextUser);
296
+ await contentItem.Load(contentItemID);
297
+ if (LLMResults.title)
216
298
  contentItem.Name = LLMResults.title;
217
- await contentItem.Save();
218
- }
219
- if (key === 'description') {
220
- const ID = LLMResults.contentItemID;
221
- const contentItem = await md.GetEntityObject('MJ: Content Items', contextUser);
222
- await contentItem.Load(ID);
299
+ if (LLMResults.description)
223
300
  contentItem.Description = LLMResults.description;
224
- await contentItem.Save();
225
- }
226
- if (key !== 'keywords' && key !== 'processStartTime' && key !== 'processEndTime' && key !== 'contentItemID' && key !== 'isValidContent') {
301
+ await contentItem.Save();
302
+ }
303
+ // Create attribute records for remaining fields
304
+ const attributeEntries = Object.entries(LLMResults).filter(([key]) => !skipKeys.has(key) && key !== 'title' && key !== 'description');
305
+ const BATCH_SIZE = 10;
306
+ for (let i = 0; i < attributeEntries.length; i += BATCH_SIZE) {
307
+ const batch = attributeEntries.slice(i, i + BATCH_SIZE);
308
+ await Promise.all(batch.map(async ([key, value]) => {
227
309
  const contentItemAttribute = await md.GetEntityObject('MJ: Content Item Attributes', contextUser);
228
310
  contentItemAttribute.NewRecord();
229
- //Value should be a string, if its a null or undefined value, set it to an empty string
230
- const value = LLMResults[key] || '';
231
- contentItemAttribute.ContentItemID = LLMResults.contentItemID;
311
+ contentItemAttribute.ContentItemID = contentItemID;
232
312
  contentItemAttribute.Name = key;
233
- contentItemAttribute.Value = value;
313
+ contentItemAttribute.Value = value != null ? String(value) : '';
234
314
  await contentItemAttribute.Save();
235
- }
315
+ }));
236
316
  }
237
317
  }
238
- /***
239
- * Retrieves all of the content sources of a given content source type data from the database.
240
- * @param contextUser: The user context to retrieve the content source data
241
- * @returns A list of content sources
242
- */
318
+ /**
319
+ * Retrieves all content sources for a given content source type.
320
+ */
243
321
  async getAllContentSources(contextUser, contentSourceTypeID) {
244
322
  const rv = new RunView();
245
- const contentSourceResult = await rv.RunView({
323
+ const result = await rv.RunView({
246
324
  EntityName: 'MJ: Content Sources',
247
325
  ResultType: 'entity_object',
248
326
  ExtraFilter: `ContentSourceTypeID='${contentSourceTypeID}'`
249
327
  }, contextUser);
250
- try {
251
- if (contentSourceResult.Success && contentSourceResult.Results.length) {
252
- const contentSources = contentSourceResult.Results;
253
- return contentSources;
254
- }
255
- else {
256
- throw new Error(`No content sources found for content source type with ID '${contentSourceTypeID}'`);
257
- }
258
- }
259
- catch (e) {
260
- console.error(e);
261
- throw e;
328
+ if (result.Success && result.Results.length) {
329
+ return result.Results;
262
330
  }
331
+ throw new Error(`No content sources found for content source type with ID '${contentSourceTypeID}'`);
263
332
  }
264
- async setSubclassContentSourceType(subclass, contextUser) {
265
- const rv = new RunView();
266
- const results = await rv.RunView({
267
- EntityName: 'MJ: Content Source Types',
268
- ExtraFilter: `Name='${subclass}'`,
269
- ResultType: 'entity_object'
270
- }, contextUser);
271
- if (results.Success && results.Results.length) {
272
- const contentSourceType = results.Results[0];
273
- return contentSourceType.ID;
274
- }
275
- else {
276
- throw new Error(`Subclass with name ${subclass} not found`);
333
+ SetSubclassContentSourceType(subclass) {
334
+ const sourceType = this._ContentSourceTypes.find(st => st.Name === subclass);
335
+ if (!sourceType) {
336
+ throw new Error(`Content Source Type with name '${subclass}' not found in cached metadata`);
277
337
  }
338
+ return sourceType.ID;
278
339
  }
279
340
  async getContentSourceParams(contentSource, contextUser) {
280
341
  const contentSourceParams = new Map();
@@ -285,42 +346,35 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
285
346
  ResultType: 'entity_object'
286
347
  }, contextUser);
287
348
  if (results.Success && results.Results.length) {
288
- const contentSourceParamResults = results.Results;
289
- for (const contentSourceParam of contentSourceParamResults) {
290
- const params = await this.getDefaultContentSourceTypeParams(contentSourceParam.ContentSourceTypeParamID, contextUser);
349
+ for (const contentSourceParam of results.Results) {
350
+ const params = this.GetDefaultContentSourceTypeParams(contentSourceParam.ContentSourceTypeParamID);
291
351
  params.contentSourceID = contentSource.ID;
292
352
  if (contentSourceParam.Value) {
293
- // There is a provided value, so overwrite the default value
294
353
  params.value = this.castValueAsCorrectType(contentSourceParam.Value, params.type);
295
354
  }
296
355
  contentSourceParams.set(params.name, params.value);
297
356
  }
298
- return contentSourceParams;
299
357
  }
300
358
  else {
301
- console.log(`No content source params found for content source with ID ${contentSource.ID}, using default values`);
359
+ LogStatus(`No content source params found for content source with ID ${contentSource.ID}, using default values`);
302
360
  }
361
+ return contentSourceParams;
303
362
  }
304
- async getDefaultContentSourceTypeParams(contentSourceTypeParamID, contextUser) {
305
- const rv = new RunView();
306
- const results = await rv.RunView({
307
- EntityName: 'MJ: Content Source Type Params',
308
- ExtraFilter: `ID='${contentSourceTypeParamID}'`,
309
- ResultType: 'entity_object'
310
- }, contextUser);
311
- if (results.Success && results.Results.length) {
312
- const params = new ContentSourceTypeParams();
313
- params.name = results.Results[0].Get('Name');
314
- params.type = results.Results[0].Get('Type').toLowerCase();
315
- params.value = this.castValueAsCorrectType(results.Results[0].Get('DefaultValue'), params.type); // Default value in this case, can be null or overridden later
316
- return params;
363
+ GetDefaultContentSourceTypeParams(contentSourceTypeParamID) {
364
+ const result = this._ContentSourceTypeParams.find(p => UUIDsEqual(p.ID, contentSourceTypeParamID));
365
+ if (!result) {
366
+ throw new Error(`Content Source Type Param with ID '${contentSourceTypeParamID}' not found in cached metadata`);
317
367
  }
318
- throw new Error(`Content Source Type with ID '${contentSourceTypeParamID}' not found`);
368
+ const params = new ContentSourceTypeParams();
369
+ params.name = result.Name;
370
+ params.type = result.Type.toLowerCase();
371
+ params.value = this.castValueAsCorrectType(result.DefaultValue ?? '', params.type);
372
+ return params;
319
373
  }
320
374
  castValueAsCorrectType(value, type) {
321
375
  switch (type) {
322
376
  case 'number':
323
- return parseInt(value);
377
+ return parseInt(value, 10);
324
378
  case 'boolean':
325
379
  return this.stringToBoolean(value);
326
380
  case 'string':
@@ -333,28 +387,21 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
333
387
  return value;
334
388
  }
335
389
  }
336
- stringToBoolean(string) {
337
- return string === 'true';
390
+ stringToBoolean(str) {
391
+ return str === 'true';
338
392
  }
339
393
  parseStringArray(value) {
340
- const stringArray = JSON.parse(value);
341
- return stringArray;
394
+ return JSON.parse(value);
342
395
  }
343
396
  /**
344
- * Given a run date, this function converts the run date to the user's timezone and formats it as a date object.
345
- * @param lastRunDate: The retrieved last run date from the database
346
- * @returns The last run date converted to the user's timezone
397
+ * Converts a run date to the user's local timezone.
347
398
  */
348
399
  async convertLastRunDateToTimezone(lastRunDate) {
349
400
  const userTimeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
350
- const date = toZonedTime(lastRunDate, userTimeZone);
351
- return date;
401
+ return toZonedTime(lastRunDate, userTimeZone);
352
402
  }
353
403
  /**
354
- * Retrieves the last run date of the provided content source from the database. If no previous runs exist, the epoch date is returned.
355
- * @param contentSourceID: The ID of the content source to retrieve the last run date
356
- * @param contextUser: The user context to retrieve the last run date
357
- * @returns
404
+ * Retrieves the last run date for a content source. Returns epoch date if no runs exist.
358
405
  */
359
406
  async getContentSourceLastRunDate(contentSourceID, contextUser) {
360
407
  const rv = new RunView();
@@ -364,186 +411,82 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
364
411
  ResultType: 'entity_object',
365
412
  OrderBy: 'EndTime DESC'
366
413
  }, contextUser);
367
- try {
368
- if (results.Success && results.Results.length) {
369
- const contentProcessRun = results.Results[0];
370
- const lastRunDate = contentProcessRun.Get('__mj_CreatedAt');
371
- return this.convertLastRunDateToTimezone(lastRunDate);
372
- }
373
- else if (results.Success && !results.Results.length) {
374
- // Case where we do not have any previous runs for the content source, just return the epoch date
375
- return new Date(0);
376
- }
377
- else {
378
- throw new Error(`Failed to retrieve last run date for content source with ID ${contentSourceID}`);
379
- }
380
- }
381
- catch (e) {
382
- console.error(e);
383
- throw e;
384
- }
385
- }
386
- async getContentItemParams(contentTypeID, contextUser) {
387
- const rv = new RunView();
388
- const results = await rv.RunView({
389
- EntityName: 'MJ: Content Types',
390
- ExtraFilter: `ID='${contentTypeID}'`,
391
- ResultType: 'entity_object',
392
- }, contextUser);
393
414
  if (results.Success && results.Results.length) {
394
- const contentType = results.Results[0];
395
- return {
396
- modelID: contentType.AIModelID,
397
- minTags: contentType.MinTags,
398
- maxTags: contentType.MaxTags
399
- };
415
+ const lastRunDate = results.Results[0].__mj_CreatedAt;
416
+ return this.convertLastRunDateToTimezone(lastRunDate);
400
417
  }
401
- else {
402
- throw new Error(`Content Type with ID ${contentTypeID} not found`);
418
+ if (results.Success) {
419
+ return new Date(0);
403
420
  }
421
+ throw new Error(`Failed to retrieve last run date for content source with ID ${contentSourceID}`);
404
422
  }
405
- /**
406
- * Given a content source type ID, this function retrieves the content source type name from the database.
407
- * @param contentSourceTypeID
408
- * @param contextUser
409
- * @returns
410
- */
411
- async getContentSourceTypeName(contentSourceTypeID, contextUser) {
412
- const rv = new RunView();
413
- const contentFileTypeResult = await rv.RunView({
414
- EntityName: 'MJ: Content Source Types',
415
- ResultType: 'entity_object',
416
- ExtraFilter: `ID='${contentSourceTypeID}'`
417
- }, contextUser);
418
- try {
419
- if (contentFileTypeResult.Success && contentFileTypeResult.Results.length) {
420
- const contentSourceType = contentFileTypeResult.Results[0];
421
- return contentSourceType.Name;
422
- }
423
+ GetContentItemParams(contentTypeID) {
424
+ const contentType = this._ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
425
+ if (!contentType) {
426
+ throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
423
427
  }
424
- catch (e) {
425
- console.error(e);
426
- throw e;
427
- }
428
- throw new Error(`Content Source Type with ID ${contentSourceTypeID} not found`);
428
+ return {
429
+ modelID: contentType.AIModelID,
430
+ minTags: contentType.MinTags,
431
+ maxTags: contentType.MaxTags
432
+ };
429
433
  }
430
- /**
431
- * Given a content type ID, this function retrieves the content type name from the database.
432
- * @param contentTypeID
433
- * @param contextUser
434
- * @returns
435
- */
436
- async getContentTypeName(contentTypeID, contextUser) {
437
- const rv = new RunView();
438
- const contentFileTypeResult = await rv.RunView({
439
- EntityName: 'MJ: Content Types',
440
- ResultType: 'entity_object',
441
- ExtraFilter: `ID='${contentTypeID}'`
442
- }, contextUser);
443
- try {
444
- if (contentFileTypeResult.Success && contentFileTypeResult.Results.length) {
445
- const contentFileType = contentFileTypeResult.Results[0];
446
- return contentFileType.Name;
447
- }
448
- }
449
- catch (e) {
450
- console.error(e);
451
- throw e;
434
+ GetContentSourceTypeName(contentSourceTypeID) {
435
+ const sourceType = this._ContentSourceTypes.find(st => UUIDsEqual(st.ID, contentSourceTypeID));
436
+ if (!sourceType) {
437
+ throw new Error(`Content Source Type with ID ${contentSourceTypeID} not found in cached metadata`);
452
438
  }
453
- throw new Error(`Content Type with ID ${contentTypeID} not found`);
439
+ return sourceType.Name;
454
440
  }
455
- /**
456
- * Given a content file type ID, this function retrieves the content file type name from the database.
457
- * @param contentFileTypeID
458
- * @param contextUser
459
- * @returns
460
- */
461
- async getContentFileTypeName(contentFileTypeID, contextUser) {
462
- const rv = new RunView();
463
- const contentFileTypeResult = await rv.RunView({
464
- EntityName: 'MJ: Content File Types',
465
- ResultType: 'entity_object',
466
- ExtraFilter: `ID='${contentFileTypeID}'`
467
- }, contextUser);
468
- try {
469
- if (contentFileTypeResult.Success && contentFileTypeResult.Results.length) {
470
- const contentFileType = contentFileTypeResult.Results[0];
471
- return contentFileType.Name;
472
- }
441
+ GetContentTypeName(contentTypeID) {
442
+ const contentType = this._ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
443
+ if (!contentType) {
444
+ throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
473
445
  }
474
- catch (e) {
475
- console.error(e);
476
- throw e;
446
+ return contentType.Name;
447
+ }
448
+ GetContentFileTypeName(contentFileTypeID) {
449
+ const fileType = this._ContentFileTypes.find(ft => UUIDsEqual(ft.ID, contentFileTypeID));
450
+ if (!fileType) {
451
+ throw new Error(`Content File Type with ID ${contentFileTypeID} not found in cached metadata`);
477
452
  }
478
- throw new Error(`Content File Type with ID ${contentFileTypeID} not found`);
453
+ return fileType.Name;
479
454
  }
480
- async getAdditionalContentTypePrompt(contentTypeID, contextUser) {
481
- try {
482
- const rv = new RunView();
483
- const results = await rv.RunView({
484
- EntityName: 'MJ: Content Type Attributes',
485
- ExtraFilter: `ContentTypeID='${contentTypeID}'`,
486
- ResultType: 'entity_object'
487
- }, contextUser);
488
- if (results.Success && results.Results.length) {
489
- let prompt = '';
490
- for (const contentTypeAttribute of results.Results) {
491
- prompt += `${contentTypeAttribute.Prompt}. The data must be included in the above described JSON file in this key-value format: { "${contentTypeAttribute.Name}": (value of ${contentTypeAttribute.Name} here)}\n`;
492
- }
493
- return prompt;
494
- }
455
+ GetAdditionalContentTypePrompt(contentTypeID) {
456
+ const attrs = this._ContentTypeAttributes.filter(a => UUIDsEqual(a.ContentTypeID, contentTypeID));
457
+ if (attrs.length === 0)
495
458
  return '';
496
- }
497
- catch (e) {
498
- console.error(e);
499
- throw e;
500
- }
459
+ return attrs.map(attr => `${attr.Prompt}. The data must be included in the above described JSON file in this key-value format: { "${attr.Name}": (value of ${attr.Name} here)}`).join('\n');
501
460
  }
502
- /**
503
- * Given the content source parameters, this function creates a description of the content source item.
504
- * @param contentSourceParams: The parameters of the content source item
505
- * @returns The description of the content source item
506
- */
507
- async getContentItemDescription(contentSourceParams, contextUser) {
508
- const description = `${await this.getContentTypeName(contentSourceParams.ContentTypeID, contextUser)} in ${await this.getContentFileTypeName(contentSourceParams.ContentFileTypeID, contextUser)} format obtained from a ${await this.getContentSourceTypeName(contentSourceParams.ContentSourceTypeID, contextUser)} source`;
509
- return description;
461
+ GetContentItemDescription(contentSourceParams) {
462
+ const contentTypeName = this.GetContentTypeName(contentSourceParams.ContentTypeID);
463
+ const fileTypeName = this.GetContentFileTypeName(contentSourceParams.ContentFileTypeID);
464
+ const sourceTypeName = this.GetContentSourceTypeName(contentSourceParams.ContentSourceTypeID);
465
+ return `${contentTypeName} in ${fileTypeName} format obtained from a ${sourceTypeName} source`;
510
466
  }
511
467
  async getChecksumFromURL(url) {
512
468
  const response = await axios.get(url);
513
- const content = response.data;
514
- const hash = crypto.createHash('sha256').update(content).digest('hex');
515
- return hash;
469
+ const content = String(response.data);
470
+ return crypto.createHash('sha256').update(content).digest('hex');
516
471
  }
517
472
  async getChecksumFromText(text) {
518
- const hash = crypto.createHash('sha256').update(text).digest('hex');
519
- return hash;
473
+ return crypto.createHash('sha256').update(text).digest('hex');
520
474
  }
521
475
  async getContentItemIDFromURL(contentSourceParams, contextUser) {
522
476
  const url = contentSourceParams.URL;
523
477
  const rv = new RunView();
524
- try {
525
- const results = await rv.RunView({
526
- EntityName: 'MJ: Content Items',
527
- ExtraFilter: `URL='${url}' AND ContentSourceID='${contentSourceParams.contentSourceID}'`,
528
- ResultType: 'entity_object'
529
- }, contextUser);
530
- if (results.Success && results.Results.length) {
531
- const contentItem = results.Results[0];
532
- return contentItem.ID;
533
- }
534
- else {
535
- throw new Error(`Content item with URL ${url} not found`);
536
- }
537
- }
538
- catch (e) {
539
- throw new Error(`Failed to retrieve content item ID from URL: ${url}`);
478
+ const results = await rv.RunView({
479
+ EntityName: 'MJ: Content Items',
480
+ ExtraFilter: `URL='${url}' AND ContentSourceID='${contentSourceParams.contentSourceID}'`,
481
+ ResultType: 'entity_object'
482
+ }, contextUser);
483
+ if (results.Success && results.Results.length) {
484
+ return results.Results[0].ID;
540
485
  }
486
+ throw new Error(`Content item with URL ${url} not found`);
541
487
  }
542
488
  /**
543
- * Given the results of the processing from the LLM, this function saves the details of the process run in the database.
544
- * @param processRunParams: The parameters holding the details of the process run
545
- * @param contextUser: The user context to save the process run
546
- * @returns
489
+ * Saves process run metadata to the database.
547
490
  */
548
491
  async saveProcessRun(processRunParams, contextUser) {
549
492
  const md = new Metadata();
@@ -556,65 +499,40 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
556
499
  processRun.ProcessedItems = processRunParams.numItemsProcessed;
557
500
  await processRun.Save();
558
501
  }
559
- /**
560
- * Given a buffer of data, this function extracts text from a PDF file
561
- * @param dataBuffer: The buffer of data to extract text from
562
- * @returns The extracted text from the PDF file
563
- */
564
502
  async parsePDF(dataBuffer) {
565
503
  const dataPDF = await pdfParse(dataBuffer);
566
504
  return dataPDF.text;
567
505
  }
568
- /**
569
- * Given a buffer of data, this function extracts text from a DOCX file
570
- * @param dataBuffer: The buffer of data to extract text from
571
- * @returns The extracted text from the DOCX file
572
- */
573
506
  async parseDOCX(dataBuffer) {
574
507
  const dataDOCX = await officeparser.parseOffice(dataBuffer);
575
508
  return dataDOCX.toText();
576
509
  }
577
510
  async parseHTML(data) {
578
511
  try {
579
- let $;
580
- try {
581
- $ = cheerio.load(data);
582
- }
583
- catch (loadError) {
584
- console.error('Error loading data with cheerio:', loadError);
585
- return undefined;
586
- }
512
+ const $ = cheerio.load(data);
587
513
  $('script, style, nav, footer, header, .hidden').remove();
588
- const text = $('body').text().replace(/\s\s+/g, ' ').trim();
589
- return text;
514
+ return $('body').text().replace(/\s\s+/g, ' ').trim();
590
515
  }
591
516
  catch (e) {
592
- console.error(e);
517
+ LogError('Error parsing HTML', undefined, e);
593
518
  throw e;
594
519
  }
595
520
  }
596
- /**
597
- * Given a file path, as along as its one of the supported file types, this function choses the correct parser
598
- * and returns the extracted text.
599
- * @param filePath - The path to the file to extract text from
600
- * @returns - The extracted text from the file
601
- */
602
521
  async parseFileFromPath(filePath) {
603
522
  const dataBuffer = await fs.promises.readFile(filePath);
604
- const fileExtension = filePath.split('.').pop();
523
+ const fileExtension = filePath.split('.').pop()?.toLowerCase();
605
524
  switch (fileExtension) {
606
525
  case 'pdf':
607
- return await this.parsePDF(dataBuffer);
526
+ return this.parsePDF(dataBuffer);
608
527
  case 'docx':
609
- return await this.parseDOCX(dataBuffer);
528
+ return this.parseDOCX(dataBuffer);
610
529
  default:
611
- throw new Error('File type not supported');
530
+ throw new Error(`File type '${fileExtension}' not supported`);
612
531
  }
613
532
  }
614
533
  };
615
534
  AutotagBaseEngine = __decorate([
616
- RegisterClass(AIEngine, 'AutotagBaseEngine'),
617
- __metadata("design:paramtypes", [])
535
+ RegisterClass(BaseEngine, 'AutotagBaseEngine')
618
536
  ], AutotagBaseEngine);
619
537
  export { AutotagBaseEngine };
620
538
  //# sourceMappingURL=AutotagBaseEngine.js.map