@memberjunction/content-autotagging 5.20.0 → 5.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/CloudStorage/generic/CloudStorageBase.js +1 -1
- package/dist/CloudStorage/generic/CloudStorageBase.js.map +1 -1
- package/dist/CloudStorage/providers/AutotagAzureBlob.js +1 -1
- package/dist/CloudStorage/providers/AutotagAzureBlob.js.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.d.ts +63 -86
- package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.js +269 -351
- package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
- package/dist/Engine/generic/content.types.d.ts +2 -1
- package/dist/Engine/generic/content.types.d.ts.map +1 -1
- package/dist/Engine/generic/content.types.js.map +1 -1
- package/dist/Entity/generic/AutotagEntity.d.ts +8 -0
- package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
- package/dist/Entity/generic/AutotagEntity.js +32 -29
- package/dist/Entity/generic/AutotagEntity.js.map +1 -1
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js +2 -2
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js.map +1 -1
- package/dist/RSSFeed/generic/AutotagRSSFeed.js +2 -2
- package/dist/RSSFeed/generic/AutotagRSSFeed.js.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.js +2 -2
- package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
- package/package.json +8 -7
|
@@ -4,11 +4,8 @@ var __decorate = (this && this.__decorate) || function (decorators, target, key,
|
|
|
4
4
|
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
|
|
5
5
|
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
6
6
|
};
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
};
|
|
10
|
-
import { Metadata, RunView } from '@memberjunction/core';
|
|
11
|
-
import { RegisterClass, MJGlobal, UUIDsEqual } from '@memberjunction/global';
|
|
7
|
+
import { BaseEngine, Metadata, RunView, LogError, LogStatus } from '@memberjunction/core';
|
|
8
|
+
import { MJGlobal, UUIDsEqual, RegisterClass } from '@memberjunction/global';
|
|
12
9
|
import { ContentSourceTypeParams } from './content.types.js';
|
|
13
10
|
import pdfParse from 'pdf-parse';
|
|
14
11
|
import officeparser from 'officeparser';
|
|
@@ -20,21 +17,72 @@ import * as cheerio from 'cheerio';
|
|
|
20
17
|
import crypto from 'crypto';
|
|
21
18
|
import { BaseLLM, GetAIAPIKey } from '@memberjunction/ai';
|
|
22
19
|
import { AIEngine } from '@memberjunction/aiengine';
|
|
23
|
-
|
|
20
|
+
import { TextChunker } from '@memberjunction/ai-vectors';
|
|
21
|
+
/**
|
|
22
|
+
* Core engine for content autotagging. Extends BaseEngine to cache content metadata
|
|
23
|
+
* (types, source types, file types, attributes) at startup. Uses AIEngine via composition
|
|
24
|
+
* for AI model access, then delegates to LLM for text analysis and tagging.
|
|
25
|
+
*/
|
|
26
|
+
let AutotagBaseEngine = class AutotagBaseEngine extends BaseEngine {
|
|
24
27
|
constructor() {
|
|
25
|
-
super();
|
|
28
|
+
super(...arguments);
|
|
29
|
+
// Cached metadata — loaded by BaseEngine.Config() via property configs
|
|
30
|
+
this._ContentTypes = [];
|
|
31
|
+
this._ContentSourceTypes = [];
|
|
32
|
+
this._ContentFileTypes = [];
|
|
33
|
+
this._ContentTypeAttributes = [];
|
|
34
|
+
this._ContentSourceTypeParams = [];
|
|
26
35
|
}
|
|
27
36
|
static get Instance() {
|
|
28
37
|
return super.getInstance();
|
|
29
38
|
}
|
|
39
|
+
/** All content types, cached at startup */
|
|
40
|
+
get ContentTypes() { return this._ContentTypes; }
|
|
41
|
+
/** All content source types, cached at startup */
|
|
42
|
+
get ContentSourceTypes() { return this._ContentSourceTypes; }
|
|
43
|
+
/** All content file types, cached at startup */
|
|
44
|
+
get ContentFileTypes() { return this._ContentFileTypes; }
|
|
45
|
+
/** All content type attributes, cached at startup */
|
|
46
|
+
get ContentTypeAttributes() { return this._ContentTypeAttributes; }
|
|
47
|
+
/** All content source type params, cached at startup */
|
|
48
|
+
get ContentSourceTypeParams() { return this._ContentSourceTypeParams; }
|
|
49
|
+
async Config(forceRefresh, contextUser, provider) {
|
|
50
|
+
const configs = [
|
|
51
|
+
{
|
|
52
|
+
Type: 'entity',
|
|
53
|
+
EntityName: 'MJ: Content Types',
|
|
54
|
+
PropertyName: '_ContentTypes',
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
Type: 'entity',
|
|
58
|
+
EntityName: 'MJ: Content Source Types',
|
|
59
|
+
PropertyName: '_ContentSourceTypes',
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
Type: 'entity',
|
|
63
|
+
EntityName: 'MJ: Content File Types',
|
|
64
|
+
PropertyName: '_ContentFileTypes',
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
Type: 'entity',
|
|
68
|
+
EntityName: 'MJ: Content Type Attributes',
|
|
69
|
+
PropertyName: '_ContentTypeAttributes',
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
Type: 'entity',
|
|
73
|
+
EntityName: 'MJ: Content Source Type Params',
|
|
74
|
+
PropertyName: '_ContentSourceTypeParams',
|
|
75
|
+
},
|
|
76
|
+
];
|
|
77
|
+
await this.Load(configs, provider, forceRefresh, contextUser);
|
|
78
|
+
return this;
|
|
79
|
+
}
|
|
30
80
|
/**
|
|
31
|
-
* Given a list of content items, extract the text from each
|
|
32
|
-
* @param contentItems
|
|
33
|
-
* @returns
|
|
81
|
+
* Given a list of content items, extract the text from each and process with LLM for tagging.
|
|
34
82
|
*/
|
|
35
83
|
async ExtractTextAndProcessWithLLM(contentItems, contextUser) {
|
|
36
84
|
if (!contentItems || contentItems.length === 0) {
|
|
37
|
-
|
|
85
|
+
LogStatus('No content items to process');
|
|
38
86
|
return;
|
|
39
87
|
}
|
|
40
88
|
const processRunParams = new ProcessRunParams();
|
|
@@ -43,22 +91,11 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
43
91
|
processRunParams.numItemsProcessed = contentItems.length;
|
|
44
92
|
for (const contentItem of contentItems) {
|
|
45
93
|
try {
|
|
46
|
-
const processingParams =
|
|
47
|
-
// Parameters that depend on the content item
|
|
48
|
-
processingParams.text = contentItem.Text;
|
|
49
|
-
processingParams.contentSourceTypeID = contentItem.ContentSourceTypeID;
|
|
50
|
-
processingParams.contentFileTypeID = contentItem.ContentFileTypeID;
|
|
51
|
-
processingParams.contentTypeID = contentItem.ContentTypeID;
|
|
52
|
-
// Parameters that depend on the content type
|
|
53
|
-
const { modelID, minTags, maxTags } = await this.getContentItemParams(processingParams.contentTypeID, contextUser);
|
|
54
|
-
processingParams.modelID = modelID;
|
|
55
|
-
processingParams.minTags = minTags;
|
|
56
|
-
processingParams.maxTags = maxTags;
|
|
57
|
-
processingParams.contentItemID = contentItem.ID;
|
|
94
|
+
const processingParams = await this.buildProcessingParams(contentItem, contextUser);
|
|
58
95
|
await this.ProcessContentItemText(processingParams, contextUser);
|
|
59
96
|
}
|
|
60
97
|
catch (e) {
|
|
61
|
-
|
|
98
|
+
LogError(`Failed to process content item: ${contentItem.ID}`, undefined, e);
|
|
62
99
|
throw e;
|
|
63
100
|
}
|
|
64
101
|
}
|
|
@@ -66,10 +103,23 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
66
103
|
await this.saveProcessRun(processRunParams, contextUser);
|
|
67
104
|
}
|
|
68
105
|
/**
|
|
69
|
-
*
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
106
|
+
* Builds processing parameters for a single content item
|
|
107
|
+
*/
|
|
108
|
+
async buildProcessingParams(contentItem, contextUser) {
|
|
109
|
+
const processingParams = new ContentItemProcessParams();
|
|
110
|
+
processingParams.text = contentItem.Text;
|
|
111
|
+
processingParams.contentSourceTypeID = contentItem.ContentSourceTypeID;
|
|
112
|
+
processingParams.contentFileTypeID = contentItem.ContentFileTypeID;
|
|
113
|
+
processingParams.contentTypeID = contentItem.ContentTypeID;
|
|
114
|
+
const { modelID, minTags, maxTags } = this.GetContentItemParams(processingParams.contentTypeID);
|
|
115
|
+
processingParams.modelID = modelID;
|
|
116
|
+
processingParams.minTags = minTags;
|
|
117
|
+
processingParams.maxTags = maxTags;
|
|
118
|
+
processingParams.contentItemID = contentItem.ID;
|
|
119
|
+
return processingParams;
|
|
120
|
+
}
|
|
121
|
+
/**
|
|
122
|
+
* Process a content item's text with the LLM and save results.
|
|
73
123
|
*/
|
|
74
124
|
async ProcessContentItemText(params, contextUser) {
|
|
75
125
|
const LLMResults = await this.promptAndRetrieveResultsFromLLM(params, contextUser);
|
|
@@ -77,39 +127,43 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
77
127
|
}
|
|
78
128
|
async promptAndRetrieveResultsFromLLM(params, contextUser) {
|
|
79
129
|
const model = AIEngine.Instance.Models.find(m => UUIDsEqual(m.ID, params.modelID));
|
|
130
|
+
if (!model) {
|
|
131
|
+
throw new Error(`AI Model with ID ${params.modelID} not found`);
|
|
132
|
+
}
|
|
80
133
|
const llm = MJGlobal.Instance.ClassFactory.CreateInstance(BaseLLM, model.DriverClass, GetAIAPIKey(model.DriverClass));
|
|
81
|
-
|
|
82
|
-
|
|
134
|
+
if (!llm) {
|
|
135
|
+
throw new Error(`Failed to create LLM instance for driver ${model.DriverClass}`);
|
|
136
|
+
}
|
|
137
|
+
const chunks = this.chunkExtractedText(params.text, model.InputTokenLimit);
|
|
83
138
|
let LLMResults = {};
|
|
84
139
|
const startTime = new Date();
|
|
85
|
-
for (const chunk of
|
|
140
|
+
for (const chunk of chunks) {
|
|
86
141
|
const { systemPrompt, userPrompt } = await this.getLLMPrompts(params, chunk, LLMResults, contextUser);
|
|
87
142
|
LLMResults = await this.processChunkWithLLM(llm, systemPrompt, userPrompt, LLMResults, model.APIName);
|
|
88
143
|
}
|
|
89
|
-
const endTime = new Date();
|
|
90
144
|
LLMResults.processStartTime = startTime;
|
|
91
|
-
LLMResults.processEndTime =
|
|
145
|
+
LLMResults.processEndTime = new Date();
|
|
92
146
|
LLMResults.contentItemID = params.contentItemID;
|
|
93
147
|
return LLMResults;
|
|
94
148
|
}
|
|
95
149
|
async processChunkWithLLM(llm, systemPrompt, userPrompt, LLMResults, modelAPIName) {
|
|
96
150
|
const response = await llm.ChatCompletion({
|
|
97
151
|
messages: [
|
|
98
|
-
{
|
|
99
|
-
|
|
100
|
-
content: systemPrompt,
|
|
101
|
-
},
|
|
102
|
-
{
|
|
103
|
-
role: 'user',
|
|
104
|
-
content: userPrompt,
|
|
105
|
-
}
|
|
152
|
+
{ role: 'system', content: systemPrompt },
|
|
153
|
+
{ role: 'user', content: userPrompt }
|
|
106
154
|
],
|
|
107
155
|
model: modelAPIName,
|
|
108
156
|
temperature: 0.0,
|
|
109
157
|
});
|
|
110
158
|
const queryResponse = response.data.choices[0]?.message?.content?.trim() || '';
|
|
111
|
-
|
|
112
|
-
|
|
159
|
+
let JSONQueryResponse;
|
|
160
|
+
try {
|
|
161
|
+
JSONQueryResponse = JSON.parse(queryResponse);
|
|
162
|
+
}
|
|
163
|
+
catch (parseError) {
|
|
164
|
+
LogError('Failed to parse LLM response as JSON', undefined, queryResponse);
|
|
165
|
+
return LLMResults;
|
|
166
|
+
}
|
|
113
167
|
for (const key in JSONQueryResponse) {
|
|
114
168
|
const value = JSONQueryResponse[key];
|
|
115
169
|
if (value !== null) {
|
|
@@ -119,14 +173,14 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
119
173
|
return LLMResults;
|
|
120
174
|
}
|
|
121
175
|
async getLLMPrompts(params, chunk, LLMResults, contextUser) {
|
|
122
|
-
const contentType =
|
|
123
|
-
const contentSourceType =
|
|
124
|
-
const additionalContentTypePrompts =
|
|
125
|
-
const systemPrompt = `You are a highly skilled text analysis assistant. You have decades of experience and pride yourself on your attention to detail and ability to capture both accurate information, as well as tone and subtext.
|
|
126
|
-
Your task is to accurately extract key information from a provided piece of text based on a series of prompts. You are provided with text that should be a ${contentType}, that has been extracted from a ${contentSourceType}.
|
|
176
|
+
const contentType = this.GetContentTypeName(params.contentTypeID);
|
|
177
|
+
const contentSourceType = this.GetContentSourceTypeName(params.contentSourceTypeID);
|
|
178
|
+
const additionalContentTypePrompts = this.GetAdditionalContentTypePrompt(params.contentTypeID);
|
|
179
|
+
const systemPrompt = `You are a highly skilled text analysis assistant. You have decades of experience and pride yourself on your attention to detail and ability to capture both accurate information, as well as tone and subtext.
|
|
180
|
+
Your task is to accurately extract key information from a provided piece of text based on a series of prompts. You are provided with text that should be a ${contentType}, that has been extracted from a ${contentSourceType}.
|
|
127
181
|
The text MUST be of the type ${contentType} for the subsequent processing.`;
|
|
128
182
|
const userPrompt = `
|
|
129
|
-
If the provided text does not actually appear to be of the type ${contentType}, please disregard everything in the instructions after this and return this exact JSON response: { isValidContent: false (as a boolean) }.
|
|
183
|
+
If the provided text does not actually appear to be of the type ${contentType}, please disregard everything in the instructions after this and return this exact JSON response: { isValidContent: false (as a boolean) }.
|
|
130
184
|
Assuming the type of the text is in fact from a ${contentType}, please extract the title of the provided text, a short summary of the provided documents, as well as between ${params.minTags} and ${params.maxTags} topical key words that are most relevant to the text.
|
|
131
185
|
If there is no title explicitly provided in the text, please provide a title that you think best represents the text.
|
|
132
186
|
Please provide the keywords in a list format.
|
|
@@ -135,13 +189,13 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
135
189
|
{
|
|
136
190
|
"title": (title here),
|
|
137
191
|
"description": (description here),
|
|
138
|
-
"keywords": (list keywords here),
|
|
192
|
+
"keywords": (list keywords here),
|
|
139
193
|
"isValidContent": true (as a boolean)
|
|
140
194
|
}
|
|
141
195
|
|
|
142
196
|
${additionalContentTypePrompts}
|
|
143
197
|
|
|
144
|
-
Please make sure the response in is valid JSON format.
|
|
198
|
+
Please make sure the response in is valid JSON format.
|
|
145
199
|
|
|
146
200
|
You are also provided with the results so far as additional context, please use them to formulate the best results given the provided text: ${JSON.stringify(LLMResults)}
|
|
147
201
|
The supplied text is: ${chunk}
|
|
@@ -150,10 +204,9 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
150
204
|
}
|
|
151
205
|
async saveLLMResults(LLMResults, contextUser) {
|
|
152
206
|
if (LLMResults.isValidContent === true) {
|
|
153
|
-
// Only save results if the content is of the type that we expected.
|
|
154
207
|
await this.saveResultsToContentItemAttribute(LLMResults, contextUser);
|
|
155
208
|
await this.saveContentItemTags(LLMResults.contentItemID, LLMResults, contextUser);
|
|
156
|
-
|
|
209
|
+
LogStatus(`Results for content item ${LLMResults.contentItemID} saved successfully`);
|
|
157
210
|
}
|
|
158
211
|
else {
|
|
159
212
|
await this.deleteInvalidContentItem(LLMResults.contentItemID, contextUser);
|
|
@@ -165,116 +218,124 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
165
218
|
await contentItem.Load(contentItemID);
|
|
166
219
|
await contentItem.Delete();
|
|
167
220
|
}
|
|
221
|
+
/**
|
|
222
|
+
* Chunks text using the shared TextChunker utility for token-aware splitting.
|
|
223
|
+
* Falls back to simple character-based splitting when TextChunker is not available.
|
|
224
|
+
*/
|
|
168
225
|
chunkExtractedText(text, tokenLimit) {
|
|
169
226
|
try {
|
|
170
|
-
const
|
|
171
|
-
if (text.length <=
|
|
172
|
-
// No need to chunk the text
|
|
227
|
+
const maxChunkTokens = Math.ceil(tokenLimit / 1.5);
|
|
228
|
+
if (text.length <= maxChunkTokens * 4) {
|
|
173
229
|
return [text];
|
|
174
230
|
}
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
231
|
+
try {
|
|
232
|
+
const chunkParams = {
|
|
233
|
+
Text: text,
|
|
234
|
+
MaxChunkTokens: maxChunkTokens,
|
|
235
|
+
OverlapTokens: Math.ceil(maxChunkTokens * 0.1),
|
|
236
|
+
Strategy: 'sentence',
|
|
237
|
+
};
|
|
238
|
+
const chunks = TextChunker.ChunkText(chunkParams);
|
|
239
|
+
return chunks.map(c => c.Text);
|
|
240
|
+
}
|
|
241
|
+
catch {
|
|
242
|
+
return this.fallbackChunkText(text, maxChunkTokens);
|
|
182
243
|
}
|
|
183
|
-
return chunks;
|
|
184
244
|
}
|
|
185
|
-
catch
|
|
186
|
-
|
|
245
|
+
catch {
|
|
246
|
+
LogError('Could not chunk the text');
|
|
187
247
|
return [text];
|
|
188
248
|
}
|
|
189
249
|
}
|
|
190
250
|
/**
|
|
191
|
-
*
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
251
|
+
* Simple character-based chunking as fallback
|
|
252
|
+
*/
|
|
253
|
+
fallbackChunkText(text, textLimit) {
|
|
254
|
+
const numChunks = Math.ceil(text.length / textLimit);
|
|
255
|
+
const chunkSize = Math.ceil(text.length / numChunks);
|
|
256
|
+
const chunks = [];
|
|
257
|
+
for (let i = 0; i < numChunks; i++) {
|
|
258
|
+
const start = i * chunkSize;
|
|
259
|
+
const end = (i + 1) * chunkSize;
|
|
260
|
+
chunks.push(text.slice(start, end));
|
|
261
|
+
}
|
|
262
|
+
return chunks;
|
|
263
|
+
}
|
|
264
|
+
/**
|
|
265
|
+
* Saves keyword tags from LLM results as Content Item Tags.
|
|
266
|
+
* Uses batched saves for better performance.
|
|
197
267
|
*/
|
|
198
268
|
async saveContentItemTags(contentItemID, LLMResults, contextUser) {
|
|
199
269
|
const md = new Metadata();
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
270
|
+
const keywords = LLMResults.keywords;
|
|
271
|
+
if (!keywords || !Array.isArray(keywords))
|
|
272
|
+
return;
|
|
273
|
+
const BATCH_SIZE = 10;
|
|
274
|
+
for (let i = 0; i < keywords.length; i += BATCH_SIZE) {
|
|
275
|
+
const batch = keywords.slice(i, i + BATCH_SIZE);
|
|
276
|
+
await Promise.all(batch.map(async (keyword) => {
|
|
277
|
+
const contentItemTag = await md.GetEntityObject('MJ: Content Item Tags', contextUser);
|
|
278
|
+
contentItemTag.NewRecord();
|
|
279
|
+
contentItemTag.ItemID = contentItemID;
|
|
280
|
+
contentItemTag.Tag = keyword;
|
|
281
|
+
await contentItemTag.Save();
|
|
282
|
+
}));
|
|
206
283
|
}
|
|
207
284
|
}
|
|
285
|
+
/**
|
|
286
|
+
* Saves LLM-extracted attributes to the database.
|
|
287
|
+
* Updates content item name/description, then creates attribute records for other fields.
|
|
288
|
+
*/
|
|
208
289
|
async saveResultsToContentItemAttribute(LLMResults, contextUser) {
|
|
209
290
|
const md = new Metadata();
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
291
|
+
const contentItemID = LLMResults.contentItemID;
|
|
292
|
+
const skipKeys = new Set(['keywords', 'processStartTime', 'processEndTime', 'contentItemID', 'isValidContent']);
|
|
293
|
+
// Update title and description on the content item
|
|
294
|
+
if (LLMResults.title || LLMResults.description) {
|
|
295
|
+
const contentItem = await md.GetEntityObject('MJ: Content Items', contextUser);
|
|
296
|
+
await contentItem.Load(contentItemID);
|
|
297
|
+
if (LLMResults.title)
|
|
216
298
|
contentItem.Name = LLMResults.title;
|
|
217
|
-
|
|
218
|
-
}
|
|
219
|
-
if (key === 'description') {
|
|
220
|
-
const ID = LLMResults.contentItemID;
|
|
221
|
-
const contentItem = await md.GetEntityObject('MJ: Content Items', contextUser);
|
|
222
|
-
await contentItem.Load(ID);
|
|
299
|
+
if (LLMResults.description)
|
|
223
300
|
contentItem.Description = LLMResults.description;
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
301
|
+
await contentItem.Save();
|
|
302
|
+
}
|
|
303
|
+
// Create attribute records for remaining fields
|
|
304
|
+
const attributeEntries = Object.entries(LLMResults).filter(([key]) => !skipKeys.has(key) && key !== 'title' && key !== 'description');
|
|
305
|
+
const BATCH_SIZE = 10;
|
|
306
|
+
for (let i = 0; i < attributeEntries.length; i += BATCH_SIZE) {
|
|
307
|
+
const batch = attributeEntries.slice(i, i + BATCH_SIZE);
|
|
308
|
+
await Promise.all(batch.map(async ([key, value]) => {
|
|
227
309
|
const contentItemAttribute = await md.GetEntityObject('MJ: Content Item Attributes', contextUser);
|
|
228
310
|
contentItemAttribute.NewRecord();
|
|
229
|
-
|
|
230
|
-
const value = LLMResults[key] || '';
|
|
231
|
-
contentItemAttribute.ContentItemID = LLMResults.contentItemID;
|
|
311
|
+
contentItemAttribute.ContentItemID = contentItemID;
|
|
232
312
|
contentItemAttribute.Name = key;
|
|
233
|
-
contentItemAttribute.Value = value;
|
|
313
|
+
contentItemAttribute.Value = value != null ? String(value) : '';
|
|
234
314
|
await contentItemAttribute.Save();
|
|
235
|
-
}
|
|
315
|
+
}));
|
|
236
316
|
}
|
|
237
317
|
}
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
* @returns A list of content sources
|
|
242
|
-
*/
|
|
318
|
+
/**
|
|
319
|
+
* Retrieves all content sources for a given content source type.
|
|
320
|
+
*/
|
|
243
321
|
async getAllContentSources(contextUser, contentSourceTypeID) {
|
|
244
322
|
const rv = new RunView();
|
|
245
|
-
const
|
|
323
|
+
const result = await rv.RunView({
|
|
246
324
|
EntityName: 'MJ: Content Sources',
|
|
247
325
|
ResultType: 'entity_object',
|
|
248
326
|
ExtraFilter: `ContentSourceTypeID='${contentSourceTypeID}'`
|
|
249
327
|
}, contextUser);
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
const contentSources = contentSourceResult.Results;
|
|
253
|
-
return contentSources;
|
|
254
|
-
}
|
|
255
|
-
else {
|
|
256
|
-
throw new Error(`No content sources found for content source type with ID '${contentSourceTypeID}'`);
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
catch (e) {
|
|
260
|
-
console.error(e);
|
|
261
|
-
throw e;
|
|
328
|
+
if (result.Success && result.Results.length) {
|
|
329
|
+
return result.Results;
|
|
262
330
|
}
|
|
331
|
+
throw new Error(`No content sources found for content source type with ID '${contentSourceTypeID}'`);
|
|
263
332
|
}
|
|
264
|
-
|
|
265
|
-
const
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
ExtraFilter: `Name='${subclass}'`,
|
|
269
|
-
ResultType: 'entity_object'
|
|
270
|
-
}, contextUser);
|
|
271
|
-
if (results.Success && results.Results.length) {
|
|
272
|
-
const contentSourceType = results.Results[0];
|
|
273
|
-
return contentSourceType.ID;
|
|
274
|
-
}
|
|
275
|
-
else {
|
|
276
|
-
throw new Error(`Subclass with name ${subclass} not found`);
|
|
333
|
+
SetSubclassContentSourceType(subclass) {
|
|
334
|
+
const sourceType = this._ContentSourceTypes.find(st => st.Name === subclass);
|
|
335
|
+
if (!sourceType) {
|
|
336
|
+
throw new Error(`Content Source Type with name '${subclass}' not found in cached metadata`);
|
|
277
337
|
}
|
|
338
|
+
return sourceType.ID;
|
|
278
339
|
}
|
|
279
340
|
async getContentSourceParams(contentSource, contextUser) {
|
|
280
341
|
const contentSourceParams = new Map();
|
|
@@ -285,42 +346,35 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
285
346
|
ResultType: 'entity_object'
|
|
286
347
|
}, contextUser);
|
|
287
348
|
if (results.Success && results.Results.length) {
|
|
288
|
-
const
|
|
289
|
-
|
|
290
|
-
const params = await this.getDefaultContentSourceTypeParams(contentSourceParam.ContentSourceTypeParamID, contextUser);
|
|
349
|
+
for (const contentSourceParam of results.Results) {
|
|
350
|
+
const params = this.GetDefaultContentSourceTypeParams(contentSourceParam.ContentSourceTypeParamID);
|
|
291
351
|
params.contentSourceID = contentSource.ID;
|
|
292
352
|
if (contentSourceParam.Value) {
|
|
293
|
-
// There is a provided value, so overwrite the default value
|
|
294
353
|
params.value = this.castValueAsCorrectType(contentSourceParam.Value, params.type);
|
|
295
354
|
}
|
|
296
355
|
contentSourceParams.set(params.name, params.value);
|
|
297
356
|
}
|
|
298
|
-
return contentSourceParams;
|
|
299
357
|
}
|
|
300
358
|
else {
|
|
301
|
-
|
|
359
|
+
LogStatus(`No content source params found for content source with ID ${contentSource.ID}, using default values`);
|
|
302
360
|
}
|
|
361
|
+
return contentSourceParams;
|
|
303
362
|
}
|
|
304
|
-
|
|
305
|
-
const
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
ExtraFilter: `ID='${contentSourceTypeParamID}'`,
|
|
309
|
-
ResultType: 'entity_object'
|
|
310
|
-
}, contextUser);
|
|
311
|
-
if (results.Success && results.Results.length) {
|
|
312
|
-
const params = new ContentSourceTypeParams();
|
|
313
|
-
params.name = results.Results[0].Get('Name');
|
|
314
|
-
params.type = results.Results[0].Get('Type').toLowerCase();
|
|
315
|
-
params.value = this.castValueAsCorrectType(results.Results[0].Get('DefaultValue'), params.type); // Default value in this case, can be null or overridden later
|
|
316
|
-
return params;
|
|
363
|
+
GetDefaultContentSourceTypeParams(contentSourceTypeParamID) {
|
|
364
|
+
const result = this._ContentSourceTypeParams.find(p => UUIDsEqual(p.ID, contentSourceTypeParamID));
|
|
365
|
+
if (!result) {
|
|
366
|
+
throw new Error(`Content Source Type Param with ID '${contentSourceTypeParamID}' not found in cached metadata`);
|
|
317
367
|
}
|
|
318
|
-
|
|
368
|
+
const params = new ContentSourceTypeParams();
|
|
369
|
+
params.name = result.Name;
|
|
370
|
+
params.type = result.Type.toLowerCase();
|
|
371
|
+
params.value = this.castValueAsCorrectType(result.DefaultValue ?? '', params.type);
|
|
372
|
+
return params;
|
|
319
373
|
}
|
|
320
374
|
castValueAsCorrectType(value, type) {
|
|
321
375
|
switch (type) {
|
|
322
376
|
case 'number':
|
|
323
|
-
return parseInt(value);
|
|
377
|
+
return parseInt(value, 10);
|
|
324
378
|
case 'boolean':
|
|
325
379
|
return this.stringToBoolean(value);
|
|
326
380
|
case 'string':
|
|
@@ -333,28 +387,21 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
333
387
|
return value;
|
|
334
388
|
}
|
|
335
389
|
}
|
|
336
|
-
stringToBoolean(
|
|
337
|
-
return
|
|
390
|
+
stringToBoolean(str) {
|
|
391
|
+
return str === 'true';
|
|
338
392
|
}
|
|
339
393
|
parseStringArray(value) {
|
|
340
|
-
|
|
341
|
-
return stringArray;
|
|
394
|
+
return JSON.parse(value);
|
|
342
395
|
}
|
|
343
396
|
/**
|
|
344
|
-
*
|
|
345
|
-
* @param lastRunDate: The retrieved last run date from the database
|
|
346
|
-
* @returns The last run date converted to the user's timezone
|
|
397
|
+
* Converts a run date to the user's local timezone.
|
|
347
398
|
*/
|
|
348
399
|
async convertLastRunDateToTimezone(lastRunDate) {
|
|
349
400
|
const userTimeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
|
|
350
|
-
|
|
351
|
-
return date;
|
|
401
|
+
return toZonedTime(lastRunDate, userTimeZone);
|
|
352
402
|
}
|
|
353
403
|
/**
|
|
354
|
-
* Retrieves the last run date
|
|
355
|
-
* @param contentSourceID: The ID of the content source to retrieve the last run date
|
|
356
|
-
* @param contextUser: The user context to retrieve the last run date
|
|
357
|
-
* @returns
|
|
404
|
+
* Retrieves the last run date for a content source. Returns epoch date if no runs exist.
|
|
358
405
|
*/
|
|
359
406
|
async getContentSourceLastRunDate(contentSourceID, contextUser) {
|
|
360
407
|
const rv = new RunView();
|
|
@@ -364,186 +411,82 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
364
411
|
ResultType: 'entity_object',
|
|
365
412
|
OrderBy: 'EndTime DESC'
|
|
366
413
|
}, contextUser);
|
|
367
|
-
try {
|
|
368
|
-
if (results.Success && results.Results.length) {
|
|
369
|
-
const contentProcessRun = results.Results[0];
|
|
370
|
-
const lastRunDate = contentProcessRun.Get('__mj_CreatedAt');
|
|
371
|
-
return this.convertLastRunDateToTimezone(lastRunDate);
|
|
372
|
-
}
|
|
373
|
-
else if (results.Success && !results.Results.length) {
|
|
374
|
-
// Case where we do not have any previous runs for the content source, just return the epoch date
|
|
375
|
-
return new Date(0);
|
|
376
|
-
}
|
|
377
|
-
else {
|
|
378
|
-
throw new Error(`Failed to retrieve last run date for content source with ID ${contentSourceID}`);
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
catch (e) {
|
|
382
|
-
console.error(e);
|
|
383
|
-
throw e;
|
|
384
|
-
}
|
|
385
|
-
}
|
|
386
|
-
async getContentItemParams(contentTypeID, contextUser) {
|
|
387
|
-
const rv = new RunView();
|
|
388
|
-
const results = await rv.RunView({
|
|
389
|
-
EntityName: 'MJ: Content Types',
|
|
390
|
-
ExtraFilter: `ID='${contentTypeID}'`,
|
|
391
|
-
ResultType: 'entity_object',
|
|
392
|
-
}, contextUser);
|
|
393
414
|
if (results.Success && results.Results.length) {
|
|
394
|
-
const
|
|
395
|
-
return
|
|
396
|
-
modelID: contentType.AIModelID,
|
|
397
|
-
minTags: contentType.MinTags,
|
|
398
|
-
maxTags: contentType.MaxTags
|
|
399
|
-
};
|
|
415
|
+
const lastRunDate = results.Results[0].__mj_CreatedAt;
|
|
416
|
+
return this.convertLastRunDateToTimezone(lastRunDate);
|
|
400
417
|
}
|
|
401
|
-
|
|
402
|
-
|
|
418
|
+
if (results.Success) {
|
|
419
|
+
return new Date(0);
|
|
403
420
|
}
|
|
421
|
+
throw new Error(`Failed to retrieve last run date for content source with ID ${contentSourceID}`);
|
|
404
422
|
}
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
* @returns
|
|
410
|
-
*/
|
|
411
|
-
async getContentSourceTypeName(contentSourceTypeID, contextUser) {
|
|
412
|
-
const rv = new RunView();
|
|
413
|
-
const contentFileTypeResult = await rv.RunView({
|
|
414
|
-
EntityName: 'MJ: Content Source Types',
|
|
415
|
-
ResultType: 'entity_object',
|
|
416
|
-
ExtraFilter: `ID='${contentSourceTypeID}'`
|
|
417
|
-
}, contextUser);
|
|
418
|
-
try {
|
|
419
|
-
if (contentFileTypeResult.Success && contentFileTypeResult.Results.length) {
|
|
420
|
-
const contentSourceType = contentFileTypeResult.Results[0];
|
|
421
|
-
return contentSourceType.Name;
|
|
422
|
-
}
|
|
423
|
+
GetContentItemParams(contentTypeID) {
|
|
424
|
+
const contentType = this._ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
|
|
425
|
+
if (!contentType) {
|
|
426
|
+
throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
|
|
423
427
|
}
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
428
|
+
return {
|
|
429
|
+
modelID: contentType.AIModelID,
|
|
430
|
+
minTags: contentType.MinTags,
|
|
431
|
+
maxTags: contentType.MaxTags
|
|
432
|
+
};
|
|
429
433
|
}
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
* @returns
|
|
435
|
-
*/
|
|
436
|
-
async getContentTypeName(contentTypeID, contextUser) {
|
|
437
|
-
const rv = new RunView();
|
|
438
|
-
const contentFileTypeResult = await rv.RunView({
|
|
439
|
-
EntityName: 'MJ: Content Types',
|
|
440
|
-
ResultType: 'entity_object',
|
|
441
|
-
ExtraFilter: `ID='${contentTypeID}'`
|
|
442
|
-
}, contextUser);
|
|
443
|
-
try {
|
|
444
|
-
if (contentFileTypeResult.Success && contentFileTypeResult.Results.length) {
|
|
445
|
-
const contentFileType = contentFileTypeResult.Results[0];
|
|
446
|
-
return contentFileType.Name;
|
|
447
|
-
}
|
|
448
|
-
}
|
|
449
|
-
catch (e) {
|
|
450
|
-
console.error(e);
|
|
451
|
-
throw e;
|
|
434
|
+
GetContentSourceTypeName(contentSourceTypeID) {
|
|
435
|
+
const sourceType = this._ContentSourceTypes.find(st => UUIDsEqual(st.ID, contentSourceTypeID));
|
|
436
|
+
if (!sourceType) {
|
|
437
|
+
throw new Error(`Content Source Type with ID ${contentSourceTypeID} not found in cached metadata`);
|
|
452
438
|
}
|
|
453
|
-
|
|
439
|
+
return sourceType.Name;
|
|
454
440
|
}
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
* @returns
|
|
460
|
-
*/
|
|
461
|
-
async getContentFileTypeName(contentFileTypeID, contextUser) {
|
|
462
|
-
const rv = new RunView();
|
|
463
|
-
const contentFileTypeResult = await rv.RunView({
|
|
464
|
-
EntityName: 'MJ: Content File Types',
|
|
465
|
-
ResultType: 'entity_object',
|
|
466
|
-
ExtraFilter: `ID='${contentFileTypeID}'`
|
|
467
|
-
}, contextUser);
|
|
468
|
-
try {
|
|
469
|
-
if (contentFileTypeResult.Success && contentFileTypeResult.Results.length) {
|
|
470
|
-
const contentFileType = contentFileTypeResult.Results[0];
|
|
471
|
-
return contentFileType.Name;
|
|
472
|
-
}
|
|
441
|
+
GetContentTypeName(contentTypeID) {
|
|
442
|
+
const contentType = this._ContentTypes.find(ct => UUIDsEqual(ct.ID, contentTypeID));
|
|
443
|
+
if (!contentType) {
|
|
444
|
+
throw new Error(`Content Type with ID ${contentTypeID} not found in cached metadata`);
|
|
473
445
|
}
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
446
|
+
return contentType.Name;
|
|
447
|
+
}
|
|
448
|
+
GetContentFileTypeName(contentFileTypeID) {
|
|
449
|
+
const fileType = this._ContentFileTypes.find(ft => UUIDsEqual(ft.ID, contentFileTypeID));
|
|
450
|
+
if (!fileType) {
|
|
451
|
+
throw new Error(`Content File Type with ID ${contentFileTypeID} not found in cached metadata`);
|
|
477
452
|
}
|
|
478
|
-
|
|
453
|
+
return fileType.Name;
|
|
479
454
|
}
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
const results = await rv.RunView({
|
|
484
|
-
EntityName: 'MJ: Content Type Attributes',
|
|
485
|
-
ExtraFilter: `ContentTypeID='${contentTypeID}'`,
|
|
486
|
-
ResultType: 'entity_object'
|
|
487
|
-
}, contextUser);
|
|
488
|
-
if (results.Success && results.Results.length) {
|
|
489
|
-
let prompt = '';
|
|
490
|
-
for (const contentTypeAttribute of results.Results) {
|
|
491
|
-
prompt += `${contentTypeAttribute.Prompt}. The data must be included in the above described JSON file in this key-value format: { "${contentTypeAttribute.Name}": (value of ${contentTypeAttribute.Name} here)}\n`;
|
|
492
|
-
}
|
|
493
|
-
return prompt;
|
|
494
|
-
}
|
|
455
|
+
GetAdditionalContentTypePrompt(contentTypeID) {
|
|
456
|
+
const attrs = this._ContentTypeAttributes.filter(a => UUIDsEqual(a.ContentTypeID, contentTypeID));
|
|
457
|
+
if (attrs.length === 0)
|
|
495
458
|
return '';
|
|
496
|
-
}
|
|
497
|
-
catch (e) {
|
|
498
|
-
console.error(e);
|
|
499
|
-
throw e;
|
|
500
|
-
}
|
|
459
|
+
return attrs.map(attr => `${attr.Prompt}. The data must be included in the above described JSON file in this key-value format: { "${attr.Name}": (value of ${attr.Name} here)}`).join('\n');
|
|
501
460
|
}
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
async getContentItemDescription(contentSourceParams, contextUser) {
|
|
508
|
-
const description = `${await this.getContentTypeName(contentSourceParams.ContentTypeID, contextUser)} in ${await this.getContentFileTypeName(contentSourceParams.ContentFileTypeID, contextUser)} format obtained from a ${await this.getContentSourceTypeName(contentSourceParams.ContentSourceTypeID, contextUser)} source`;
|
|
509
|
-
return description;
|
|
461
|
+
GetContentItemDescription(contentSourceParams) {
|
|
462
|
+
const contentTypeName = this.GetContentTypeName(contentSourceParams.ContentTypeID);
|
|
463
|
+
const fileTypeName = this.GetContentFileTypeName(contentSourceParams.ContentFileTypeID);
|
|
464
|
+
const sourceTypeName = this.GetContentSourceTypeName(contentSourceParams.ContentSourceTypeID);
|
|
465
|
+
return `${contentTypeName} in ${fileTypeName} format obtained from a ${sourceTypeName} source`;
|
|
510
466
|
}
|
|
511
467
|
async getChecksumFromURL(url) {
|
|
512
468
|
const response = await axios.get(url);
|
|
513
|
-
const content = response.data;
|
|
514
|
-
|
|
515
|
-
return hash;
|
|
469
|
+
const content = String(response.data);
|
|
470
|
+
return crypto.createHash('sha256').update(content).digest('hex');
|
|
516
471
|
}
|
|
517
472
|
async getChecksumFromText(text) {
|
|
518
|
-
|
|
519
|
-
return hash;
|
|
473
|
+
return crypto.createHash('sha256').update(text).digest('hex');
|
|
520
474
|
}
|
|
521
475
|
async getContentItemIDFromURL(contentSourceParams, contextUser) {
|
|
522
476
|
const url = contentSourceParams.URL;
|
|
523
477
|
const rv = new RunView();
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
const contentItem = results.Results[0];
|
|
532
|
-
return contentItem.ID;
|
|
533
|
-
}
|
|
534
|
-
else {
|
|
535
|
-
throw new Error(`Content item with URL ${url} not found`);
|
|
536
|
-
}
|
|
537
|
-
}
|
|
538
|
-
catch (e) {
|
|
539
|
-
throw new Error(`Failed to retrieve content item ID from URL: ${url}`);
|
|
478
|
+
const results = await rv.RunView({
|
|
479
|
+
EntityName: 'MJ: Content Items',
|
|
480
|
+
ExtraFilter: `URL='${url}' AND ContentSourceID='${contentSourceParams.contentSourceID}'`,
|
|
481
|
+
ResultType: 'entity_object'
|
|
482
|
+
}, contextUser);
|
|
483
|
+
if (results.Success && results.Results.length) {
|
|
484
|
+
return results.Results[0].ID;
|
|
540
485
|
}
|
|
486
|
+
throw new Error(`Content item with URL ${url} not found`);
|
|
541
487
|
}
|
|
542
488
|
/**
|
|
543
|
-
*
|
|
544
|
-
* @param processRunParams: The parameters holding the details of the process run
|
|
545
|
-
* @param contextUser: The user context to save the process run
|
|
546
|
-
* @returns
|
|
489
|
+
* Saves process run metadata to the database.
|
|
547
490
|
*/
|
|
548
491
|
async saveProcessRun(processRunParams, contextUser) {
|
|
549
492
|
const md = new Metadata();
|
|
@@ -556,65 +499,40 @@ let AutotagBaseEngine = class AutotagBaseEngine extends AIEngine {
|
|
|
556
499
|
processRun.ProcessedItems = processRunParams.numItemsProcessed;
|
|
557
500
|
await processRun.Save();
|
|
558
501
|
}
|
|
559
|
-
/**
|
|
560
|
-
* Given a buffer of data, this function extracts text from a PDF file
|
|
561
|
-
* @param dataBuffer: The buffer of data to extract text from
|
|
562
|
-
* @returns The extracted text from the PDF file
|
|
563
|
-
*/
|
|
564
502
|
async parsePDF(dataBuffer) {
|
|
565
503
|
const dataPDF = await pdfParse(dataBuffer);
|
|
566
504
|
return dataPDF.text;
|
|
567
505
|
}
|
|
568
|
-
/**
|
|
569
|
-
* Given a buffer of data, this function extracts text from a DOCX file
|
|
570
|
-
* @param dataBuffer: The buffer of data to extract text from
|
|
571
|
-
* @returns The extracted text from the DOCX file
|
|
572
|
-
*/
|
|
573
506
|
async parseDOCX(dataBuffer) {
|
|
574
507
|
const dataDOCX = await officeparser.parseOffice(dataBuffer);
|
|
575
508
|
return dataDOCX.toText();
|
|
576
509
|
}
|
|
577
510
|
async parseHTML(data) {
|
|
578
511
|
try {
|
|
579
|
-
|
|
580
|
-
try {
|
|
581
|
-
$ = cheerio.load(data);
|
|
582
|
-
}
|
|
583
|
-
catch (loadError) {
|
|
584
|
-
console.error('Error loading data with cheerio:', loadError);
|
|
585
|
-
return undefined;
|
|
586
|
-
}
|
|
512
|
+
const $ = cheerio.load(data);
|
|
587
513
|
$('script, style, nav, footer, header, .hidden').remove();
|
|
588
|
-
|
|
589
|
-
return text;
|
|
514
|
+
return $('body').text().replace(/\s\s+/g, ' ').trim();
|
|
590
515
|
}
|
|
591
516
|
catch (e) {
|
|
592
|
-
|
|
517
|
+
LogError('Error parsing HTML', undefined, e);
|
|
593
518
|
throw e;
|
|
594
519
|
}
|
|
595
520
|
}
|
|
596
|
-
/**
|
|
597
|
-
* Given a file path, as along as its one of the supported file types, this function choses the correct parser
|
|
598
|
-
* and returns the extracted text.
|
|
599
|
-
* @param filePath - The path to the file to extract text from
|
|
600
|
-
* @returns - The extracted text from the file
|
|
601
|
-
*/
|
|
602
521
|
async parseFileFromPath(filePath) {
|
|
603
522
|
const dataBuffer = await fs.promises.readFile(filePath);
|
|
604
|
-
const fileExtension = filePath.split('.').pop();
|
|
523
|
+
const fileExtension = filePath.split('.').pop()?.toLowerCase();
|
|
605
524
|
switch (fileExtension) {
|
|
606
525
|
case 'pdf':
|
|
607
|
-
return
|
|
526
|
+
return this.parsePDF(dataBuffer);
|
|
608
527
|
case 'docx':
|
|
609
|
-
return
|
|
528
|
+
return this.parseDOCX(dataBuffer);
|
|
610
529
|
default:
|
|
611
|
-
throw new Error(
|
|
530
|
+
throw new Error(`File type '${fileExtension}' not supported`);
|
|
612
531
|
}
|
|
613
532
|
}
|
|
614
533
|
};
|
|
615
534
|
AutotagBaseEngine = __decorate([
|
|
616
|
-
RegisterClass(
|
|
617
|
-
__metadata("design:paramtypes", [])
|
|
535
|
+
RegisterClass(BaseEngine, 'AutotagBaseEngine')
|
|
618
536
|
], AutotagBaseEngine);
|
|
619
537
|
export { AutotagBaseEngine };
|
|
620
538
|
//# sourceMappingURL=AutotagBaseEngine.js.map
|