@memberjunction/content-autotagging 5.22.0 → 5.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -18
- package/dist/CloudStorage/generic/CloudStorageBase.d.ts +2 -2
- package/dist/CloudStorage/generic/CloudStorageBase.d.ts.map +1 -1
- package/dist/CloudStorage/generic/CloudStorageBase.js +2 -2
- package/dist/CloudStorage/generic/CloudStorageBase.js.map +1 -1
- package/dist/CloudStorage/index.d.ts +5 -0
- package/dist/CloudStorage/index.d.ts.map +1 -1
- package/dist/CloudStorage/index.js +5 -0
- package/dist/CloudStorage/index.js.map +1 -1
- package/dist/CloudStorage/providers/AutotagCloudStorage.d.ts +61 -0
- package/dist/CloudStorage/providers/AutotagCloudStorage.d.ts.map +1 -0
- package/dist/CloudStorage/providers/AutotagCloudStorage.js +256 -0
- package/dist/CloudStorage/providers/AutotagCloudStorage.js.map +1 -0
- package/dist/Core/generic/AutotagBase.d.ts +9 -1
- package/dist/Core/generic/AutotagBase.d.ts.map +1 -1
- package/dist/Core/generic/AutotagBase.js.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.d.ts +397 -15
- package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.js +1362 -128
- package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
- package/dist/Engine/generic/RateLimiter.d.ts +49 -0
- package/dist/Engine/generic/RateLimiter.d.ts.map +1 -0
- package/dist/Engine/generic/RateLimiter.js +98 -0
- package/dist/Engine/generic/RateLimiter.js.map +1 -0
- package/dist/Engine/index.d.ts +1 -0
- package/dist/Engine/index.d.ts.map +1 -1
- package/dist/Engine/index.js +1 -0
- package/dist/Engine/index.js.map +1 -1
- package/dist/Entity/generic/AutotagEntity.d.ts +64 -15
- package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
- package/dist/Entity/generic/AutotagEntity.js +362 -83
- package/dist/Entity/generic/AutotagEntity.js.map +1 -1
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts +2 -2
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.d.ts.map +1 -1
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js +2 -2
- package/dist/LocalFileSystem/generic/AutotagLocalFileSystem.js.map +1 -1
- package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts +47 -16
- package/dist/RSSFeed/generic/AutotagRSSFeed.d.ts.map +1 -1
- package/dist/RSSFeed/generic/AutotagRSSFeed.js +239 -121
- package/dist/RSSFeed/generic/AutotagRSSFeed.js.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.d.ts +2 -2
- package/dist/Websites/generic/AutotagWebsite.d.ts.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.js +2 -2
- package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/package.json +16 -8
package/README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# @memberjunction/content-autotagging
|
|
2
2
|
|
|
3
|
-
AI-powered content ingestion and
|
|
3
|
+
AI-powered content ingestion, autotagging, and vectorization engine for MemberJunction. Scans content from multiple sources (local files, websites, RSS feeds, cloud storage), extracts text from documents, uses LLMs to generate weighted tags and metadata attributes, and vectorizes content for semantic search.
|
|
4
4
|
|
|
5
5
|
## Overview
|
|
6
6
|
|
|
7
|
-
The `@memberjunction/content-autotagging` package provides an extensible framework for ingesting content from diverse sources and leveraging AI models to extract meaningful tags, summaries, and metadata. Built on the MemberJunction platform, it helps organizations automatically organize and categorize their content.
|
|
7
|
+
The `@memberjunction/content-autotagging` package provides an extensible framework for ingesting content from diverse sources and leveraging AI models to extract meaningful tags, summaries, and metadata. Built on the MemberJunction platform, it helps organizations automatically organize and categorize their content. The engine uses the managed **"Content Autotagging"** AI prompt via `AIPromptRunner` (rather than direct `BaseLLM` calls), enabling prompt versioning, model routing, and centralized prompt management.
|
|
8
8
|
|
|
9
9
|
```mermaid
|
|
10
10
|
graph TD
|
|
@@ -19,10 +19,14 @@ graph TD
|
|
|
19
19
|
G --> I["Office Parser"]
|
|
20
20
|
G --> J["HTML Parser<br/>(Cheerio)"]
|
|
21
21
|
|
|
22
|
-
A --> K["
|
|
23
|
-
K --> L["Tag Generation"]
|
|
22
|
+
A --> K["AIPromptRunner<br/>(Content Autotagging prompt)"]
|
|
23
|
+
K --> L["Tag Generation<br/>(with weights)"]
|
|
24
24
|
K --> M["Attribute Extraction"]
|
|
25
25
|
|
|
26
|
+
A --> V["Vectorization"]
|
|
27
|
+
V --> W["Embedding Model"]
|
|
28
|
+
V --> X["Vector DB Upsert"]
|
|
29
|
+
|
|
26
30
|
A --> N["Content Items<br/>(Database)"]
|
|
27
31
|
A --> O["Content Item Attributes<br/>(Database)"]
|
|
28
32
|
|
|
@@ -34,10 +38,21 @@ graph TD
|
|
|
34
38
|
style F fill:#2d8659,stroke:#1a5c3a,color:#fff
|
|
35
39
|
style G fill:#b8762f,stroke:#8a5722,color:#fff
|
|
36
40
|
style K fill:#7c5295,stroke:#563a6b,color:#fff
|
|
41
|
+
style V fill:#2d6a9f,stroke:#1a4971,color:#fff
|
|
37
42
|
style N fill:#2d6a9f,stroke:#1a4971,color:#fff
|
|
38
43
|
style O fill:#2d6a9f,stroke:#1a4971,color:#fff
|
|
39
44
|
```
|
|
40
45
|
|
|
46
|
+
## Key Features
|
|
47
|
+
|
|
48
|
+
- **AIPromptRunner integration**: Uses the managed "Content Autotagging" prompt, enabling prompt versioning and model routing through MJ's prompt management system (no direct `BaseLLM` calls)
|
|
49
|
+
- **Tag weights**: Each generated tag includes a relevance weight (0.0–1.0) indicating how strongly the tag relates to the content
|
|
50
|
+
- **Batch processing**: Configurable batch size (default: 20) with concurrent processing within each batch
|
|
51
|
+
- **Parallel tagging + vectorization**: Tagging and vectorization run in parallel for maximum throughput
|
|
52
|
+
- **Per-source/type embedding model selection**: Cascade resolution for embedding model and vector index — source override, then content type default, then global fallback (first active vector index)
|
|
53
|
+
- **Real-time progress reporting**: `AutotagProgressCallback` provides per-item progress updates during processing
|
|
54
|
+
- **Graceful provider skip**: Providers skip gracefully when no content sources are configured for their type
|
|
55
|
+
|
|
41
56
|
## Installation
|
|
42
57
|
|
|
43
58
|
```bash
|
|
@@ -51,7 +66,8 @@ sequenceDiagram
|
|
|
51
66
|
participant Source as Content Source
|
|
52
67
|
participant Engine as AutotagBaseEngine
|
|
53
68
|
participant Extract as Text Extractor
|
|
54
|
-
participant
|
|
69
|
+
participant Prompt as AIPromptRunner
|
|
70
|
+
participant Vec as Embedding + VectorDB
|
|
55
71
|
participant DB as Database
|
|
56
72
|
|
|
57
73
|
Source->>Engine: Provide content items
|
|
@@ -59,9 +75,14 @@ sequenceDiagram
|
|
|
59
75
|
Engine->>Extract: Extract text (PDF/Office/HTML)
|
|
60
76
|
Extract-->>Engine: Raw text
|
|
61
77
|
Engine->>Engine: Chunk text for token limits
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
78
|
+
par Tagging
|
|
79
|
+
Engine->>Prompt: Run "Content Autotagging" prompt
|
|
80
|
+
Prompt-->>Engine: Tags (with weights) + Attributes
|
|
81
|
+
and Vectorization
|
|
82
|
+
Engine->>Vec: Embed text + upsert to vector DB
|
|
83
|
+
Vec-->>Engine: Vectorization result
|
|
84
|
+
end
|
|
85
|
+
Engine->>DB: Save ContentItem + Tags + Attributes
|
|
65
86
|
Engine->>DB: Create ProcessRun record
|
|
66
87
|
```
|
|
67
88
|
|
|
@@ -74,7 +95,7 @@ sequenceDiagram
|
|
|
74
95
|
| RSS Feeds | `AutotagRSSFeed` | Parses RSS/Atom feeds for articles |
|
|
75
96
|
| Azure Blob | `AutotagAzureBlob` | Processes files from Azure Blob Storage |
|
|
76
97
|
|
|
77
|
-
All sources extend `AutotagBase`, which provides the common interface for content discovery and ingestion.
|
|
98
|
+
All sources extend `AutotagBase`, which provides the common interface for content discovery and ingestion. Each source's `Autotag()` method accepts an optional `AutotagProgressCallback` for real-time progress reporting. Sources skip gracefully when no content sources of their type are configured in the database.
|
|
78
99
|
|
|
79
100
|
## Supported File Formats
|
|
80
101
|
|
|
@@ -85,6 +106,32 @@ All sources extend `AutotagBase`, which provides the common interface for conten
|
|
|
85
106
|
| HTML/Web Pages | `cheerio` | .html, .htm |
|
|
86
107
|
| Plain Text | Native | .txt, .md, .csv |
|
|
87
108
|
|
|
109
|
+
## Tag Weights
|
|
110
|
+
|
|
111
|
+
The LLM prompt returns tags with relevance weights between 0.0 and 1.0 indicating how strongly each tag relates to the content. Both old-style (plain string array) and new-style (object with `tag` + `weight`) responses are supported:
|
|
112
|
+
|
|
113
|
+
```jsonc
|
|
114
|
+
// New format (preferred) — returned by the "Content Autotagging" prompt
|
|
115
|
+
[
|
|
116
|
+
{ "tag": "machine learning", "weight": 0.95 },
|
|
117
|
+
{ "tag": "neural networks", "weight": 0.82 },
|
|
118
|
+
{ "tag": "data science", "weight": 0.70 }
|
|
119
|
+
]
|
|
120
|
+
|
|
121
|
+
// Legacy format — auto-normalized with weight 1.0
|
|
122
|
+
["machine learning", "neural networks", "data science"]
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Embedding Model and Vector Index Resolution
|
|
126
|
+
|
|
127
|
+
The engine resolves the embedding model and vector index for each content item using a three-level cascade:
|
|
128
|
+
|
|
129
|
+
1. **Content Source override**: If the source has `EmbeddingModelID` and `VectorIndexID` set, those are used
|
|
130
|
+
2. **Content Type default**: If the source has no override, the content type's defaults are used
|
|
131
|
+
3. **Global fallback**: If neither the source nor the content type specifies them, the first active vector index in the system is used
|
|
132
|
+
|
|
133
|
+
Items sharing the same (embeddingModel, vectorIndex) pair are grouped and processed together for efficient batching.
|
|
134
|
+
|
|
88
135
|
## Usage
|
|
89
136
|
|
|
90
137
|
### RSS Feed Processing
|
|
@@ -93,7 +140,9 @@ All sources extend `AutotagBase`, which provides the common interface for conten
|
|
|
93
140
|
import { AutotagRSSFeed } from '@memberjunction/content-autotagging';
|
|
94
141
|
|
|
95
142
|
const rssTagger = new AutotagRSSFeed();
|
|
96
|
-
await rssTagger.Autotag(contextUser)
|
|
143
|
+
await rssTagger.Autotag(contextUser, (processed, total, currentItem) => {
|
|
144
|
+
console.log(`[${processed}/${total}] Processing: ${currentItem}`);
|
|
145
|
+
});
|
|
97
146
|
```
|
|
98
147
|
|
|
99
148
|
### Website Content Processing
|
|
@@ -133,13 +182,19 @@ await blobTagger.Autotag(contextUser);
|
|
|
133
182
|
import { AutotagBaseEngine } from '@memberjunction/content-autotagging';
|
|
134
183
|
|
|
135
184
|
const engine = AutotagBaseEngine.Instance;
|
|
136
|
-
|
|
185
|
+
|
|
186
|
+
// Process content items with custom batch size
|
|
187
|
+
await engine.ExtractTextAndProcessWithLLM(contentItems, contextUser, batchSize);
|
|
188
|
+
|
|
189
|
+
// Vectorize content items (runs in parallel with tagging)
|
|
190
|
+
const result = await engine.VectorizeContentItems(contentItems, tagMap, contextUser, batchSize);
|
|
191
|
+
console.log(`Vectorized: ${result.vectorized}, Skipped: ${result.skipped}`);
|
|
137
192
|
```
|
|
138
193
|
|
|
139
194
|
## Creating a Custom Content Source
|
|
140
195
|
|
|
141
196
|
```typescript
|
|
142
|
-
import { AutotagBase } from '@memberjunction/content-autotagging';
|
|
197
|
+
import { AutotagBase, AutotagProgressCallback } from '@memberjunction/content-autotagging';
|
|
143
198
|
import { RegisterClass } from '@memberjunction/global';
|
|
144
199
|
|
|
145
200
|
@RegisterClass(AutotagBase, 'AutotagCustomSource')
|
|
@@ -149,13 +204,14 @@ export class AutotagCustomSource extends AutotagBase {
|
|
|
149
204
|
return contentItems;
|
|
150
205
|
}
|
|
151
206
|
|
|
152
|
-
public async Autotag(contextUser) {
|
|
207
|
+
public async Autotag(contextUser, onProgress?: AutotagProgressCallback) {
|
|
153
208
|
const contentSourceTypeID = await this.engine.setSubclassContentSourceType(
|
|
154
209
|
'Custom Source', contextUser
|
|
155
210
|
);
|
|
156
211
|
const contentSources = await this.engine.getAllContentSources(
|
|
157
212
|
contextUser, contentSourceTypeID
|
|
158
213
|
);
|
|
214
|
+
if (contentSources.length === 0) return; // Skip gracefully
|
|
159
215
|
const contentItems = await this.SetContentItemsToProcess(contentSources);
|
|
160
216
|
await this.engine.ExtractTextAndProcessWithLLM(contentItems, contextUser);
|
|
161
217
|
}
|
|
@@ -166,12 +222,12 @@ export class AutotagCustomSource extends AutotagBase {
|
|
|
166
222
|
|
|
167
223
|
| Entity | Purpose |
|
|
168
224
|
|--------|---------|
|
|
169
|
-
| Content Sources | Configuration for each content source |
|
|
225
|
+
| Content Sources | Configuration for each content source (with optional EmbeddingModelID/VectorIndexID overrides) |
|
|
170
226
|
| Content Items | Individual pieces of content with extracted text |
|
|
171
|
-
| Content Item Tags | AI-generated tags |
|
|
227
|
+
| Content Item Tags | AI-generated tags with relevance weights (0.0–1.0) |
|
|
172
228
|
| Content Item Attributes | Additional extracted metadata |
|
|
173
229
|
| Content Process Runs | Processing history and audit trail |
|
|
174
|
-
| Content Types | Content categorization definitions |
|
|
230
|
+
| Content Types | Content categorization definitions (with default EmbeddingModelID/VectorIndexID) |
|
|
175
231
|
| Content Source Types | Source type definitions |
|
|
176
232
|
| Content File Types | Supported file format definitions |
|
|
177
233
|
|
|
@@ -182,8 +238,12 @@ export class AutotagCustomSource extends AutotagBase {
|
|
|
182
238
|
| `@memberjunction/core` | Entity system and metadata |
|
|
183
239
|
| `@memberjunction/global` | Class registration |
|
|
184
240
|
| `@memberjunction/core-entities` | Content entity types |
|
|
185
|
-
| `@memberjunction/ai` |
|
|
186
|
-
| `@memberjunction/aiengine` | AI Engine
|
|
241
|
+
| `@memberjunction/ai` | Embedding model integration |
|
|
242
|
+
| `@memberjunction/aiengine` | AI Engine for prompt cache access |
|
|
243
|
+
| `@memberjunction/ai-prompts` | AIPromptRunner for managed prompt execution |
|
|
244
|
+
| `@memberjunction/ai-core-plus` | AIPromptParams types |
|
|
245
|
+
| `@memberjunction/ai-vectors` | TextChunker for content chunking |
|
|
246
|
+
| `@memberjunction/ai-vectordb` | VectorDBBase for vector storage |
|
|
187
247
|
| `pdf-parse` | PDF text extraction |
|
|
188
248
|
| `officeparser` | Office document parsing |
|
|
189
249
|
| `cheerio` | HTML parsing |
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { AutotagBase } from "../../Core/index.js";
|
|
1
|
+
import { AutotagBase, AutotagProgressCallback } from "../../Core/index.js";
|
|
2
2
|
import { AutotagBaseEngine } from "../../Engine/index.js";
|
|
3
3
|
import { ContentSourceParams } from "../../Engine/index.js";
|
|
4
4
|
import { UserInfo } from "@memberjunction/core";
|
|
@@ -22,7 +22,7 @@ export declare abstract class CloudStorageBase extends AutotagBase {
|
|
|
22
22
|
* @returns - An array of content source items that have been modified or added after the most recent process run for that content source
|
|
23
23
|
*/
|
|
24
24
|
abstract SetNewAndModifiedContentItems(contentSourceParams: ContentSourceParams, lastRunDate: Date, contextUser: UserInfo): Promise<MJContentItemEntity[]>;
|
|
25
|
-
Autotag(contextUser: UserInfo): Promise<void>;
|
|
25
|
+
Autotag(contextUser: UserInfo, onProgress?: AutotagProgressCallback): Promise<void>;
|
|
26
26
|
SetContentItemsToProcess(contentSources: MJContentSourceEntity[]): Promise<MJContentItemEntity[]>;
|
|
27
27
|
}
|
|
28
28
|
//# sourceMappingURL=CloudStorageBase.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"CloudStorageBase.d.ts","sourceRoot":"","sources":["../../../src/CloudStorage/generic/CloudStorageBase.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"CloudStorageBase.d.ts","sourceRoot":"","sources":["../../../src/CloudStorage/generic/CloudStorageBase.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,YAAY,CAAC;AAClE,OAAO,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAC;AACjD,OAAO,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AACnD,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAChD,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAI3F,8BAAsB,gBAAiB,SAAQ,WAAW;IACtD,SAAS,CAAC,WAAW,EAAE,QAAQ,CAAC;IAChC,SAAS,CAAC,MAAM,EAAE,iBAAiB,CAAC;IACpC,SAAS,CAAC,mBAAmB,EAAE,MAAM,CAAA;;IAOrC;;MAEE;aACc,YAAY,IAAI,OAAO,CAAC,IAAI,CAAC;IAE7C;;;;;;;;MAQE;aACc,6BAA6B,CAAC,mBAAmB,EAAE,mBAAmB,EAAE,WAAW,EAAE,IAAI,EAAE,WAAW,EAAE,QAAQ,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC;IAEpJ,OAAO,CAAC,WAAW,EAAE,QAAQ,EAAE,UAAU,CAAC,EAAE,uBAAuB,GAAG,OAAO,CAAC,IAAI,CAAC;IAQnF,wBAAwB,CAAC,cAAc,EAAE,qBAAqB,EAAE,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC;CAyBjH"}
|
|
@@ -7,12 +7,12 @@ export class CloudStorageBase extends AutotagBase {
|
|
|
7
7
|
super();
|
|
8
8
|
this.engine = AutotagBaseEngine.Instance;
|
|
9
9
|
}
|
|
10
|
-
async Autotag(contextUser) {
|
|
10
|
+
async Autotag(contextUser, onProgress) {
|
|
11
11
|
this.contextUser = contextUser;
|
|
12
12
|
this.contentSourceTypeID = this.engine.SetSubclassContentSourceType('Cloud Storage');
|
|
13
13
|
const contentSources = await this.engine.getAllContentSources(this.contextUser, this.contentSourceTypeID) || [];
|
|
14
14
|
const contentItemsToProcess = await this.SetContentItemsToProcess(contentSources);
|
|
15
|
-
await this.engine.ExtractTextAndProcessWithLLM(contentItemsToProcess, this.contextUser);
|
|
15
|
+
await this.engine.ExtractTextAndProcessWithLLM(contentItemsToProcess, this.contextUser, undefined, undefined, onProgress);
|
|
16
16
|
}
|
|
17
17
|
async SetContentItemsToProcess(contentSources) {
|
|
18
18
|
const contentItemsToProcess = [];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"CloudStorageBase.js","sourceRoot":"","sources":["../../../src/CloudStorage/generic/CloudStorageBase.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,
|
|
1
|
+
{"version":3,"file":"CloudStorageBase.js","sourceRoot":"","sources":["../../../src/CloudStorage/generic/CloudStorageBase.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAA2B,MAAM,YAAY,CAAC;AAClE,OAAO,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAIjD,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,MAAM,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAA;AAE9B,MAAM,OAAgB,gBAAiB,SAAQ,WAAW;IAKtD;QACI,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC;IAC7C,CAAC;IAkBM,KAAK,CAAC,OAAO,CAAC,WAAqB,EAAE,UAAoC;QAC5E,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC;QAC/B,IAAI,CAAC,mBAAmB,GAAG,IAAI,CAAC,MAAM,CAAC,4BAA4B,CAAC,eAAe,CAAC,CAAC;QACrF,MAAM,cAAc,GAA4B,MAAM,IAAI,CAAC,MAAM,CAAC,oBAAoB,CAAC,IAAI,CAAC,WAAW,EAAE,IAAI,CAAC,mBAAmB,CAAC,IAAI,EAAE,CAAC;QACzI,MAAM,qBAAqB,GAA0B,MAAM,IAAI,CAAC,wBAAwB,CAAC,cAAc,CAAC,CAAA;QACxG,MAAM,IAAI,CAAC,MAAM,CAAC,4BAA4B,CAAC,qBAAqB,EAAE,IAAI,CAAC,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;IAC9H,CAAC;IAEM,KAAK,CAAC,wBAAwB,CAAC,cAAuC;QACzE,MAAM,qBAAqB,GAA0B,EAAE,CAAA;QAEvD,KAAK,MAAM,aAAa,IAAI,cAAc,EAAE,CAAC;YACzC,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;YAE1B,MAAM,mBAAmB,GAAwB;gBAC7C,eAAe,EAAE,aAAa,CAAC,EAAE;gBACjC,IAAI,EAAE,aAAa,CAAC,IAAI;gBACxB,aAAa,EAAE,aAAa,CAAC,aAAa;gBAC1C,mBAAmB,EAAE,aAAa,CAAC,mBAAmB;gBACtD,iBAAiB,EAAE,aAAa,CAAC,iBAAiB;gBAClD,GAAG,EAAE,aAAa,CAAC,GAAG;aACzB,CAAA;YAED,MAAM,WAAW,GAAS,MAAM,IAAI,CAAC,MAAM,CAAC,2BAA2B,CAAC,mBAAmB,CAAC,eAAe,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YAE/H,IAAI,WAAW,EAAE,CAAC;gBACd,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,6BAA6B,CAAC,mBAAmB,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;gBAClH,qBAAqB,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC;YAChD,CAAC;QACL,CAAC;QAED,OAAO,qBAAqB,CAAC;IACjC,CAAC;CACJ"}
|
|
@@ -1,3 +1,8 @@
|
|
|
1
1
|
export * from './generic/CloudStorageBase.js';
|
|
2
|
+
export * from './providers/AutotagCloudStorage.js';
|
|
3
|
+
/**
|
|
4
|
+
* @deprecated Use AutotagCloudStorage instead, which works with any MJ Storage provider.
|
|
5
|
+
* AutotagAzureBlob is retained for backward compatibility but will be removed in a future version.
|
|
6
|
+
*/
|
|
2
7
|
export * from './providers/AutotagAzureBlob.js';
|
|
3
8
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/CloudStorage/index.ts"],"names":[],"mappings":"AAAA,cAAc,4BAA4B,CAAA;AAC1C,cAAc,8BAA8B,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/CloudStorage/index.ts"],"names":[],"mappings":"AAAA,cAAc,4BAA4B,CAAA;AAC1C,cAAc,iCAAiC,CAAA;AAE/C;;;GAGG;AACH,cAAc,8BAA8B,CAAA"}
|
|
@@ -1,3 +1,8 @@
|
|
|
1
1
|
export * from './generic/CloudStorageBase.js';
|
|
2
|
+
export * from './providers/AutotagCloudStorage.js';
|
|
3
|
+
/**
|
|
4
|
+
* @deprecated Use AutotagCloudStorage instead, which works with any MJ Storage provider.
|
|
5
|
+
* AutotagAzureBlob is retained for backward compatibility but will be removed in a future version.
|
|
6
|
+
*/
|
|
2
7
|
export * from './providers/AutotagAzureBlob.js';
|
|
3
8
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/CloudStorage/index.ts"],"names":[],"mappings":"AAAA,cAAc,4BAA4B,CAAA;AAC1C,cAAc,8BAA8B,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/CloudStorage/index.ts"],"names":[],"mappings":"AAAA,cAAc,4BAA4B,CAAA;AAC1C,cAAc,iCAAiC,CAAA;AAE/C;;;GAGG;AACH,cAAc,8BAA8B,CAAA"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { UserInfo } from '@memberjunction/core';
|
|
2
|
+
import { MJContentItemEntity } from '@memberjunction/core-entities';
|
|
3
|
+
import { AutotagBase, AutotagProgressCallback } from '../../Core/index.js';
|
|
4
|
+
import { MJContentSourceEntity } from '@memberjunction/core-entities';
|
|
5
|
+
/**
|
|
6
|
+
* Generic cloud storage autotag provider that works with ANY MJ Storage driver.
|
|
7
|
+
*
|
|
8
|
+
* Replaces the previous Azure-specific AutotagAzureBlob by delegating file listing
|
|
9
|
+
* and reading to the MJ Storage abstraction layer (FileStorageBase). This means it
|
|
10
|
+
* works with Azure Blob, AWS S3, Google Cloud Storage, SharePoint, Dropbox, Box, etc.
|
|
11
|
+
*
|
|
12
|
+
* Configuration is driven by ContentSource.Configuration JSON which must include
|
|
13
|
+
* a `FileStorageProviderKey` matching a registered MJ Storage driver.
|
|
14
|
+
*
|
|
15
|
+
* The storage driver is initialized via environment-based config (e.g.,
|
|
16
|
+
* STORAGE_AZURE_ACCOUNT_NAME, STORAGE_AZURE_ACCOUNT_KEY) which the MJ Storage
|
|
17
|
+
* drivers read automatically on construction.
|
|
18
|
+
*/
|
|
19
|
+
export declare class AutotagCloudStorage extends AutotagBase {
|
|
20
|
+
private contextUser;
|
|
21
|
+
private engine;
|
|
22
|
+
protected contentSourceTypeID: string;
|
|
23
|
+
Autotag(contextUser: UserInfo, onProgress?: AutotagProgressCallback): Promise<void>;
|
|
24
|
+
SetContentItemsToProcess(contentSources: MJContentSourceEntity[]): Promise<MJContentItemEntity[]>;
|
|
25
|
+
/**
|
|
26
|
+
* Process a single content source: initialize the storage driver, list files,
|
|
27
|
+
* detect new/modified files, download and extract text, create ContentItems.
|
|
28
|
+
*/
|
|
29
|
+
private ProcessContentSource;
|
|
30
|
+
/**
|
|
31
|
+
* Parse the ContentSource.Configuration JSON to extract cloud storage config.
|
|
32
|
+
*/
|
|
33
|
+
private ParseSourceConfig;
|
|
34
|
+
/**
|
|
35
|
+
* Create and return a storage driver via ClassFactory using the provider key.
|
|
36
|
+
*/
|
|
37
|
+
private CreateStorageDriver;
|
|
38
|
+
/**
|
|
39
|
+
* List all objects in the storage driver that were modified after lastRunDate.
|
|
40
|
+
* Optionally filter by file extension.
|
|
41
|
+
*/
|
|
42
|
+
private ListModifiedObjects;
|
|
43
|
+
/**
|
|
44
|
+
* Download a file, extract text, and create/update a ContentItem.
|
|
45
|
+
*/
|
|
46
|
+
private ProcessSingleFile;
|
|
47
|
+
/**
|
|
48
|
+
* Load existing ContentItems for a source, keyed by lowercase URL for upsert lookups.
|
|
49
|
+
*/
|
|
50
|
+
private LoadExistingContentItems;
|
|
51
|
+
/**
|
|
52
|
+
* Extract text from a file buffer based on file extension.
|
|
53
|
+
* Delegates to the engine's built-in parsers for PDF and Office documents.
|
|
54
|
+
*/
|
|
55
|
+
private ExtractTextFromBuffer;
|
|
56
|
+
/**
|
|
57
|
+
* Get the lowercase file extension including the dot (e.g., '.pdf').
|
|
58
|
+
*/
|
|
59
|
+
private GetFileExtension;
|
|
60
|
+
}
|
|
61
|
+
//# sourceMappingURL=AutotagCloudStorage.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AutotagCloudStorage.d.ts","sourceRoot":"","sources":["../../../src/CloudStorage/providers/AutotagCloudStorage.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,EAA0C,MAAM,sBAAsB,CAAC;AACxF,OAAO,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAEpE,OAAO,EAAE,WAAW,EAAE,uBAAuB,EAAE,MAAM,YAAY,CAAC;AAElE,OAAO,EAAE,qBAAqB,EAAE,MAAM,+BAA+B,CAAC;AAgBtE;;;;;;;;;;;;;GAaG;AACH,qBACa,mBAAoB,SAAQ,WAAW;IAChD,OAAO,CAAC,WAAW,CAAY;IAC/B,OAAO,CAAC,MAAM,CAAqB;IACnC,SAAS,CAAC,mBAAmB,EAAG,MAAM,CAAC;IAE1B,OAAO,CAAC,WAAW,EAAE,QAAQ,EAAE,UAAU,CAAC,EAAE,uBAAuB,GAAG,OAAO,CAAC,IAAI,CAAC;IAenF,wBAAwB,CAAC,cAAc,EAAE,qBAAqB,EAAE,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC;IAgB9G;;;OAGG;YACW,oBAAoB;IA4ClC;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAsBzB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAgB3B;;;OAGG;YACW,mBAAmB;IAsBjC;;OAEG;YACW,iBAAiB;IAsD/B;;OAEG;YACW,wBAAwB;IAmBtC;;;OAGG;YACW,qBAAqB;IAoBnC;;OAEG;IACH,OAAO,CAAC,gBAAgB;CAK3B"}
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
|
|
2
|
+
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
|
|
3
|
+
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
|
|
4
|
+
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
|
|
5
|
+
return c > 3 && r && Object.defineProperty(target, key, r), r;
|
|
6
|
+
};
|
|
7
|
+
import { RegisterClass } from '@memberjunction/global';
|
|
8
|
+
import { MJGlobal } from '@memberjunction/global';
|
|
9
|
+
import { Metadata, RunView, LogStatus, LogError } from '@memberjunction/core';
|
|
10
|
+
import { FileStorageBase } from '@memberjunction/storage';
|
|
11
|
+
import { AutotagBase } from '../../Core/index.js';
|
|
12
|
+
import { AutotagBaseEngine } from '../../Engine/index.js';
|
|
13
|
+
/**
|
|
14
|
+
* Generic cloud storage autotag provider that works with ANY MJ Storage driver.
|
|
15
|
+
*
|
|
16
|
+
* Replaces the previous Azure-specific AutotagAzureBlob by delegating file listing
|
|
17
|
+
* and reading to the MJ Storage abstraction layer (FileStorageBase). This means it
|
|
18
|
+
* works with Azure Blob, AWS S3, Google Cloud Storage, SharePoint, Dropbox, Box, etc.
|
|
19
|
+
*
|
|
20
|
+
* Configuration is driven by ContentSource.Configuration JSON which must include
|
|
21
|
+
* a `FileStorageProviderKey` matching a registered MJ Storage driver.
|
|
22
|
+
*
|
|
23
|
+
* The storage driver is initialized via environment-based config (e.g.,
|
|
24
|
+
* STORAGE_AZURE_ACCOUNT_NAME, STORAGE_AZURE_ACCOUNT_KEY) which the MJ Storage
|
|
25
|
+
* drivers read automatically on construction.
|
|
26
|
+
*/
|
|
27
|
+
let AutotagCloudStorage = class AutotagCloudStorage extends AutotagBase {
|
|
28
|
+
async Autotag(contextUser, onProgress) {
|
|
29
|
+
this.contextUser = contextUser;
|
|
30
|
+
this.engine = AutotagBaseEngine.Instance;
|
|
31
|
+
this.contentSourceTypeID = this.engine.SetSubclassContentSourceType('Cloud Storage');
|
|
32
|
+
const contentSources = await this.engine.getAllContentSources(this.contextUser, this.contentSourceTypeID);
|
|
33
|
+
const contentItemsToProcess = await this.SetContentItemsToProcess(contentSources);
|
|
34
|
+
if (contentItemsToProcess.length > 0) {
|
|
35
|
+
await this.engine.ExtractTextAndProcessWithLLM(contentItemsToProcess, this.contextUser, undefined, undefined, onProgress);
|
|
36
|
+
}
|
|
37
|
+
else {
|
|
38
|
+
LogStatus('AutotagCloudStorage: no new or modified files to process');
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
async SetContentItemsToProcess(contentSources) {
|
|
42
|
+
const contentItemsToProcess = [];
|
|
43
|
+
for (const contentSource of contentSources) {
|
|
44
|
+
try {
|
|
45
|
+
const items = await this.ProcessContentSource(contentSource);
|
|
46
|
+
contentItemsToProcess.push(...items);
|
|
47
|
+
}
|
|
48
|
+
catch (e) {
|
|
49
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
50
|
+
LogError(`AutotagCloudStorage: failed to process source "${contentSource.Name}": ${msg}`);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
return contentItemsToProcess;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Process a single content source: initialize the storage driver, list files,
|
|
57
|
+
* detect new/modified files, download and extract text, create ContentItems.
|
|
58
|
+
*/
|
|
59
|
+
async ProcessContentSource(contentSource) {
|
|
60
|
+
const config = this.ParseSourceConfig(contentSource);
|
|
61
|
+
if (!config)
|
|
62
|
+
return [];
|
|
63
|
+
const driver = this.CreateStorageDriver(config.FileStorageProviderKey);
|
|
64
|
+
if (!driver)
|
|
65
|
+
return [];
|
|
66
|
+
const lastRunDate = await this.engine.getContentSourceLastRunDate(contentSource.ID, this.contextUser);
|
|
67
|
+
const prefix = config.PathPrefix ?? '';
|
|
68
|
+
const objects = await this.ListModifiedObjects(driver, prefix, lastRunDate, config.IncludeExtensions);
|
|
69
|
+
if (objects.length === 0) {
|
|
70
|
+
LogStatus(`AutotagCloudStorage: no modified files in source "${contentSource.Name}" since ${lastRunDate.toISOString()}`);
|
|
71
|
+
return [];
|
|
72
|
+
}
|
|
73
|
+
LogStatus(`AutotagCloudStorage: found ${objects.length} new/modified files in source "${contentSource.Name}"`);
|
|
74
|
+
// Load existing content items for this source to enable upsert by URL
|
|
75
|
+
const existingItems = await this.LoadExistingContentItems(contentSource.ID);
|
|
76
|
+
const contentSourceParams = {
|
|
77
|
+
contentSourceID: contentSource.ID,
|
|
78
|
+
name: contentSource.Name ?? '',
|
|
79
|
+
ContentTypeID: contentSource.ContentTypeID,
|
|
80
|
+
ContentSourceTypeID: contentSource.ContentSourceTypeID,
|
|
81
|
+
ContentFileTypeID: contentSource.ContentFileTypeID,
|
|
82
|
+
URL: contentSource.URL
|
|
83
|
+
};
|
|
84
|
+
const items = [];
|
|
85
|
+
for (const obj of objects) {
|
|
86
|
+
try {
|
|
87
|
+
const item = await this.ProcessSingleFile(driver, obj, contentSourceParams, existingItems);
|
|
88
|
+
if (item)
|
|
89
|
+
items.push(item);
|
|
90
|
+
}
|
|
91
|
+
catch (e) {
|
|
92
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
93
|
+
LogError(`AutotagCloudStorage: failed to process file "${obj.fullPath}": ${msg}`);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
return items;
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* Parse the ContentSource.Configuration JSON to extract cloud storage config.
|
|
100
|
+
*/
|
|
101
|
+
ParseSourceConfig(contentSource) {
|
|
102
|
+
const configObj = contentSource.ConfigurationObject;
|
|
103
|
+
if (!configObj) {
|
|
104
|
+
// No fallback is attempted: Configuration JSON is required, so log the problem and skip this source
|
|
105
|
+
LogError(`AutotagCloudStorage: source "${contentSource.Name}" has no Configuration JSON. Set FileStorageProviderKey in the Configuration field.`);
|
|
106
|
+
return null;
|
|
107
|
+
}
|
|
108
|
+
const raw = configObj;
|
|
109
|
+
const providerKey = raw['FileStorageProviderKey'];
|
|
110
|
+
if (!providerKey) {
|
|
111
|
+
LogError(`AutotagCloudStorage: source "${contentSource.Name}" Configuration is missing FileStorageProviderKey`);
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
return {
|
|
115
|
+
FileStorageProviderKey: providerKey,
|
|
116
|
+
PathPrefix: raw['PathPrefix'] ?? undefined,
|
|
117
|
+
IncludeExtensions: raw['IncludeExtensions'] ?? undefined,
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Create and return a storage driver via ClassFactory using the provider key.
|
|
122
|
+
*/
|
|
123
|
+
CreateStorageDriver(providerKey) {
|
|
124
|
+
const driver = MJGlobal.Instance.ClassFactory.CreateInstance(FileStorageBase, providerKey);
|
|
125
|
+
if (!driver) {
|
|
126
|
+
LogError(`AutotagCloudStorage: no storage driver registered for key "${providerKey}". Ensure the driver is loaded (e.g., import '@memberjunction/storage').`);
|
|
127
|
+
return null;
|
|
128
|
+
}
|
|
129
|
+
if (!driver.IsConfigured) {
|
|
130
|
+
LogError(`AutotagCloudStorage: storage driver "${providerKey}" is not configured. Check environment variables or mj.config.cjs storage settings.`);
|
|
131
|
+
return null;
|
|
132
|
+
}
|
|
133
|
+
return driver;
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* List all objects in the storage driver that were modified after lastRunDate.
|
|
137
|
+
* Optionally filter by file extension.
|
|
138
|
+
*/
|
|
139
|
+
async ListModifiedObjects(driver, prefix, lastRunDate, includeExtensions) {
|
|
140
|
+
const result = await driver.ListObjects(prefix);
|
|
141
|
+
const extSet = includeExtensions?.length
|
|
142
|
+
? new Set(includeExtensions.map(ext => ext.toLowerCase()))
|
|
143
|
+
: null;
|
|
144
|
+
return result.objects.filter(obj => {
|
|
145
|
+
if (obj.isDirectory)
|
|
146
|
+
return false;
|
|
147
|
+
if (obj.lastModified <= lastRunDate)
|
|
148
|
+
return false;
|
|
149
|
+
if (extSet) {
|
|
150
|
+
const ext = this.GetFileExtension(obj.name);
|
|
151
|
+
if (!extSet.has(ext))
|
|
152
|
+
return false;
|
|
153
|
+
}
|
|
154
|
+
return true;
|
|
155
|
+
});
|
|
156
|
+
}
|
|
157
|
+
/**
|
|
158
|
+
* Download a file, extract text, and create/update a ContentItem.
|
|
159
|
+
*/
|
|
160
|
+
async ProcessSingleFile(driver, obj, contentSourceParams, existingItems) {
|
|
161
|
+
// Download the file content
|
|
162
|
+
const buffer = await driver.GetObject({ fullPath: obj.fullPath });
|
|
163
|
+
// Extract text using the engine's built-in parsers (PDF, DOCX, etc.)
|
|
164
|
+
const text = await this.ExtractTextFromBuffer(buffer, obj.name);
|
|
165
|
+
if (!text || text.trim().length === 0) {
|
|
166
|
+
LogStatus(`AutotagCloudStorage: no extractable text from "${obj.fullPath}", skipping`);
|
|
167
|
+
return null;
|
|
168
|
+
}
|
|
169
|
+
const checksum = await this.engine.getChecksumFromText(text);
|
|
170
|
+
const urlKey = obj.fullPath.toLowerCase();
|
|
171
|
+
// Check for existing content item by URL
|
|
172
|
+
const existing = existingItems.get(urlKey);
|
|
173
|
+
if (existing && existing.Checksum === checksum) {
|
|
174
|
+
return null; // Content unchanged
|
|
175
|
+
}
|
|
176
|
+
const md = new Metadata();
|
|
177
|
+
let contentItem;
|
|
178
|
+
if (existing) {
|
|
179
|
+
contentItem = existing;
|
|
180
|
+
}
|
|
181
|
+
else {
|
|
182
|
+
contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
|
|
183
|
+
contentItem.NewRecord();
|
|
184
|
+
contentItem.ContentSourceID = contentSourceParams.contentSourceID;
|
|
185
|
+
contentItem.ContentTypeID = contentSourceParams.ContentTypeID;
|
|
186
|
+
contentItem.ContentSourceTypeID = contentSourceParams.ContentSourceTypeID;
|
|
187
|
+
contentItem.ContentFileTypeID = contentSourceParams.ContentFileTypeID;
|
|
188
|
+
}
|
|
189
|
+
contentItem.Name = obj.name;
|
|
190
|
+
contentItem.Description = this.engine.GetContentItemDescription(contentSourceParams);
|
|
191
|
+
contentItem.URL = obj.fullPath;
|
|
192
|
+
contentItem.Text = text;
|
|
193
|
+
contentItem.Checksum = checksum;
|
|
194
|
+
const saved = await contentItem.Save();
|
|
195
|
+
if (!saved) {
|
|
196
|
+
throw new Error(`Failed to save ContentItem for "${obj.fullPath}"`);
|
|
197
|
+
}
|
|
198
|
+
existingItems.set(urlKey, contentItem);
|
|
199
|
+
return contentItem;
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* Load existing ContentItems for a source, keyed by lowercase URL for upsert lookups.
|
|
203
|
+
*/
|
|
204
|
+
async LoadExistingContentItems(contentSourceID) {
|
|
205
|
+
const rv = new RunView();
|
|
206
|
+
const result = await rv.RunView({
|
|
207
|
+
EntityName: 'MJ: Content Items',
|
|
208
|
+
ExtraFilter: `ContentSourceID='${contentSourceID}'`,
|
|
209
|
+
ResultType: 'entity_object'
|
|
210
|
+
}, this.contextUser);
|
|
211
|
+
const map = new Map();
|
|
212
|
+
if (result.Success) {
|
|
213
|
+
for (const ci of result.Results) {
|
|
214
|
+
if (ci.URL) {
|
|
215
|
+
map.set(ci.URL.toLowerCase(), ci);
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
return map;
|
|
220
|
+
}
|
|
221
|
+
/**
|
|
222
|
+
* Extract text from a file buffer based on file extension.
|
|
223
|
+
* Delegates to the engine's built-in parsers for PDF and Office documents.
|
|
224
|
+
*/
|
|
225
|
+
async ExtractTextFromBuffer(buffer, fileName) {
|
|
226
|
+
const ext = this.GetFileExtension(fileName);
|
|
227
|
+
if (ext === '.pdf' || ext === '.docx' || ext === '.doc' || ext === '.pptx' || ext === '.xlsx') {
|
|
228
|
+
return this.engine.parsePDF(buffer);
|
|
229
|
+
}
|
|
230
|
+
if (ext === '.txt' || ext === '.md' || ext === '.csv' || ext === '.json' || ext === '.xml' || ext === '.html') {
|
|
231
|
+
return buffer.toString('utf-8');
|
|
232
|
+
}
|
|
233
|
+
// Attempt parsePDF as fallback for unknown formats (officeparser handles many formats)
|
|
234
|
+
try {
|
|
235
|
+
return await this.engine.parsePDF(buffer);
|
|
236
|
+
}
|
|
237
|
+
catch {
|
|
238
|
+
LogStatus(`AutotagCloudStorage: unsupported file format "${ext}" for "${fileName}"`);
|
|
239
|
+
return '';
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
/**
|
|
243
|
+
* Get the lowercase file extension including the dot (e.g., '.pdf').
|
|
244
|
+
*/
|
|
245
|
+
GetFileExtension(fileName) {
|
|
246
|
+
const lastDot = fileName.lastIndexOf('.');
|
|
247
|
+
if (lastDot < 0)
|
|
248
|
+
return '';
|
|
249
|
+
return fileName.substring(lastDot).toLowerCase();
|
|
250
|
+
}
|
|
251
|
+
};
|
|
252
|
+
// Compiler-emitted decorator application: runs RegisterClass(AutotagBase, 'AutotagCloudStorage') against the class.
// NOTE(review): presumably this registers the class with the MJ ClassFactory under that key — confirm in @memberjunction/global.
AutotagCloudStorage = __decorate([
|
|
253
|
+
RegisterClass(AutotagBase, 'AutotagCloudStorage')
|
|
254
|
+
], AutotagCloudStorage);
|
|
255
|
+
// The decorated class is the module's only export visible in this section.
export { AutotagCloudStorage };
|
|
256
|
+
//# sourceMappingURL=AutotagCloudStorage.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AutotagCloudStorage.js","sourceRoot":"","sources":["../../../src/CloudStorage/providers/AutotagCloudStorage.ts"],"names":[],"mappings":";;;;;;AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,wBAAwB,CAAC;AACvD,OAAO,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAC;AAClD,OAAO,EAAY,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAExF,OAAO,EAAE,eAAe,EAAyB,MAAM,yBAAyB,CAAC;AACjF,OAAO,EAAE,WAAW,EAA2B,MAAM,YAAY,CAAC;AAClE,OAAO,EAAE,iBAAiB,EAAuB,MAAM,cAAc,CAAC;AAiBtE;;;;;;;;;;;;;GAaG;AAEI,IAAM,mBAAmB,GAAzB,MAAM,mBAAoB,SAAQ,WAAW;IAKzC,KAAK,CAAC,OAAO,CAAC,WAAqB,EAAE,UAAoC;QAC5E,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC;QAC/B,IAAI,CAAC,MAAM,GAAG,iBAAiB,CAAC,QAAQ,CAAC;QACzC,IAAI,CAAC,mBAAmB,GAAG,IAAI,CAAC,MAAM,CAAC,4BAA4B,CAAC,eAAe,CAAC,CAAC;QAErF,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,oBAAoB,CAAC,IAAI,CAAC,WAAW,EAAE,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAC1G,MAAM,qBAAqB,GAAG,MAAM,IAAI,CAAC,wBAAwB,CAAC,cAAc,CAAC,CAAC;QAElF,IAAI,qBAAqB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,CAAC,MAAM,CAAC,4BAA4B,CAAC,qBAAqB,EAAE,IAAI,CAAC,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;QAC9H,CAAC;aAAM,CAAC;YACJ,SAAS,CAAC,0DAA0D,CAAC,CAAC;QAC1E,CAAC;IACL,CAAC;IAEM,KAAK,CAAC,wBAAwB,CAAC,cAAuC;QACzE,MAAM,qBAAqB,GAA0B,EAAE,CAAC;QAExD,KAAK,MAAM,aAAa,IAAI,cAAc,EAAE,CAAC;YACzC,IAAI,CAAC;gBACD,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,aAAa,CAAC,CAAC;gBAC7D,qBAAqB,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC;YACzC,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACT,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;gBACvD,QAAQ,CAAC,kDAAkD,aAAa,CAAC,IAAI,MAAM,GAAG,EAAE,CAAC,CAAC;YAC9F,CAAC;QACL,CAAC;QAED,OAAO,qBAAqB,CAAC;IACjC,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,oBAAoB,CAAC,aAAoC;QACnE,MAAM,MAAM,GAAG,IAAI,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,CAAC;QAEvB,MAAM,MAAM,GAAG,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC;QACvE,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,CAAC;QAEvB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,2BAA2B,CAAC,aAAa,CAAC,EAAE,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;QACtG,MAAM,MAAM,GAAG,M
AAM,CAAC,UAAU,IAAI,EAAE,CAAC;QACvC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,mBAAmB,CAAC,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,CAAC,iBAAiB,CAAC,CAAC;QAEtG,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,SAAS,CAAC,qDAAqD,aAAa,CAAC,IAAI,WAAW,WAAW,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;YACzH,OAAO,EAAE,CAAC;QACd,CAAC;QAED,SAAS,CAAC,8BAA8B,OAAO,CAAC,MAAM,kCAAkC,aAAa,CAAC,IAAI,GAAG,CAAC,CAAC;QAE/G,sEAAsE;QACtE,MAAM,aAAa,GAAG,MAAM,IAAI,CAAC,wBAAwB,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC;QAE5E,MAAM,mBAAmB,GAAwB;YAC7C,eAAe,EAAE,aAAa,CAAC,EAAE;YACjC,IAAI,EAAE,aAAa,CAAC,IAAI,IAAI,EAAE;YAC9B,aAAa,EAAE,aAAa,CAAC,aAAa;YAC1C,mBAAmB,EAAE,aAAa,CAAC,mBAAmB;YACtD,iBAAiB,EAAE,aAAa,CAAC,iBAAiB;YAClD,GAAG,EAAE,aAAa,CAAC,GAAG;SACzB,CAAC;QAEF,MAAM,KAAK,GAA0B,EAAE,CAAC;QACxC,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;YACxB,IAAI,CAAC;gBACD,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,MAAM,EAAE,GAAG,EAAE,mBAAmB,EAAE,aAAa,CAAC,CAAC;gBAC3F,IAAI,IAAI;oBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC/B,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACT,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;gBACvD,QAAQ,CAAC,gDAAgD,GAAG,CAAC,QAAQ,MAAM,GAAG,EAAE,CAAC,CAAC;YACtF,CAAC;QACL,CAAC;QAED,OAAO,KAAK,CAAC;IACjB,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,aAAoC;QAC1D,MAAM,SAAS,GAAG,aAAa,CAAC,mBAAmB,CAAC;QACpD,IAAI,CAAC,SAAS,EAAE,CAAC;YACb,mDAAmD;YACnD,QAAQ,CAAC,gCAAgC,aAAa,CAAC,IAAI,qFAAqF,CAAC,CAAC;YAClJ,OAAO,IAAI,CAAC;QAChB,CAAC;QAED,MAAM,GAAG,GAAG,SAAoC,CAAC;QACjD,MAAM,WAAW,GAAG,GAAG,CAAC,wBAAwB,CAAuB,CAAC;QACxE,IAAI,CAAC,WAAW,EAAE,CAAC;YACf,QAAQ,CAAC,gCAAgC,aAAa,CAAC,IAAI,mDAAmD,CAAC,CAAC;YAChH,OAAO,IAAI,CAAC;QAChB,CAAC;QAED,OAAO;YACH,sBAAsB,EAAE,WAAW;YACnC,UAAU,EAAG,GAAG,CAAC,YAAY,CAAY,IAAI,SAAS;YACtD,iBAAiB,EAAG,GAAG,CAAC,mBAAmB,CAAc,IAAI,SAAS;SACzE,CAAC;IACN,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,WAAmB;QAC3C,MAAM,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC,cAAc,CACxD,eAAe,EACf,WAAW,CACd,CAAC;QACF,IAAI,CAAC,MAAM,EAAE,CAAC;YACV,QAAQ,CAAC,8DAA8D,WAAW,0EAA0E,CAAC,CAAC;YAC9J,OAAO,IAAI,CAAC;QAChB,CAAC;QACD,IAAI,CAAC,MAAM,CAAC,YAAY,EAA
E,CAAC;YACvB,QAAQ,CAAC,wCAAwC,WAAW,qFAAqF,CAAC,CAAC;YACnJ,OAAO,IAAI,CAAC;QAChB,CAAC;QACD,OAAO,MAAM,CAAC;IAClB,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,mBAAmB,CAC7B,MAAuB,EACvB,MAAc,EACd,WAAiB,EACjB,iBAA4B;QAE5B,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAChD,MAAM,MAAM,GAAG,iBAAiB,EAAE,MAAM;YACpC,CAAC,CAAC,IAAI,GAAG,CAAC,iBAAiB,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,CAAC;YAC1D,CAAC,CAAC,IAAI,CAAC;QAEX,OAAO,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;YAC/B,IAAI,GAAG,CAAC,WAAW;gBAAE,OAAO,KAAK,CAAC;YAClC,IAAI,GAAG,CAAC,YAAY,IAAI,WAAW;gBAAE,OAAO,KAAK,CAAC;YAClD,IAAI,MAAM,EAAE,CAAC;gBACT,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC5C,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC;oBAAE,OAAO,KAAK,CAAC;YACvC,CAAC;YACD,OAAO,IAAI,CAAC;QAChB,CAAC,CAAC,CAAC;IACP,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,iBAAiB,CAC3B,MAAuB,EACvB,GAA0B,EAC1B,mBAAwC,EACxC,aAA+C;QAE/C,4BAA4B;QAC5B,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;QAElE,qEAAqE;QACrE,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,qBAAqB,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAChE,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACpC,SAAS,CAAC,kDAAkD,GAAG,CAAC,QAAQ,aAAa,CAAC,CAAC;YACvF,OAAO,IAAI,CAAC;QAChB,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC;QAC7D,MAAM,MAAM,GAAG,GAAG,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;QAE1C,yCAAyC;QACzC,MAAM,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAC3C,IAAI,QAAQ,IAAI,QAAQ,CAAC,QAAQ,KAAK,QAAQ,EAAE,CAAC;YAC7C,OAAO,IAAI,CAAC,CAAC,oBAAoB;QACrC,CAAC;QAED,MAAM,EAAE,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,WAAgC,CAAC;QAErC,IAAI,QAAQ,EAAE,CAAC;YACX,WAAW,GAAG,QAAQ,CAAC;QAC3B,CAAC;aAAM,CAAC;YACJ,WAAW,GAAG,MAAM,EAAE,CAAC,eAAe,CAAsB,mBAAmB,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YACnG,WAAW,CAAC,SAAS,EAAE,CAAC;YACxB,WAAW,CAAC,eAAe,GAAG,mBAAmB,CAAC,eAAe,CAAC;YAClE,WAAW,CAAC,aAAa,GAAG,mBAAmB,CAAC,aAAa,CAAC;YAC9D,WAAW,CAAC,mBAAmB,GAAG,mBAAmB,CAAC,mBAAmB,CAAC;YAC1E,WAAW,CAAC,iBAAiB,GAAG,mBAAmB,CAAC,iBAAiB,CAAC;QAC1E,CAAC
;QAED,WAAW,CAAC,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;QAC5B,WAAW,CAAC,WAAW,GAAG,IAAI,CAAC,MAAM,CAAC,yBAAyB,CAAC,mBAAmB,CAAC,CAAC;QACrF,WAAW,CAAC,GAAG,GAAG,GAAG,CAAC,QAAQ,CAAC;QAC/B,WAAW,CAAC,IAAI,GAAG,IAAI,CAAC;QACxB,WAAW,CAAC,QAAQ,GAAG,QAAQ,CAAC;QAEhC,MAAM,KAAK,GAAG,MAAM,WAAW,CAAC,IAAI,EAAE,CAAC;QACvC,IAAI,CAAC,KAAK,EAAE,CAAC;YACT,MAAM,IAAI,KAAK,CAAC,mCAAmC,GAAG,CAAC,QAAQ,GAAG,CAAC,CAAC;QACxE,CAAC;QAED,aAAa,CAAC,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QACvC,OAAO,WAAW,CAAC;IACvB,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,wBAAwB,CAAC,eAAuB;QAC1D,MAAM,EAAE,GAAG,IAAI,OAAO,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,OAAO,CAAsB;YACjD,UAAU,EAAE,mBAAmB;YAC/B,WAAW,EAAE,oBAAoB,eAAe,GAAG;YACnD,UAAU,EAAE,eAAe;SAC9B,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;QAErB,MAAM,GAAG,GAAG,IAAI,GAAG,EAA+B,CAAC;QACnD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACjB,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;gBAC9B,IAAI,EAAE,CAAC,GAAG,EAAE,CAAC;oBACT,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,EAAE,CAAC,CAAC;gBACtC,CAAC;YACL,CAAC;QACL,CAAC;QACD,OAAO,GAAG,CAAC;IACf,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,qBAAqB,CAAC,MAAc,EAAE,QAAgB;QAChE,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QAE5C,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,OAAO,EAAE,CAAC;YAC5F,OAAO,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACxC,CAAC;QAED,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,OAAO,EAAE,CAAC;YAC5G,OAAO,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QACpC,CAAC;QAED,uFAAuF;QACvF,IAAI,CAAC;YACD,OAAO,MAAM,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC9C,CAAC;QAAC,MAAM,CAAC;YACL,SAAS,CAAC,iDAAiD,GAAG,UAAU,QAAQ,GAAG,CAAC,CAAC;YACrF,OAAO,EAAE,CAAC;QACd,CAAC;IACL,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,QAAgB;QACrC,MAAM,OAAO,GAAG,QAAQ,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,OAAO,GAAG,CAAC;YAAE,OAAO,EAAE,CAAC;QAC3B,OAAO,QAAQ,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;IACrD,CAAC;CACJ,CAAA;AAzQY,mBAAmB;IAD/B,aAAa,CAAC,WAAW,EAAE,qBAAqB,CAAC;GACrC,mBAAm
B,CAyQ/B"}
|
|
@@ -1,7 +1,15 @@
|
|
|
1
1
|
import { UserInfo } from '@memberjunction/core';
|
|
2
2
|
import { MJContentSourceEntity, MJContentItemEntity } from '@memberjunction/core-entities';
|
|
3
|
+
/**
 * Progress callback for per-item updates during autotagging.
 * Receives two numbers (`processed`, `total`) and an optional `currentItem` string; returns nothing.
 * NOTE(review): presumably `processed`/`total` are item counts and `currentItem` is a display label — confirm against the engine.
 */
|
|
4
|
+
export type AutotagProgressCallback = (processed: number, total: number, currentItem?: string) => void;
|
|
3
5
|
/** Abstract contract for an autotagging source type: subclasses implement both members below. */
export declare abstract class AutotagBase {
|
|
4
6
|
/** Takes the content sources and resolves to content item entities. NOTE(review): presumably these are the items that need (re)processing — confirm in the implementations. */
abstract SetContentItemsToProcess(contentSources: MJContentSourceEntity[]): Promise<MJContentItemEntity[]>;
|
|
5
|
-
|
|
7
|
+
/**
|
|
8
|
+
* Run autotagging for this source type.
|
|
9
|
+
* @param contextUser - The user context for server-side operations
|
|
10
|
+
* @param onProgress - Optional progress callback
|
|
11
|
+
* @param contentSourceIDs - Optional filter: only process these specific source IDs. If omitted, processes all sources for this type.
|
|
12
|
+
*/
|
|
13
|
+
abstract Autotag(contextUser: UserInfo, onProgress?: AutotagProgressCallback, contentSourceIDs?: string[]): Promise<void>;
|
|
6
14
|
}
|
|
7
15
|
//# sourceMappingURL=AutotagBase.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AutotagBase.d.ts","sourceRoot":"","sources":["../../../src/Core/generic/AutotagBase.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAChD,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAE3F,8BAAsB,WAAW;aACb,wBAAwB,CAAC,cAAc,EAAE,qBAAqB,EAAE,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"AutotagBase.d.ts","sourceRoot":"","sources":["../../../src/Core/generic/AutotagBase.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAChD,OAAO,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAE3F,gEAAgE;AAChE,MAAM,MAAM,uBAAuB,GAAG,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,WAAW,CAAC,EAAE,MAAM,KAAK,IAAI,CAAC;AAEvG,8BAAsB,WAAW;aACb,wBAAwB,CAAC,cAAc,EAAE,qBAAqB,EAAE,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC;IACjH;;;;;OAKG;aACa,OAAO,CAAC,WAAW,EAAE,QAAQ,EAAE,UAAU,CAAC,EAAE,uBAAuB,EAAE,gBAAgB,CAAC,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;CACnI"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AutotagBase.js","sourceRoot":"","sources":["../../../src/Core/generic/AutotagBase.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"AutotagBase.js","sourceRoot":"","sources":["../../../src/Core/generic/AutotagBase.ts"],"names":[],"mappings":"AAMA,MAAM,OAAgB,WAAW;CAShC"}
|