@memberjunction/content-autotagging 5.37.0 → 5.39.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -6
- package/dist/Engine/generic/AutotagBaseEngine.d.ts +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
- package/dist/Engine/generic/AutotagBaseEngine.js +81 -22
- package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
- package/dist/Engine/generic/RunBudget.d.ts +21 -2
- package/dist/Engine/generic/RunBudget.d.ts.map +1 -1
- package/dist/Engine/generic/RunBudget.js +23 -1
- package/dist/Engine/generic/RunBudget.js.map +1 -1
- package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
- package/dist/Entity/generic/AutotagEntity.js +9 -4
- package/dist/Entity/generic/AutotagEntity.js.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.d.ts +165 -26
- package/dist/Websites/generic/AutotagWebsite.d.ts.map +1 -1
- package/dist/Websites/generic/AutotagWebsite.js +489 -130
- package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
- package/package.json +17 -16
|
@@ -9,8 +9,9 @@ var __metadata = (this && this.__metadata) || function (k, v) {
|
|
|
9
9
|
};
|
|
10
10
|
import { AutotagBase } from '../../Core/index.js';
|
|
11
11
|
import { AutotagBaseEngine } from '../../Engine/index.js';
|
|
12
|
-
import {
|
|
13
|
-
import {
|
|
12
|
+
import { RunBudget } from '../../Engine/generic/RunBudget.js';
|
|
13
|
+
import { RegisterClass, NormalizeUUID } from '@memberjunction/global';
|
|
14
|
+
import { RunView, LogStatus } from '@memberjunction/core';
|
|
14
15
|
import * as cheerio from 'cheerio';
|
|
15
16
|
import axios from 'axios';
|
|
16
17
|
import { URL } from 'url';
|
|
@@ -19,9 +20,187 @@ dotenv.config({ quiet: true });
|
|
|
19
20
|
let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
20
21
|
constructor() {
|
|
21
22
|
super();
|
|
23
|
+
// Sensible defaults — overridable per content source via ContentSourceParam rows.
|
|
24
|
+
// CrawlSitesInLowerLevelDomain=true + MaxDepth=2 means we crawl the start URL
|
|
25
|
+
// plus two levels of in-domain links by default (~root + sections + content pages).
|
|
26
|
+
// CrawlOtherSitesInTopLevelDomain stays false to avoid accidentally fanning out
|
|
27
|
+
// across sibling paths of the seed URL unless explicitly opted in.
|
|
28
|
+
this.CrawlOtherSitesInTopLevelDomain = false;
|
|
29
|
+
this.CrawlSitesInLowerLevelDomain = true;
|
|
30
|
+
this.MaxDepth = 2;
|
|
31
|
+
/**
|
|
32
|
+
* Per-source RunBudget tracker, keyed by normalized source ID. Items
|
|
33
|
+
* processed in each batch are tallied against the budget of the source
|
|
34
|
+
* they belong to; when any budget exhausts, the engine's OnAfterBatch
|
|
35
|
+
* gate returns `continue:false` and the run pauses gracefully. Next
|
|
36
|
+
* invocation will re-crawl, change-detection will skip the already-
|
|
37
|
+
* processed pages, and the remaining ones get processed.
|
|
38
|
+
*/
|
|
39
|
+
this.sourceBudgetMap = new Map();
|
|
22
40
|
this.engine = AutotagBaseEngine.Instance;
|
|
23
41
|
this.visitedURLs = new Set();
|
|
24
42
|
}
|
|
43
|
+
/**
|
|
44
|
+
* Reset crawl-related instance fields back to the class defaults. Called at the
|
|
45
|
+
* start of each content source so prior-source overrides don't leak forward.
|
|
46
|
+
* URLPattern and RootURL default to undefined — derived later if unset.
|
|
47
|
+
*/
|
|
48
|
+
applyDefaultCrawlSettings() {
|
|
49
|
+
this.CrawlOtherSitesInTopLevelDomain = false;
|
|
50
|
+
this.CrawlSitesInLowerLevelDomain = true;
|
|
51
|
+
this.MaxDepth = 2;
|
|
52
|
+
this.URLPattern = undefined;
|
|
53
|
+
this.RootURL = undefined;
|
|
54
|
+
}
|
|
55
|
+
/**
 * Documentation for `applyWebsiteConfigFromSource` (the method itself is defined
 * further down, after `coerceBoolean`; the compiled output emits this comment
 * here, ahead of the `overlayCrawlParamsFromMap` comment).
 *
 * Apply the typed `Configuration.Website` sub-object (if present) to this
 * crawler instance. Each field is optional — unset values leave the existing
 * (default) instance value intact.
 *
 * NOTE: pluggability — today this is hard-coded for AutotagWebsite. Once we
 * have more source types that need typed per-instance settings (RSS, Cloud
 * Storage, etc.), this pattern should be promoted to an
 * `IConfigurableContentSource<TConfig>` interface where each subclass declares
 * its typed config sub-object key and shape. For now this is the canonical
 * shape; other autotaggers can copy it when they need typed knobs.
 */
|
|
67
|
+
/**
|
|
68
|
+
* Apply per-source `ContentSourceParam` rows to this crawler instance. Values
|
|
69
|
+
* stored in the DB are strings, so we coerce per-key to the right runtime type
|
|
70
|
+
* instead of bulk-assigning (which previously stuffed strings into number /
|
|
71
|
+
* boolean fields and relied on JS coercion at use sites).
|
|
72
|
+
*
|
|
73
|
+
* Unknown keys are silently ignored — same gate as the prior `if (key in this)`
|
|
74
|
+
* check, just made explicit.
|
|
75
|
+
*/
|
|
76
|
+
overlayCrawlParamsFromMap(params) {
|
|
77
|
+
const maxDepth = params.get('MaxDepth');
|
|
78
|
+
if (maxDepth != null) {
|
|
79
|
+
const n = this.coerceNumber(maxDepth);
|
|
80
|
+
if (n != null)
|
|
81
|
+
this.MaxDepth = n;
|
|
82
|
+
}
|
|
83
|
+
const lower = params.get('CrawlSitesInLowerLevelDomain');
|
|
84
|
+
if (lower != null)
|
|
85
|
+
this.CrawlSitesInLowerLevelDomain = this.coerceBoolean(lower);
|
|
86
|
+
const other = params.get('CrawlOtherSitesInTopLevelDomain');
|
|
87
|
+
if (other != null)
|
|
88
|
+
this.CrawlOtherSitesInTopLevelDomain = this.coerceBoolean(other);
|
|
89
|
+
const pattern = params.get('URLPattern');
|
|
90
|
+
// RegExp values come pre-compiled by the engine — store the source text
|
|
91
|
+
// so the crawler's `new RegExp(this.URLPattern)` call stays consistent.
|
|
92
|
+
if (pattern instanceof RegExp) {
|
|
93
|
+
this.URLPattern = pattern.source;
|
|
94
|
+
}
|
|
95
|
+
else if (typeof pattern === 'string' && pattern.length > 0) {
|
|
96
|
+
this.URLPattern = pattern;
|
|
97
|
+
}
|
|
98
|
+
const root = params.get('RootURL');
|
|
99
|
+
if (typeof root === 'string' && root.length > 0)
|
|
100
|
+
this.RootURL = root;
|
|
101
|
+
}
|
|
102
|
+
coerceBoolean(value) {
|
|
103
|
+
if (typeof value === 'boolean')
|
|
104
|
+
return value;
|
|
105
|
+
if (typeof value === 'string') {
|
|
106
|
+
const trimmed = value.trim().toLowerCase();
|
|
107
|
+
return trimmed === 'true' || trimmed === '1' || trimmed === 'yes';
|
|
108
|
+
}
|
|
109
|
+
return false;
|
|
110
|
+
}
|
|
111
|
+
applyWebsiteConfigFromSource(source) {
|
|
112
|
+
const w = source.ConfigurationObject?.Website;
|
|
113
|
+
if (!w)
|
|
114
|
+
return;
|
|
115
|
+
if (typeof w.MaxDepth === 'number' && Number.isFinite(w.MaxDepth))
|
|
116
|
+
this.MaxDepth = w.MaxDepth;
|
|
117
|
+
if (typeof w.CrawlSitesInLowerLevelDomain === 'boolean')
|
|
118
|
+
this.CrawlSitesInLowerLevelDomain = w.CrawlSitesInLowerLevelDomain;
|
|
119
|
+
if (typeof w.CrawlOtherSitesInTopLevelDomain === 'boolean')
|
|
120
|
+
this.CrawlOtherSitesInTopLevelDomain = w.CrawlOtherSitesInTopLevelDomain;
|
|
121
|
+
if (typeof w.URLPattern === 'string' && w.URLPattern.length > 0)
|
|
122
|
+
this.URLPattern = w.URLPattern;
|
|
123
|
+
if (typeof w.RootURL === 'string' && w.RootURL.length > 0)
|
|
124
|
+
this.RootURL = w.RootURL;
|
|
125
|
+
}
|
|
126
|
+
/**
|
|
127
|
+
* Build a per-source RunBudget map from each source's ConfigurationObject.
|
|
128
|
+
* Sources with no budget knobs set still get a RunBudget entry (with all
|
|
129
|
+
* limits = null) so the OnAfterBatch hook can update item counts uniformly.
|
|
130
|
+
*
|
|
131
|
+
* Per-source overrides via ContentSourceParam rows (e.g., MaxItemsPerRun
|
|
132
|
+
* stored as a param) take precedence over the ConfigurationObject value.
|
|
133
|
+
*/
|
|
134
|
+
async setupRunBudgets(contentSources) {
|
|
135
|
+
this.sourceBudgetMap = new Map();
|
|
136
|
+
for (const source of contentSources) {
|
|
137
|
+
const id = NormalizeUUID(source.ID);
|
|
138
|
+
const cfg = source.ConfigurationObject;
|
|
139
|
+
const params = await this.engine.getContentSourceParams(source, this.contextUser);
|
|
140
|
+
// ContentSourceParam override beats ConfigurationObject — that
|
|
141
|
+
// lets the per-source-instance UI knob win over the global
|
|
142
|
+
// ContentSource defaults.
|
|
143
|
+
const paramMaxItems = params?.get('MaxItemsPerRun');
|
|
144
|
+
const paramMaxTokens = params?.get('MaxTokensPerRun');
|
|
145
|
+
const paramMaxCost = params?.get('MaxCostPerRun');
|
|
146
|
+
this.sourceBudgetMap.set(id, new RunBudget({
|
|
147
|
+
MaxItemsPerRun: this.coerceNumber(paramMaxItems) ?? this.readConfigNumber(cfg, 'MaxItemsPerRun'),
|
|
148
|
+
MaxNewTagsPerRun: this.readConfigNumber(cfg, 'MaxNewTagsPerRun'),
|
|
149
|
+
MaxNewTagsPerItem: this.readConfigNumber(cfg, 'MaxNewTagsPerItem'),
|
|
150
|
+
MaxTokensPerRun: this.coerceNumber(paramMaxTokens) ?? this.readConfigNumber(cfg, 'MaxTokensPerRun'),
|
|
151
|
+
MaxCostPerRun: this.coerceNumber(paramMaxCost) ?? this.readConfigNumber(cfg, 'MaxCostPerRun'),
|
|
152
|
+
}));
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
coerceNumber(value) {
|
|
156
|
+
if (value == null)
|
|
157
|
+
return null;
|
|
158
|
+
if (typeof value === 'number')
|
|
159
|
+
return Number.isFinite(value) ? value : null;
|
|
160
|
+
if (typeof value === 'string') {
|
|
161
|
+
const n = Number(value);
|
|
162
|
+
return Number.isFinite(n) ? n : null;
|
|
163
|
+
}
|
|
164
|
+
return null;
|
|
165
|
+
}
|
|
166
|
+
/**
|
|
167
|
+
* Safely read a numeric budget knob from the typed configuration object.
|
|
168
|
+
* Returns null when the field is unset or not a finite number.
|
|
169
|
+
*/
|
|
170
|
+
readConfigNumber(cfg, key) {
|
|
171
|
+
if (!cfg)
|
|
172
|
+
return null;
|
|
173
|
+
return this.coerceNumber(cfg[key]);
|
|
174
|
+
}
|
|
175
|
+
/**
|
|
176
|
+
* Install the engine's OnAfterBatch hook so each batch's items are
|
|
177
|
+
* counted against the budget of the source they belong to. Returns
|
|
178
|
+
* `continue:false` from the gate when any source's budget exhausts,
|
|
179
|
+
* which the engine then translates into a graceful pause.
|
|
180
|
+
*/
|
|
181
|
+
installBudgetGate() {
|
|
182
|
+
this.engine.OnAfterBatch = async (batch, _totalProcessed) => {
|
|
183
|
+
// Tally items per source within this batch.
|
|
184
|
+
const perSourceCounts = new Map();
|
|
185
|
+
for (const item of batch) {
|
|
186
|
+
if (!item.ContentSourceID)
|
|
187
|
+
continue;
|
|
188
|
+
const id = NormalizeUUID(item.ContentSourceID);
|
|
189
|
+
perSourceCounts.set(id, (perSourceCounts.get(id) ?? 0) + 1);
|
|
190
|
+
}
|
|
191
|
+
for (const [id, count] of perSourceCounts) {
|
|
192
|
+
const budget = this.sourceBudgetMap.get(id);
|
|
193
|
+
if (!budget)
|
|
194
|
+
continue;
|
|
195
|
+
budget.recordItemsProcessed(count);
|
|
196
|
+
const verdict = budget.checkBudgets();
|
|
197
|
+
if (!verdict.ok) {
|
|
198
|
+
return { continue: false, reason: `${verdict.reason}: ${verdict.details ?? ''}` };
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
return { continue: true };
|
|
202
|
+
};
|
|
203
|
+
}
|
|
25
204
|
getContextUser() {
|
|
26
205
|
return this.contextUser;
|
|
27
206
|
}
|
|
@@ -36,28 +215,69 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
36
215
|
this.contextUser = contextUser;
|
|
37
216
|
this.contentSourceTypeID = this.engine.SetSubclassContentSourceType('Website');
|
|
38
217
|
const contentSources = await this.engine.getAllContentSources(this.contextUser, this.contentSourceTypeID);
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
218
|
+
// Per-source budget setup — produces a RunBudget for each content
|
|
219
|
+
// source and installs the OnAfterBatch gate on the engine so the
|
|
220
|
+
// run pauses gracefully when any source exhausts its MaxItemsPerRun /
|
|
221
|
+
// tokens / cost / tag budget.
|
|
222
|
+
await this.setupRunBudgets(contentSources);
|
|
223
|
+
this.installBudgetGate();
|
|
224
|
+
// Stream content items source-by-source into the LLM batcher. The
|
|
225
|
+
// crawl phase produces items as soon as they pass change-detection,
|
|
226
|
+
// and the LLM phase consumes them in batches without waiting for the
|
|
227
|
+
// last source to finish crawling. Wall-clock time becomes
|
|
228
|
+
// max(crawl, classify) + a small buffer instead of crawl + classify.
|
|
229
|
+
let itemsYielded = 0;
|
|
230
|
+
const streamSource = this;
|
|
231
|
+
const itemStream = (async function* () {
|
|
232
|
+
for await (const item of streamSource.streamContentItemsToProcess(contentSources)) {
|
|
233
|
+
itemsYielded++;
|
|
234
|
+
yield item;
|
|
235
|
+
}
|
|
236
|
+
})();
|
|
237
|
+
try {
|
|
238
|
+
await this.engine.ExtractTextAndProcessWithLLM(itemStream, this.contextUser, undefined, undefined, onProgress);
|
|
239
|
+
}
|
|
240
|
+
finally {
|
|
241
|
+
// Clean up engine state — leaving stale hooks around would leak
|
|
242
|
+
// budget state into the next Autotag invocation on a shared
|
|
243
|
+
// engine singleton.
|
|
244
|
+
this.engine.OnAfterBatch = null;
|
|
245
|
+
}
|
|
246
|
+
// Surface per-source budget pause reasons in the log so operators can
|
|
247
|
+
// see why a run stopped short.
|
|
248
|
+
for (const [sourceID, budget] of this.sourceBudgetMap) {
|
|
249
|
+
const verdict = budget.checkBudgets();
|
|
250
|
+
if (!verdict.ok) {
|
|
251
|
+
LogStatus(`[autotag-website] Source ${sourceID} reached budget: ${verdict.reason} — ${verdict.details ?? ''}`);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
return itemsYielded;
|
|
42
255
|
}
|
|
43
256
|
/**
|
|
44
|
-
*
|
|
45
|
-
*
|
|
46
|
-
*
|
|
47
|
-
*
|
|
257
|
+
* Streaming variant: yields each new/changed content item as soon as it
|
|
258
|
+
* passes change detection. Lets the crawl and LLM phases overlap so total
|
|
259
|
+
* wall-clock time is roughly max(crawl, classify) instead of crawl + classify.
|
|
260
|
+
*
|
|
261
|
+
* The canonical implementation lives here; the array-returning
|
|
262
|
+
* `SetContentItemsToProcess` is a thin collector wrapper around this.
|
|
48
263
|
*/
|
|
49
|
-
async
|
|
50
|
-
const contentItemsToProcess = [];
|
|
51
|
-
// If content source parameters were provided, set them. Otherwise, use the default values.
|
|
264
|
+
async *streamContentItemsToProcess(contentSources) {
|
|
52
265
|
for (const contentSource of contentSources) {
|
|
266
|
+
// Reset instance state to defaults before applying per-source overrides.
|
|
267
|
+
// Without this, knobs set on the previous source would leak into the next.
|
|
268
|
+
this.applyDefaultCrawlSettings();
|
|
269
|
+
// First overlay: typed Configuration.Website sub-object (the structured editor
|
|
270
|
+
// in the form writes here). This is the canonical storage for new sources.
|
|
271
|
+
this.applyWebsiteConfigFromSource(contentSource);
|
|
272
|
+
// Second overlay: per-source ContentSourceParam rows. These win — legacy
|
|
273
|
+
// sources configured via the params grid (or anyone who wants a sharper
|
|
274
|
+
// per-instance override) keep working. We handle each known crawler key
|
|
275
|
+
// explicitly so the DB-stored string values get the right runtime type
|
|
276
|
+
// (the prior "bulk dynamic assign" path silently stuffed strings into
|
|
277
|
+
// number/boolean fields).
|
|
53
278
|
const contentSourceParamsMap = await this.engine.getContentSourceParams(contentSource, this.contextUser);
|
|
54
279
|
if (contentSourceParamsMap) {
|
|
55
|
-
|
|
56
|
-
contentSourceParamsMap.forEach((value, key) => {
|
|
57
|
-
if (key in this) {
|
|
58
|
-
this[key] = value;
|
|
59
|
-
}
|
|
60
|
-
});
|
|
280
|
+
this.overlayCrawlParamsFromMap(contentSourceParamsMap);
|
|
61
281
|
}
|
|
62
282
|
const contentSourceParams = {
|
|
63
283
|
contentSourceID: contentSource.ID,
|
|
@@ -68,19 +288,30 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
68
288
|
URL: contentSource.URL
|
|
69
289
|
};
|
|
70
290
|
try {
|
|
71
|
-
// All content items associated with the content source
|
|
72
291
|
const startURL = contentSourceParams.URL;
|
|
73
|
-
// root url should be set to this.RootURL if it exists, otherwise it should be set to the base path of the startURL.
|
|
74
292
|
const rootURL = this.RootURL ? this.RootURL : this.getBasePath(startURL);
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
293
|
+
const regex = (this.URLPattern && new RegExp(this.URLPattern)) || new RegExp('.*');
|
|
294
|
+
// Consume the URL stream lazily — each `link` arrives as the crawler
|
|
295
|
+
// discovers it, NOT after the full recursive crawl completes. The
|
|
296
|
+
// engine's LLM batcher accumulates items as they're yielded here, so
|
|
297
|
+
// tagging starts firing as soon as the first BatchSize items are ready
|
|
298
|
+
// (instead of waiting for the entire source to finish crawling).
|
|
299
|
+
let yieldedForSource = 0;
|
|
300
|
+
for await (const link of this.streamAllLinksFromContentSource(startURL, rootURL, regex)) {
|
|
301
|
+
try {
|
|
302
|
+
const item = await this.processSingleURL(link, contentSourceParams);
|
|
303
|
+
if (item) {
|
|
304
|
+
yieldedForSource++;
|
|
305
|
+
yield item;
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
catch (e) {
|
|
309
|
+
// Per-URL failures are isolated — log and keep going so a single
|
|
310
|
+
// bad page doesn't poison the rest of the source.
|
|
311
|
+
console.error(`[autotag-website] Failed to process URL ${link}:`, e);
|
|
312
|
+
}
|
|
81
313
|
}
|
|
82
|
-
|
|
83
|
-
// No content items found to process
|
|
314
|
+
if (yieldedForSource === 0) {
|
|
84
315
|
console.log(`No content items found to process for content source: ${contentSource.Get('Name')}`);
|
|
85
316
|
}
|
|
86
317
|
}
|
|
@@ -88,6 +319,20 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
88
319
|
console.error(`Failed to process content source: ${contentSource.Get('Name')}`);
|
|
89
320
|
}
|
|
90
321
|
}
|
|
322
|
+
}
|
|
323
|
+
/**
|
|
324
|
+
* Given a content source, retrieve all content items associated with the content sources.
|
|
325
|
+
* The content items are then processed to determine if they have been modified since the
|
|
326
|
+
* last time they were processed or if they are new content items.
|
|
327
|
+
*
|
|
328
|
+
* Backwards-compatible array form. Internally drains the streaming variant
|
|
329
|
+
* so there is exactly one implementation of the change-detection logic.
|
|
330
|
+
*/
|
|
331
|
+
async SetContentItemsToProcess(contentSources) {
|
|
332
|
+
const contentItemsToProcess = [];
|
|
333
|
+
for await (const item of this.streamContentItemsToProcess(contentSources)) {
|
|
334
|
+
contentItemsToProcess.push(item);
|
|
335
|
+
}
|
|
91
336
|
return contentItemsToProcess;
|
|
92
337
|
}
|
|
93
338
|
/**
|
|
@@ -99,61 +344,18 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
99
344
|
* @param contextUser
|
|
100
345
|
* @returns
|
|
101
346
|
*/
|
|
347
|
+
/**
|
|
348
|
+
* Backwards-compatible batch form: process an explicit list of URLs and
|
|
349
|
+
* return all new/changed content items as an array. New code should prefer
|
|
350
|
+
* `streamContentItemsToProcess` which pipelines into the LLM batcher.
|
|
351
|
+
*/
|
|
102
352
|
async SetNewAndModifiedContentItems(contentItemLinks, contentSourceParams, contextUser) {
|
|
103
353
|
const addedContentItems = [];
|
|
104
|
-
for (const
|
|
354
|
+
for (const link of contentItemLinks) {
|
|
105
355
|
try {
|
|
106
|
-
const
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
{
|
|
110
|
-
EntityName: 'MJ: Content Items',
|
|
111
|
-
ExtraFilter: `Checksum = '${newHash}'`,
|
|
112
|
-
ResultType: 'entity_object'
|
|
113
|
-
},
|
|
114
|
-
{
|
|
115
|
-
EntityName: 'MJ: Content Items',
|
|
116
|
-
ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND URL = '${contentItemLink}'`,
|
|
117
|
-
ResultType: 'entity_object'
|
|
118
|
-
}
|
|
119
|
-
], this.contextUser);
|
|
120
|
-
const contentItemResultsWithChecksum = results[0];
|
|
121
|
-
const contentItemResultsWithURL = results[1];
|
|
122
|
-
if (contentItemResultsWithChecksum.Success && contentItemResultsWithChecksum.Results.length) {
|
|
123
|
-
// We found the checksum so this content item has not changed since we last accessed it, do nothing
|
|
124
|
-
continue;
|
|
125
|
-
}
|
|
126
|
-
else if (contentItemResultsWithURL.Success && contentItemResultsWithURL.Results.length) {
|
|
127
|
-
// This content item already exists, update the hash and last updated date
|
|
128
|
-
const contentItemResult = contentItemResultsWithURL.Results[0];
|
|
129
|
-
const lastStoredHash = contentItemResult.Checksum;
|
|
130
|
-
if (lastStoredHash !== newHash) {
|
|
131
|
-
// This content item has changed since we last access it, update the hash and last updated date
|
|
132
|
-
const md = this.ProviderToUse;
|
|
133
|
-
const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
|
|
134
|
-
contentItem.Load(contentItemResult.ID);
|
|
135
|
-
contentItem.Checksum = newHash;
|
|
136
|
-
contentItem.Text = await this.parseWebPage(contentItemLink);
|
|
137
|
-
await contentItem.Save();
|
|
138
|
-
addedContentItems.push(contentItem); // Content item was modified, add to list
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
else {
|
|
142
|
-
// This content item does not exist, add it
|
|
143
|
-
const md = this.ProviderToUse;
|
|
144
|
-
const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
|
|
145
|
-
contentItem.ContentSourceID = contentSourceParams.contentSourceID;
|
|
146
|
-
contentItem.Name = this.getPathName(contentItemLink); // Will get overwritten by title later if it exists
|
|
147
|
-
contentItem.Description = this.engine.GetContentItemDescription(contentSourceParams);
|
|
148
|
-
contentItem.ContentTypeID = contentSourceParams.ContentTypeID;
|
|
149
|
-
contentItem.ContentFileTypeID = contentSourceParams.ContentFileTypeID;
|
|
150
|
-
contentItem.ContentSourceTypeID = contentSourceParams.ContentSourceTypeID;
|
|
151
|
-
contentItem.Checksum = await this.engine.getChecksumFromURL(contentItemLink);
|
|
152
|
-
contentItem.URL = contentItemLink;
|
|
153
|
-
contentItem.Text = await this.parseWebPage(contentItemLink);
|
|
154
|
-
await contentItem.Save();
|
|
155
|
-
addedContentItems.push(contentItem); // Content item was added, add to list
|
|
156
|
-
}
|
|
356
|
+
const item = await this.processSingleURL(link, contentSourceParams);
|
|
357
|
+
if (item)
|
|
358
|
+
addedContentItems.push(item);
|
|
157
359
|
}
|
|
158
360
|
catch (e) {
|
|
159
361
|
console.log(e);
|
|
@@ -161,6 +363,67 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
161
363
|
}
|
|
162
364
|
return addedContentItems;
|
|
163
365
|
}
|
|
366
|
+
/**
|
|
367
|
+
* Process one URL through the change-detection pipeline. Returns the
|
|
368
|
+
* MJContentItem if the page is new or changed (caller should hand it off
|
|
369
|
+
* to the LLM stage), or `null` if the page is unchanged.
|
|
370
|
+
*
|
|
371
|
+
* One axios.get per URL: the same response body provides both the
|
|
372
|
+
* change-detection hash and the page text. Compare with `byChecksum`
|
|
373
|
+
* scoped to the current ContentSource so identical boilerplate (404 pages,
|
|
374
|
+
* shared error templates) from a *different* source can't silently mask
|
|
375
|
+
* legitimate pages here.
|
|
376
|
+
*/
|
|
377
|
+
async processSingleURL(url, contentSourceParams) {
|
|
378
|
+
const { text, checksum: newHash } = await this.fetchAndExtract(url);
|
|
379
|
+
const rv = new RunView();
|
|
380
|
+
const results = await rv.RunViews([
|
|
381
|
+
{
|
|
382
|
+
EntityName: 'MJ: Content Items',
|
|
383
|
+
ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND Checksum = '${newHash}'`,
|
|
384
|
+
ResultType: 'entity_object'
|
|
385
|
+
},
|
|
386
|
+
{
|
|
387
|
+
EntityName: 'MJ: Content Items',
|
|
388
|
+
ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND URL = '${url}'`,
|
|
389
|
+
ResultType: 'entity_object'
|
|
390
|
+
}
|
|
391
|
+
], this.contextUser);
|
|
392
|
+
const byChecksum = results[0];
|
|
393
|
+
const byURL = results[1];
|
|
394
|
+
// Same content already in DB for this source — unchanged, skip.
|
|
395
|
+
if (byChecksum.Success && byChecksum.Results.length) {
|
|
396
|
+
return null;
|
|
397
|
+
}
|
|
398
|
+
// URL exists for this source but content has drifted — update in place.
|
|
399
|
+
if (byURL.Success && byURL.Results.length) {
|
|
400
|
+
const existing = byURL.Results[0];
|
|
401
|
+
if (existing.Checksum === newHash) {
|
|
402
|
+
return null;
|
|
403
|
+
}
|
|
404
|
+
const md = this.ProviderToUse;
|
|
405
|
+
const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
|
|
406
|
+
await contentItem.Load(existing.ID);
|
|
407
|
+
contentItem.Checksum = newHash;
|
|
408
|
+
contentItem.Text = text;
|
|
409
|
+
await contentItem.Save();
|
|
410
|
+
return contentItem;
|
|
411
|
+
}
|
|
412
|
+
// New URL — create the content item, reusing the already-fetched body.
|
|
413
|
+
const md = this.ProviderToUse;
|
|
414
|
+
const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
|
|
415
|
+
contentItem.ContentSourceID = contentSourceParams.contentSourceID;
|
|
416
|
+
contentItem.Name = this.getPathName(url); // Will get overwritten by title later if it exists
|
|
417
|
+
contentItem.Description = this.engine.GetContentItemDescription(contentSourceParams);
|
|
418
|
+
contentItem.ContentTypeID = contentSourceParams.ContentTypeID;
|
|
419
|
+
contentItem.ContentFileTypeID = contentSourceParams.ContentFileTypeID;
|
|
420
|
+
contentItem.ContentSourceTypeID = contentSourceParams.ContentSourceTypeID;
|
|
421
|
+
contentItem.Checksum = newHash;
|
|
422
|
+
contentItem.URL = url;
|
|
423
|
+
contentItem.Text = text;
|
|
424
|
+
await contentItem.Save();
|
|
425
|
+
return contentItem;
|
|
426
|
+
}
|
|
164
427
|
async fetchPageContent(url) {
|
|
165
428
|
const { data } = await axios.get(url);
|
|
166
429
|
return data;
|
|
@@ -180,16 +443,44 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
180
443
|
return text;
|
|
181
444
|
}
|
|
182
445
|
/**
|
|
183
|
-
*
|
|
184
|
-
*
|
|
185
|
-
*
|
|
446
|
+
* Pure helper: extract clean body text from raw HTML. No IO. Exposed as
|
|
447
|
+
* a protected method so subclasses and unit tests can exercise it without
|
|
448
|
+
* monkey-patching axios.
|
|
449
|
+
*/
|
|
450
|
+
extractTextFromHTML(html) {
|
|
451
|
+
const $ = cheerio.load(html);
|
|
452
|
+
const body = $('body')[0];
|
|
453
|
+
if (!body)
|
|
454
|
+
return '';
|
|
455
|
+
return this.getTextWithLineBreaks(body, $);
|
|
456
|
+
}
|
|
457
|
+
/**
|
|
458
|
+
* Fetch a URL once, extract clean text, and compute a stable checksum
|
|
459
|
+
* over that text. Returns both so callers don't have to fetch twice for
|
|
460
|
+
* "is this changed?" + "what's the content?".
|
|
461
|
+
*
|
|
462
|
+
* The checksum is computed over the EXTRACTED body text, NOT the raw
|
|
463
|
+
* HTML, because raw HTML routinely contains incidental changes (server
|
|
464
|
+
* timestamps, CSRF tokens, build hashes, ad rotators) that would
|
|
465
|
+
* falsely report a page as "changed" on every crawl. Hashing the
|
|
466
|
+
* extracted text is what users actually mean by "did the content
|
|
467
|
+
* change?"
|
|
468
|
+
*/
|
|
469
|
+
async fetchAndExtract(url) {
|
|
470
|
+
const { data } = await axios.get(url);
|
|
471
|
+
const text = this.extractTextFromHTML(String(data));
|
|
472
|
+
const checksum = await this.engine.getChecksumFromText(text);
|
|
473
|
+
return { text, checksum };
|
|
474
|
+
}
|
|
475
|
+
/**
|
|
476
|
+
* Given a URL, extracts text from a webpage. Kept for external callers
|
|
477
|
+
* that just want the text — internal change-detection now uses
|
|
478
|
+
* `fetchAndExtract` to avoid redundant fetches.
|
|
186
479
|
*/
|
|
187
480
|
async parseWebPage(url) {
|
|
188
481
|
try {
|
|
189
482
|
const pageContent = await this.fetchPageContent(url);
|
|
190
|
-
|
|
191
|
-
const text = this.getTextWithLineBreaks($('body')[0], $);
|
|
192
|
-
return text;
|
|
483
|
+
return this.extractTextFromHTML(pageContent);
|
|
193
484
|
}
|
|
194
485
|
catch (error) {
|
|
195
486
|
console.error(`Error processing ${url}:`, error);
|
|
@@ -197,50 +488,67 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
197
488
|
}
|
|
198
489
|
}
|
|
199
490
|
/**
|
|
200
|
-
*
|
|
201
|
-
*
|
|
202
|
-
*
|
|
203
|
-
*
|
|
204
|
-
*
|
|
491
|
+
* Streaming variant: yields each newly-discovered URL as the crawler finds it,
|
|
492
|
+
* so downstream consumers (the content-item streamer that feeds the LLM
|
|
493
|
+
* batcher) can start working before discovery completes. This is the
|
|
494
|
+
* canonical implementation; `getAllLinksFromContentSource` below is a
|
|
495
|
+
* backwards-compatible array-collecting wrapper.
|
|
205
496
|
*/
|
|
206
|
-
async
|
|
497
|
+
async *streamAllLinksFromContentSource(url, rootURL, regex) {
|
|
498
|
+
// Start each content source with a clean visited set — otherwise URLs
|
|
499
|
+
// found for one source silently get deduped away when the next source
|
|
500
|
+
// is crawled.
|
|
501
|
+
this.visitedURLs = new Set();
|
|
502
|
+
// Normalize the seed URL once so all downstream comparisons share the same form.
|
|
503
|
+
const seedURL = this.normalizeURL(url);
|
|
207
504
|
try {
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
return Array.from(this.visitedURLs);
|
|
505
|
+
yield* this.streamLowerLevelLinks(seedURL, rootURL, this.MaxDepth, new Set(), regex);
|
|
506
|
+
yield* this.streamTopLevelLinks(seedURL, this.getBasePath(seedURL), regex);
|
|
211
507
|
}
|
|
212
508
|
catch (e) {
|
|
213
509
|
console.error(`Failed to get links from ${url}`);
|
|
214
|
-
return [];
|
|
215
510
|
}
|
|
216
511
|
}
|
|
217
512
|
/**
|
|
218
|
-
*
|
|
219
|
-
* @param url
|
|
220
|
-
* @param rootURL
|
|
221
|
-
* @param visitedURLs
|
|
222
|
-
* @returns
|
|
513
|
+
* Backwards-compatible array form. Drains the streaming variant.
|
|
223
514
|
*/
|
|
224
|
-
async
|
|
515
|
+
async getAllLinksFromContentSource(url, rootURL, regex) {
|
|
516
|
+
const collected = [];
|
|
517
|
+
for await (const link of this.streamAllLinksFromContentSource(url, rootURL, regex)) {
|
|
518
|
+
collected.push(link);
|
|
519
|
+
}
|
|
520
|
+
return collected;
|
|
521
|
+
}
|
|
522
|
+
/**
|
|
523
|
+
* Streaming variant of getTopLevelLinks — yields each URL it adds to the
|
|
524
|
+
* visited set so the LLM batcher gets fed in real time.
|
|
525
|
+
*/
|
|
526
|
+
async *streamTopLevelLinks(url, rootURL, regex) {
|
|
225
527
|
if (!this.CrawlOtherSitesInTopLevelDomain) {
|
|
226
|
-
|
|
528
|
+
// Seed URL still gets yielded so the processSingleURL pipeline runs on it.
|
|
529
|
+
if (!this.visitedURLs.has(url)) {
|
|
530
|
+
this.visitedURLs.add(url);
|
|
531
|
+
yield url;
|
|
532
|
+
}
|
|
227
533
|
return;
|
|
228
534
|
}
|
|
229
|
-
// If we have already visited this URL,
|
|
535
|
+
// If we have already visited this URL, nothing to do.
|
|
230
536
|
if (this.visitedURLs.has(url) || !await this.urlIsValid(url) || this.isHighestDomain(url)) {
|
|
231
537
|
return;
|
|
232
538
|
}
|
|
233
539
|
this.visitedURLs.add(url);
|
|
540
|
+
yield url;
|
|
541
|
+
const discovered = [];
|
|
234
542
|
try {
|
|
235
543
|
const { data } = await axios.get(url);
|
|
236
544
|
const $ = cheerio.load(data);
|
|
237
|
-
// Get all links on the page for the current URL
|
|
238
545
|
$('a').each((_, element) => {
|
|
239
546
|
const link = $(element).attr('href');
|
|
240
547
|
if (link) {
|
|
241
|
-
const newURL = new URL(link, url).href;
|
|
242
|
-
if (newURL.startsWith(rootURL) && !this.visitedURLs.has(newURL)) {
|
|
548
|
+
const newURL = this.normalizeURL(new URL(link, url).href);
|
|
549
|
+
if (newURL.startsWith(rootURL) && !this.visitedURLs.has(newURL) && regex.test(newURL)) {
|
|
243
550
|
this.visitedURLs.add(newURL);
|
|
551
|
+
discovered.push(newURL);
|
|
244
552
|
}
|
|
245
553
|
}
|
|
246
554
|
});
|
|
@@ -250,6 +558,19 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
250
558
|
console.error(`Failed to get links from ${url}`);
|
|
251
559
|
return;
|
|
252
560
|
}
|
|
561
|
+
// Yield the page's links AFTER the await/delay completes so they're emitted
|
|
562
|
+
// outside the cheerio sync callback (which can't yield).
|
|
563
|
+
for (const newURL of discovered) {
|
|
564
|
+
yield newURL;
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
/**
|
|
568
|
+
* Backwards-compatible void form. Drains the streaming variant (links go
|
|
569
|
+
* into `visitedURLs` as a side effect of streamTopLevelLinks).
|
|
570
|
+
*/
|
|
571
|
+
async getTopLevelLinks(url, rootURL, regex) {
|
|
572
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
573
|
+
for await (const _ of this.streamTopLevelLinks(url, rootURL, regex)) { /* drain */ }
|
|
253
574
|
}
|
|
254
575
|
/**
|
|
255
576
|
* Simple check to see if the URL is at the highest level domain.
|
|
@@ -288,6 +609,31 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
288
609
|
throw e;
|
|
289
610
|
}
|
|
290
611
|
}
|
|
612
|
+
/**
|
|
613
|
+
* Normalize a URL for use as a dedup key in `visitedURLs`. Conservative
|
|
614
|
+
* normalization that catches the common variations without risking the merge
|
|
615
|
+
* of two semantically distinct pages:
|
|
616
|
+
* - drops the fragment (always client-side per RFC 3986)
|
|
617
|
+
* - collapses trailing slash on the path (except the root "/")
|
|
618
|
+
* - sorts query parameters for stable equality
|
|
619
|
+
* - host is already lower-cased by URL parser
|
|
620
|
+
* Path case is intentionally preserved — RFC 3986 says paths are case-sensitive
|
|
621
|
+
* and some servers (wikis, certain Linux file fronts) actually treat them that way.
|
|
622
|
+
*/
|
|
623
|
+
normalizeURL(href) {
|
|
624
|
+
try {
|
|
625
|
+
const u = new URL(href);
|
|
626
|
+
u.hash = '';
|
|
627
|
+
if (u.pathname.length > 1 && u.pathname.endsWith('/')) {
|
|
628
|
+
u.pathname = u.pathname.slice(0, -1);
|
|
629
|
+
}
|
|
630
|
+
u.searchParams.sort();
|
|
631
|
+
return u.href;
|
|
632
|
+
}
|
|
633
|
+
catch {
|
|
634
|
+
return href;
|
|
635
|
+
}
|
|
636
|
+
}
|
|
291
637
|
async urlIsValid(url) {
|
|
292
638
|
try {
|
|
293
639
|
const response = await axios.head(url);
|
|
@@ -299,53 +645,66 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
|
|
|
299
645
|
}
|
|
300
646
|
}
|
|
301
647
|
/**
|
|
302
|
-
*
|
|
303
|
-
*
|
|
304
|
-
*
|
|
305
|
-
*
|
|
306
|
-
*
|
|
307
|
-
*
|
|
648
|
+
* Streaming variant of getLowerLevelLinks. Yields each newly-discovered URL
|
|
649
|
+
* the moment it's added to the visited set, then recurses depth-first into
|
|
650
|
+
* children. This is the canonical implementation — the LLM batcher gets
|
|
651
|
+
* fed in real time during crawl instead of having to wait for the entire
|
|
652
|
+
* recursive discovery to complete.
|
|
653
|
+
*
|
|
654
|
+
* `getLowerLevelLinks` below is a thin backwards-compatible wrapper that
|
|
655
|
+
* drains the stream into a Set.
|
|
308
656
|
*/
|
|
309
|
-
async
|
|
657
|
+
async *streamLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
|
|
310
658
|
try {
|
|
311
659
|
console.log(`Scraping ${url}`);
|
|
312
|
-
//
|
|
313
|
-
|
|
314
|
-
|
|
660
|
+
// The Number.isFinite guard protects against accidental NaN/undefined
|
|
661
|
+
// arriving as crawlDepth — without it, `undefined < 0` is false and the
|
|
662
|
+
// recursion runs without a depth ceiling.
|
|
663
|
+
if (scrapedURLs.has(url) || await this.urlIsValid(url) === false || !Number.isFinite(crawlDepth) || crawlDepth < 0 || !this.CrawlSitesInLowerLevelDomain) {
|
|
664
|
+
return;
|
|
315
665
|
}
|
|
316
|
-
|
|
317
|
-
const extractedLinks = new Set(); // Links extracted from the input URL
|
|
666
|
+
const extractedLinks = [];
|
|
318
667
|
const { data } = await axios.get(url);
|
|
319
668
|
const $ = cheerio.load(data);
|
|
320
|
-
// Get all links on the page for the current URL
|
|
321
669
|
$('a').each((_, element) => {
|
|
322
670
|
const link = $(element).attr('href');
|
|
323
671
|
if (link) {
|
|
324
|
-
const newURL = new URL(link, url).href;
|
|
672
|
+
const newURL = this.normalizeURL(new URL(link, url).href);
|
|
325
673
|
if (newURL.startsWith(rootURL) && newURL !== url && !this.visitedURLs.has(newURL) && regex.test(newURL)) {
|
|
326
|
-
extractedLinks.
|
|
674
|
+
extractedLinks.push(newURL);
|
|
327
675
|
this.visitedURLs.add(newURL);
|
|
328
676
|
}
|
|
329
677
|
}
|
|
330
678
|
});
|
|
331
679
|
await this.delay(1000); // Delay to prevent rate limiting
|
|
332
680
|
scrapedURLs.add(url);
|
|
333
|
-
//
|
|
334
|
-
|
|
335
|
-
|
|
681
|
+
// Yield each newly-discovered URL outside the (sync) cheerio callback.
|
|
682
|
+
// Consumers start processing these immediately while we recurse.
|
|
683
|
+
for (const newURL of extractedLinks) {
|
|
684
|
+
yield newURL;
|
|
336
685
|
}
|
|
686
|
+
// Depth limit — discover this page's links but don't recurse.
|
|
687
|
+
if (crawlDepth === 0)
|
|
688
|
+
return;
|
|
337
689
|
for (const subLink of extractedLinks) {
|
|
338
|
-
|
|
339
|
-
const lowerLevelLinks = await this.getLowerLevelLinks(subLink, rootURL, crawlDepth - 1, scrapedURLs, regex);
|
|
340
|
-
combinedLinks = new Set([...extractedLinks, ...lowerLevelLinks]);
|
|
690
|
+
yield* this.streamLowerLevelLinks(subLink, rootURL, crawlDepth - 1, scrapedURLs, regex);
|
|
341
691
|
}
|
|
342
|
-
return combinedLinks;
|
|
343
692
|
}
|
|
344
693
|
catch (e) {
|
|
345
694
|
console.error(`Failed to get links from ${url}`);
|
|
346
|
-
return new Set();
|
|
347
695
|
}
|
|
348
696
|
}
|
|
697
|
+
/**
|
|
698
|
+
* Backwards-compatible Set form. Drains the streaming variant; URLs end up
|
|
699
|
+
* in `this.visitedURLs` as a side effect of streamLowerLevelLinks.
|
|
700
|
+
*/
|
|
701
|
+
async getLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
|
|
702
|
+
const out = new Set();
|
|
703
|
+
for await (const link of this.streamLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex)) {
|
|
704
|
+
out.add(link);
|
|
705
|
+
}
|
|
706
|
+
return out;
|
|
707
|
+
}
|
|
349
708
|
async delay(ms) {
|
|
350
709
|
return new Promise(resolve => setTimeout(resolve, ms));
|
|
351
710
|
}
|