@memberjunction/content-autotagging 5.37.0 → 5.39.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,8 +9,9 @@ var __metadata = (this && this.__metadata) || function (k, v) {
9
9
  };
10
10
  import { AutotagBase } from '../../Core/index.js';
11
11
  import { AutotagBaseEngine } from '../../Engine/index.js';
12
- import { RegisterClass } from '@memberjunction/global';
13
- import { RunView } from '@memberjunction/core';
12
+ import { RunBudget } from '../../Engine/generic/RunBudget.js';
13
+ import { RegisterClass, NormalizeUUID } from '@memberjunction/global';
14
+ import { RunView, LogStatus } from '@memberjunction/core';
14
15
  import * as cheerio from 'cheerio';
15
16
  import axios from 'axios';
16
17
  import { URL } from 'url';
@@ -19,9 +20,187 @@ dotenv.config({ quiet: true });
19
20
  let AutotagWebsite = class AutotagWebsite extends AutotagBase {
20
21
  constructor() {
21
22
  super();
23
+ // Sensible defaults — overridable per content source via ContentSourceParam rows.
24
+ // CrawlSitesInLowerLevelDomain=true + MaxDepth=2 means we crawl the start URL
25
+ // plus two levels of in-domain links by default (~root + sections + content pages).
26
+ // CrawlOtherSitesInTopLevelDomain stays false to avoid accidentally fanning out
27
+ // across sibling paths of the seed URL unless explicitly opted in.
28
+ this.CrawlOtherSitesInTopLevelDomain = false;
29
+ this.CrawlSitesInLowerLevelDomain = true;
30
+ this.MaxDepth = 2;
31
+ /**
32
+ * Per-source RunBudget tracker, keyed by normalized source ID. Items
33
+ * processed in each batch are tallied against the budget of the source
34
+ * they belong to; when any budget exhausts, the engine's OnAfterBatch
35
+ * gate returns `continue:false` and the run pauses gracefully. Next
36
+ * invocation will re-crawl, change-detection will skip the already-
37
+ * processed pages, and the remaining ones get processed.
38
+ */
39
+ this.sourceBudgetMap = new Map();
22
40
  this.engine = AutotagBaseEngine.Instance;
23
41
  this.visitedURLs = new Set();
24
42
  }
43
+ /**
44
+ * Reset crawl-related instance fields back to the class defaults. Called at the
45
+ * start of each content source so prior-source overrides don't leak forward.
46
+ * URLPattern and RootURL default to undefined — derived later if unset.
47
+ */
48
+ applyDefaultCrawlSettings() {
49
+ this.CrawlOtherSitesInTopLevelDomain = false;
50
+ this.CrawlSitesInLowerLevelDomain = true;
51
+ this.MaxDepth = 2;
52
+ this.URLPattern = undefined;
53
+ this.RootURL = undefined;
54
+ }
55
+ /**
56
+ * Apply the typed `Configuration.Website` sub-object (if present) to this
57
+ * crawler instance. Each field is optional — unset values leave the existing
58
+ * (default) instance value intact.
59
+ *
60
+ * NOTE: pluggability — today this is hard-coded for AutotagWebsite. Once we
61
+ * have more source types that need typed per-instance settings (RSS, Cloud
62
+ * Storage, etc.), this pattern should be promoted to an
63
+ * `IConfigurableContentSource<TConfig>` interface where each subclass declares
64
+ * its typed config sub-object key and shape. For now this is the canonical
65
+ * shape; other autotaggers can copy it when they need typed knobs.
66
+ */
67
+ /**
68
+ * Apply per-source `ContentSourceParam` rows to this crawler instance. Values
69
+ * stored in the DB are strings, so we coerce per-key to the right runtime type
70
+ * instead of bulk-assigning (which previously stuffed strings into number /
71
+ * boolean fields and relied on JS coercion at use sites).
72
+ *
73
+ * Unknown keys are silently ignored — same gate as the prior `if (key in this)`
74
+ * check, just made explicit.
75
+ */
76
+ overlayCrawlParamsFromMap(params) {
77
+ const maxDepth = params.get('MaxDepth');
78
+ if (maxDepth != null) {
79
+ const n = this.coerceNumber(maxDepth);
80
+ if (n != null)
81
+ this.MaxDepth = n;
82
+ }
83
+ const lower = params.get('CrawlSitesInLowerLevelDomain');
84
+ if (lower != null)
85
+ this.CrawlSitesInLowerLevelDomain = this.coerceBoolean(lower);
86
+ const other = params.get('CrawlOtherSitesInTopLevelDomain');
87
+ if (other != null)
88
+ this.CrawlOtherSitesInTopLevelDomain = this.coerceBoolean(other);
89
+ const pattern = params.get('URLPattern');
90
+ // RegExp values come pre-compiled by the engine — store the source text
91
+ // so the crawler's `new RegExp(this.URLPattern)` call stays consistent.
92
+ if (pattern instanceof RegExp) {
93
+ this.URLPattern = pattern.source;
94
+ }
95
+ else if (typeof pattern === 'string' && pattern.length > 0) {
96
+ this.URLPattern = pattern;
97
+ }
98
+ const root = params.get('RootURL');
99
+ if (typeof root === 'string' && root.length > 0)
100
+ this.RootURL = root;
101
+ }
102
+ coerceBoolean(value) {
103
+ if (typeof value === 'boolean')
104
+ return value;
105
+ if (typeof value === 'string') {
106
+ const trimmed = value.trim().toLowerCase();
107
+ return trimmed === 'true' || trimmed === '1' || trimmed === 'yes';
108
+ }
109
+ return false;
110
+ }
111
+ applyWebsiteConfigFromSource(source) {
112
+ const w = source.ConfigurationObject?.Website;
113
+ if (!w)
114
+ return;
115
+ if (typeof w.MaxDepth === 'number' && Number.isFinite(w.MaxDepth))
116
+ this.MaxDepth = w.MaxDepth;
117
+ if (typeof w.CrawlSitesInLowerLevelDomain === 'boolean')
118
+ this.CrawlSitesInLowerLevelDomain = w.CrawlSitesInLowerLevelDomain;
119
+ if (typeof w.CrawlOtherSitesInTopLevelDomain === 'boolean')
120
+ this.CrawlOtherSitesInTopLevelDomain = w.CrawlOtherSitesInTopLevelDomain;
121
+ if (typeof w.URLPattern === 'string' && w.URLPattern.length > 0)
122
+ this.URLPattern = w.URLPattern;
123
+ if (typeof w.RootURL === 'string' && w.RootURL.length > 0)
124
+ this.RootURL = w.RootURL;
125
+ }
126
+ /**
127
+ * Build a per-source RunBudget map from each source's ConfigurationObject.
128
+ * Sources with no budget knobs set still get a RunBudget entry (with all
129
+ * limits = null) so the OnAfterBatch hook can update item counts uniformly.
130
+ *
131
+ * Per-source overrides via ContentSourceParam rows (e.g., MaxItemsPerRun
132
+ * stored as a param) take precedence over the ConfigurationObject value.
133
+ */
134
+ async setupRunBudgets(contentSources) {
135
+ this.sourceBudgetMap = new Map();
136
+ for (const source of contentSources) {
137
+ const id = NormalizeUUID(source.ID);
138
+ const cfg = source.ConfigurationObject;
139
+ const params = await this.engine.getContentSourceParams(source, this.contextUser);
140
+ // ContentSourceParam override beats ConfigurationObject — that
141
+ // lets the per-source-instance UI knob win over the global
142
+ // ContentSource defaults.
143
+ const paramMaxItems = params?.get('MaxItemsPerRun');
144
+ const paramMaxTokens = params?.get('MaxTokensPerRun');
145
+ const paramMaxCost = params?.get('MaxCostPerRun');
146
+ this.sourceBudgetMap.set(id, new RunBudget({
147
+ MaxItemsPerRun: this.coerceNumber(paramMaxItems) ?? this.readConfigNumber(cfg, 'MaxItemsPerRun'),
148
+ MaxNewTagsPerRun: this.readConfigNumber(cfg, 'MaxNewTagsPerRun'),
149
+ MaxNewTagsPerItem: this.readConfigNumber(cfg, 'MaxNewTagsPerItem'),
150
+ MaxTokensPerRun: this.coerceNumber(paramMaxTokens) ?? this.readConfigNumber(cfg, 'MaxTokensPerRun'),
151
+ MaxCostPerRun: this.coerceNumber(paramMaxCost) ?? this.readConfigNumber(cfg, 'MaxCostPerRun'),
152
+ }));
153
+ }
154
+ }
155
+ coerceNumber(value) {
156
+ if (value == null)
157
+ return null;
158
+ if (typeof value === 'number')
159
+ return Number.isFinite(value) ? value : null;
160
+ if (typeof value === 'string') {
161
+ const n = Number(value);
162
+ return Number.isFinite(n) ? n : null;
163
+ }
164
+ return null;
165
+ }
166
+ /**
167
+ * Safely read a numeric budget knob from the typed configuration object.
168
+ * Returns null when the field is unset or not a finite number.
169
+ */
170
+ readConfigNumber(cfg, key) {
171
+ if (!cfg)
172
+ return null;
173
+ return this.coerceNumber(cfg[key]);
174
+ }
175
+ /**
176
+ * Install the engine's OnAfterBatch hook so each batch's items are
177
+ * counted against the budget of the source they belong to. Returns
178
+ * `continue:false` from the gate when any source's budget exhausts,
179
+ * which the engine then translates into a graceful pause.
180
+ */
181
+ installBudgetGate() {
182
+ this.engine.OnAfterBatch = async (batch, _totalProcessed) => {
183
+ // Tally items per source within this batch.
184
+ const perSourceCounts = new Map();
185
+ for (const item of batch) {
186
+ if (!item.ContentSourceID)
187
+ continue;
188
+ const id = NormalizeUUID(item.ContentSourceID);
189
+ perSourceCounts.set(id, (perSourceCounts.get(id) ?? 0) + 1);
190
+ }
191
+ for (const [id, count] of perSourceCounts) {
192
+ const budget = this.sourceBudgetMap.get(id);
193
+ if (!budget)
194
+ continue;
195
+ budget.recordItemsProcessed(count);
196
+ const verdict = budget.checkBudgets();
197
+ if (!verdict.ok) {
198
+ return { continue: false, reason: `${verdict.reason}: ${verdict.details ?? ''}` };
199
+ }
200
+ }
201
+ return { continue: true };
202
+ };
203
+ }
25
204
  getContextUser() {
26
205
  return this.contextUser;
27
206
  }
@@ -36,28 +215,69 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
36
215
  this.contextUser = contextUser;
37
216
  this.contentSourceTypeID = this.engine.SetSubclassContentSourceType('Website');
38
217
  const contentSources = await this.engine.getAllContentSources(this.contextUser, this.contentSourceTypeID);
39
- const contentItemsToProcess = await this.SetContentItemsToProcess(contentSources);
40
- await this.engine.ExtractTextAndProcessWithLLM(contentItemsToProcess, this.contextUser, undefined, undefined, onProgress);
41
- return contentItemsToProcess.length;
218
+ // Per-source budget setup — produces a RunBudget for each content
219
+ // source and installs the OnAfterBatch gate on the engine so the
220
+ // run pauses gracefully when any source exhausts its MaxItemsPerRun /
221
+ // tokens / cost / tag budget.
222
+ await this.setupRunBudgets(contentSources);
223
+ this.installBudgetGate();
224
+ // Stream content items source-by-source into the LLM batcher. The
225
+ // crawl phase produces items as soon as they pass change-detection,
226
+ // and the LLM phase consumes them in batches without waiting for the
227
+ // last source to finish crawling. Wall-clock time becomes
228
+ // max(crawl, classify) + a small buffer instead of crawl + classify.
229
+ let itemsYielded = 0;
230
+ const streamSource = this;
231
+ const itemStream = (async function* () {
232
+ for await (const item of streamSource.streamContentItemsToProcess(contentSources)) {
233
+ itemsYielded++;
234
+ yield item;
235
+ }
236
+ })();
237
+ try {
238
+ await this.engine.ExtractTextAndProcessWithLLM(itemStream, this.contextUser, undefined, undefined, onProgress);
239
+ }
240
+ finally {
241
+ // Clean up engine state — leaving stale hooks around would leak
242
+ // budget state into the next Autotag invocation on a shared
243
+ // engine singleton.
244
+ this.engine.OnAfterBatch = null;
245
+ }
246
+ // Surface per-source budget pause reasons in the log so operators can
247
+ // see why a run stopped short.
248
+ for (const [sourceID, budget] of this.sourceBudgetMap) {
249
+ const verdict = budget.checkBudgets();
250
+ if (!verdict.ok) {
251
+ LogStatus(`[autotag-website] Source ${sourceID} reached budget: ${verdict.reason} — ${verdict.details ?? ''}`);
252
+ }
253
+ }
254
+ return itemsYielded;
42
255
  }
43
256
  /**
44
- * Given a content source, retrieve all content items associated with the content sources.
45
- * The content items are then processed to determine if they have been modified since the last time they were processed or if they are new content items.
46
- * @param contentSource
47
- * @returns
257
+ * Streaming variant: yields each new/changed content item as soon as it
258
+ * passes change detection. Lets the crawl and LLM phases overlap so total
259
+ * wall-clock time is roughly max(crawl, classify) instead of crawl + classify.
260
+ *
261
+ * The canonical implementation lives here; the array-returning
262
+ * `SetContentItemsToProcess` is a thin collector wrapper around this.
48
263
  */
49
- async SetContentItemsToProcess(contentSources) {
50
- const contentItemsToProcess = [];
51
- // If content source parameters were provided, set them. Otherwise, use the default values.
264
+ async *streamContentItemsToProcess(contentSources) {
52
265
  for (const contentSource of contentSources) {
266
+ // Reset instance state to defaults before applying per-source overrides.
267
+ // Without this, knobs set on the previous source would leak into the next.
268
+ this.applyDefaultCrawlSettings();
269
+ // First overlay: typed Configuration.Website sub-object (the structured editor
270
+ // in the form writes here). This is the canonical storage for new sources.
271
+ this.applyWebsiteConfigFromSource(contentSource);
272
+ // Second overlay: per-source ContentSourceParam rows. These win — legacy
273
+ // sources configured via the params grid (or anyone who wants a sharper
274
+ // per-instance override) keep working. We handle each known crawler key
275
+ // explicitly so the DB-stored string values get the right runtime type
276
+ // (the prior "bulk dynamic assign" path silently stuffed strings into
277
+ // number/boolean fields).
53
278
  const contentSourceParamsMap = await this.engine.getContentSourceParams(contentSource, this.contextUser);
54
279
  if (contentSourceParamsMap) {
55
- // Override defaults with content source specific params
56
- contentSourceParamsMap.forEach((value, key) => {
57
- if (key in this) {
58
- this[key] = value;
59
- }
60
- });
280
+ this.overlayCrawlParamsFromMap(contentSourceParamsMap);
61
281
  }
62
282
  const contentSourceParams = {
63
283
  contentSourceID: contentSource.ID,
@@ -68,19 +288,30 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
68
288
  URL: contentSource.URL
69
289
  };
70
290
  try {
71
- // All content items associated with the content source
72
291
  const startURL = contentSourceParams.URL;
73
- // root url should be set to this.RootURL if it exists, otherwise it should be set to the base path of the startURL.
74
292
  const rootURL = this.RootURL ? this.RootURL : this.getBasePath(startURL);
75
- // regex should be set to this.URLPattern if it exists, otherwise it should be set to match any URL.
76
- const regex = this.URLPattern && new RegExp(this.URLPattern) || new RegExp('.*');
77
- const allContentItemLinks = await this.getAllLinksFromContentSource(startURL, rootURL, regex);
78
- const contentItems = await this.SetNewAndModifiedContentItems(allContentItemLinks, contentSourceParams, this.contextUser);
79
- if (contentItems && contentItems.length > 0) {
80
- contentItemsToProcess.push(...contentItems);
293
+ const regex = (this.URLPattern && new RegExp(this.URLPattern)) || new RegExp('.*');
294
+ // Consume the URL stream lazily — each `link` arrives as the crawler
295
+ // discovers it, NOT after the full recursive crawl completes. The
296
+ // engine's LLM batcher accumulates items as they're yielded here, so
297
+ // tagging starts firing as soon as the first BatchSize items are ready
298
+ // (instead of waiting for the entire source to finish crawling).
299
+ let yieldedForSource = 0;
300
+ for await (const link of this.streamAllLinksFromContentSource(startURL, rootURL, regex)) {
301
+ try {
302
+ const item = await this.processSingleURL(link, contentSourceParams);
303
+ if (item) {
304
+ yieldedForSource++;
305
+ yield item;
306
+ }
307
+ }
308
+ catch (e) {
309
+ // Per-URL failures are isolated — log and keep going so a single
310
+ // bad page doesn't poison the rest of the source.
311
+ console.error(`[autotag-website] Failed to process URL ${link}:`, e);
312
+ }
81
313
  }
82
- else {
83
- // No content items found to process
314
+ if (yieldedForSource === 0) {
84
315
  console.log(`No content items found to process for content source: ${contentSource.Get('Name')}`);
85
316
  }
86
317
  }
@@ -88,6 +319,20 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
88
319
  console.error(`Failed to process content source: ${contentSource.Get('Name')}`);
89
320
  }
90
321
  }
322
+ }
323
+ /**
324
+ * Given a list of content sources, retrieve all content items associated with them.
325
+ * The content items are then processed to determine if they have been modified since the
326
+ * last time they were processed or if they are new content items.
327
+ *
328
+ * Backwards-compatible array form. Internally drains the streaming variant
329
+ * so there is exactly one implementation of the change-detection logic.
330
+ */
331
+ async SetContentItemsToProcess(contentSources) {
332
+ const contentItemsToProcess = [];
333
+ for await (const item of this.streamContentItemsToProcess(contentSources)) {
334
+ contentItemsToProcess.push(item);
335
+ }
91
336
  return contentItemsToProcess;
92
337
  }
93
338
  /**
@@ -99,61 +344,18 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
99
344
  * @param contextUser
100
345
  * @returns
101
346
  */
347
+ /**
348
+ * Backwards-compatible batch form: process an explicit list of URLs and
349
+ * return all new/changed content items as an array. New code should prefer
350
+ * `streamContentItemsToProcess` which pipelines into the LLM batcher.
351
+ */
102
352
  async SetNewAndModifiedContentItems(contentItemLinks, contentSourceParams, contextUser) {
103
353
  const addedContentItems = [];
104
- for (const contentItemLink of contentItemLinks) {
354
+ for (const link of contentItemLinks) {
105
355
  try {
106
- const newHash = await this.engine.getChecksumFromURL(contentItemLink);
107
- const rv = new RunView();
108
- const results = await rv.RunViews([
109
- {
110
- EntityName: 'MJ: Content Items',
111
- ExtraFilter: `Checksum = '${newHash}'`,
112
- ResultType: 'entity_object'
113
- },
114
- {
115
- EntityName: 'MJ: Content Items',
116
- ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND URL = '${contentItemLink}'`,
117
- ResultType: 'entity_object'
118
- }
119
- ], this.contextUser);
120
- const contentItemResultsWithChecksum = results[0];
121
- const contentItemResultsWithURL = results[1];
122
- if (contentItemResultsWithChecksum.Success && contentItemResultsWithChecksum.Results.length) {
123
- // We found the checksum so this content item has not changed since we last accessed it, do nothing
124
- continue;
125
- }
126
- else if (contentItemResultsWithURL.Success && contentItemResultsWithURL.Results.length) {
127
- // This content item already exists, update the hash and last updated date
128
- const contentItemResult = contentItemResultsWithURL.Results[0];
129
- const lastStoredHash = contentItemResult.Checksum;
130
- if (lastStoredHash !== newHash) {
131
- // This content item has changed since we last access it, update the hash and last updated date
132
- const md = this.ProviderToUse;
133
- const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
134
- contentItem.Load(contentItemResult.ID);
135
- contentItem.Checksum = newHash;
136
- contentItem.Text = await this.parseWebPage(contentItemLink);
137
- await contentItem.Save();
138
- addedContentItems.push(contentItem); // Content item was modified, add to list
139
- }
140
- }
141
- else {
142
- // This content item does not exist, add it
143
- const md = this.ProviderToUse;
144
- const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
145
- contentItem.ContentSourceID = contentSourceParams.contentSourceID;
146
- contentItem.Name = this.getPathName(contentItemLink); // Will get overwritten by title later if it exists
147
- contentItem.Description = this.engine.GetContentItemDescription(contentSourceParams);
148
- contentItem.ContentTypeID = contentSourceParams.ContentTypeID;
149
- contentItem.ContentFileTypeID = contentSourceParams.ContentFileTypeID;
150
- contentItem.ContentSourceTypeID = contentSourceParams.ContentSourceTypeID;
151
- contentItem.Checksum = await this.engine.getChecksumFromURL(contentItemLink);
152
- contentItem.URL = contentItemLink;
153
- contentItem.Text = await this.parseWebPage(contentItemLink);
154
- await contentItem.Save();
155
- addedContentItems.push(contentItem); // Content item was added, add to list
156
- }
356
+ const item = await this.processSingleURL(link, contentSourceParams);
357
+ if (item)
358
+ addedContentItems.push(item);
157
359
  }
158
360
  catch (e) {
159
361
  console.log(e);
@@ -161,6 +363,67 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
161
363
  }
162
364
  return addedContentItems;
163
365
  }
366
+ /**
367
+ * Process one URL through the change-detection pipeline. Returns the
368
+ * MJContentItem if the page is new or changed (caller should hand it off
369
+ * to the LLM stage), or `null` if the page is unchanged.
370
+ *
371
+ * One axios.get per URL: the same response body provides both the
372
+ * change-detection hash and the page text. The `byChecksum` lookup is
373
+ * scoped to the current ContentSource so identical boilerplate (404 pages,
374
+ * shared error templates) from a *different* source can't silently mask
375
+ * legitimate pages here.
376
+ */
377
+ async processSingleURL(url, contentSourceParams) {
378
+ const { text, checksum: newHash } = await this.fetchAndExtract(url);
379
+ const rv = new RunView();
380
+ const results = await rv.RunViews([
381
+ {
382
+ EntityName: 'MJ: Content Items',
383
+ ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND Checksum = '${newHash}'`,
384
+ ResultType: 'entity_object'
385
+ },
386
+ {
387
+ EntityName: 'MJ: Content Items',
388
+ ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND URL = '${url}'`,
389
+ ResultType: 'entity_object'
390
+ }
391
+ ], this.contextUser);
392
+ const byChecksum = results[0];
393
+ const byURL = results[1];
394
+ // Same content already in DB for this source — unchanged, skip.
395
+ if (byChecksum.Success && byChecksum.Results.length) {
396
+ return null;
397
+ }
398
+ // URL exists for this source but content has drifted — update in place.
399
+ if (byURL.Success && byURL.Results.length) {
400
+ const existing = byURL.Results[0];
401
+ if (existing.Checksum === newHash) {
402
+ return null;
403
+ }
404
+ const md = this.ProviderToUse;
405
+ const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
406
+ await contentItem.Load(existing.ID);
407
+ contentItem.Checksum = newHash;
408
+ contentItem.Text = text;
409
+ await contentItem.Save();
410
+ return contentItem;
411
+ }
412
+ // New URL — create the content item, reusing the already-fetched body.
413
+ const md = this.ProviderToUse;
414
+ const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
415
+ contentItem.ContentSourceID = contentSourceParams.contentSourceID;
416
+ contentItem.Name = this.getPathName(url); // Will get overwritten by title later if it exists
417
+ contentItem.Description = this.engine.GetContentItemDescription(contentSourceParams);
418
+ contentItem.ContentTypeID = contentSourceParams.ContentTypeID;
419
+ contentItem.ContentFileTypeID = contentSourceParams.ContentFileTypeID;
420
+ contentItem.ContentSourceTypeID = contentSourceParams.ContentSourceTypeID;
421
+ contentItem.Checksum = newHash;
422
+ contentItem.URL = url;
423
+ contentItem.Text = text;
424
+ await contentItem.Save();
425
+ return contentItem;
426
+ }
164
427
  async fetchPageContent(url) {
165
428
  const { data } = await axios.get(url);
166
429
  return data;
@@ -180,16 +443,44 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
180
443
  return text;
181
444
  }
182
445
  /**
183
- * Given a URL, this function extracts text from a webpage.
184
- * @param url
185
- * @returns The text extracted from the webpage
446
+ * Pure helper: extract clean body text from raw HTML. No IO. Exposed as
447
+ * a protected method so subclasses and unit tests can exercise it without
448
+ * monkey-patching axios.
449
+ */
450
+ extractTextFromHTML(html) {
451
+ const $ = cheerio.load(html);
452
+ const body = $('body')[0];
453
+ if (!body)
454
+ return '';
455
+ return this.getTextWithLineBreaks(body, $);
456
+ }
457
+ /**
458
+ * Fetch a URL once, extract clean text, and compute a stable checksum
459
+ * over that text. Returns both so callers don't have to fetch twice for
460
+ * "is this changed?" + "what's the content?".
461
+ *
462
+ * The checksum is computed over the EXTRACTED body text, NOT the raw
463
+ * HTML, because raw HTML routinely contains incidental changes (server
464
+ * timestamps, CSRF tokens, build hashes, ad rotators) that would
465
+ * falsely report a page as "changed" on every crawl. Hashing the
466
+ * extracted text is what users actually mean by "did the content
467
+ * change?"
468
+ */
469
+ async fetchAndExtract(url) {
470
+ const { data } = await axios.get(url);
471
+ const text = this.extractTextFromHTML(String(data));
472
+ const checksum = await this.engine.getChecksumFromText(text);
473
+ return { text, checksum };
474
+ }
475
+ /**
476
+ * Given a URL, extracts text from a webpage. Kept for external callers
477
+ * that just want the text — internal change-detection now uses
478
+ * `fetchAndExtract` to avoid redundant fetches.
186
479
  */
187
480
  async parseWebPage(url) {
188
481
  try {
189
482
  const pageContent = await this.fetchPageContent(url);
190
- const $ = cheerio.load(pageContent);
191
- const text = this.getTextWithLineBreaks($('body')[0], $);
192
- return text;
483
+ return this.extractTextFromHTML(pageContent);
193
484
  }
194
485
  catch (error) {
195
486
  console.error(`Error processing ${url}:`, error);
@@ -197,50 +488,67 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
197
488
  }
198
489
  }
199
490
  /**
200
- * Given a root URL that corresponds to a content source, retrieve all the links in accordance to the crawl settings.
201
- * If the crawl settings are set to crawl other sites in the top level domain, then all links in the top level domain will be retrieved.
202
- * If the crawl settings are set to crawl sites in lower level domains, then function is recursively called to retrieve all links in the lower level domains.
203
- * @param url
204
- * @returns
491
+ * Streaming variant: yields each newly-discovered URL as the crawler finds it,
492
+ * so downstream consumers (the content-item streamer that feeds the LLM
493
+ * batcher) can start working before discovery completes. This is the
494
+ * canonical implementation; `getAllLinksFromContentSource` below is a
495
+ * backwards-compatible array-collecting wrapper.
205
496
  */
206
- async getAllLinksFromContentSource(url, rootURL, regex) {
497
+ async *streamAllLinksFromContentSource(url, rootURL, regex) {
498
+ // Start each content source with a clean visited set — otherwise URLs
499
+ // found for one source silently get deduped away when the next source
500
+ // is crawled.
501
+ this.visitedURLs = new Set();
502
+ // Normalize the seed URL once so all downstream comparisons share the same form.
503
+ const seedURL = this.normalizeURL(url);
207
504
  try {
208
- await this.getLowerLevelLinks(url, rootURL, this.MaxDepth, new Set(), regex);
209
- await this.getTopLevelLinks(url, this.getBasePath(url));
210
- return Array.from(this.visitedURLs);
505
+ yield* this.streamLowerLevelLinks(seedURL, rootURL, this.MaxDepth, new Set(), regex);
506
+ yield* this.streamTopLevelLinks(seedURL, this.getBasePath(seedURL), regex);
211
507
  }
212
508
  catch (e) {
213
509
  console.error(`Failed to get links from ${url}`);
214
- return [];
215
510
  }
216
511
  }
217
512
  /**
218
- * For a given URL, retrieves all other links at that top level domain.
219
- * @param url
220
- * @param rootURL
221
- * @param visitedURLs
222
- * @returns
513
+ * Backwards-compatible array form. Drains the streaming variant.
223
514
  */
224
- async getTopLevelLinks(url, rootURL) {
515
+ async getAllLinksFromContentSource(url, rootURL, regex) {
516
+ const collected = [];
517
+ for await (const link of this.streamAllLinksFromContentSource(url, rootURL, regex)) {
518
+ collected.push(link);
519
+ }
520
+ return collected;
521
+ }
522
+ /**
523
+ * Streaming variant of getTopLevelLinks — yields each URL it adds to the
524
+ * visited set so the LLM batcher gets fed in real time.
525
+ */
526
+ async *streamTopLevelLinks(url, rootURL, regex) {
225
527
  if (!this.CrawlOtherSitesInTopLevelDomain) {
226
- this.visitedURLs.add(url);
528
+ // Seed URL still gets yielded so the processSingleURL pipeline runs on it.
529
+ if (!this.visitedURLs.has(url)) {
530
+ this.visitedURLs.add(url);
531
+ yield url;
532
+ }
227
533
  return;
228
534
  }
229
- // If we have already visited this URL, return an empty array
535
+ // Skip URLs that were already visited, fail validation, or are at the highest-level domain.
230
536
  if (this.visitedURLs.has(url) || !await this.urlIsValid(url) || this.isHighestDomain(url)) {
231
537
  return;
232
538
  }
233
539
  this.visitedURLs.add(url);
540
+ yield url;
541
+ const discovered = [];
234
542
  try {
235
543
  const { data } = await axios.get(url);
236
544
  const $ = cheerio.load(data);
237
- // Get all links on the page for the current URL
238
545
  $('a').each((_, element) => {
239
546
  const link = $(element).attr('href');
240
547
  if (link) {
241
- const newURL = new URL(link, url).href;
242
- if (newURL.startsWith(rootURL) && !this.visitedURLs.has(newURL)) {
548
+ const newURL = this.normalizeURL(new URL(link, url).href);
549
+ if (newURL.startsWith(rootURL) && !this.visitedURLs.has(newURL) && regex.test(newURL)) {
243
550
  this.visitedURLs.add(newURL);
551
+ discovered.push(newURL);
244
552
  }
245
553
  }
246
554
  });
@@ -250,6 +558,19 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
250
558
  console.error(`Failed to get links from ${url}`);
251
559
  return;
252
560
  }
561
+ // Yield the page's links AFTER the await/delay completes so they're emitted
562
+ // outside the cheerio sync callback (which can't yield).
563
+ for (const newURL of discovered) {
564
+ yield newURL;
565
+ }
566
+ }
567
+ /**
568
+ * Backwards-compatible void form. Drains the streaming variant (links go
569
+ * into `visitedURLs` as a side effect of streamTopLevelLinks).
570
+ */
571
+ async getTopLevelLinks(url, rootURL, regex) {
572
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
573
+ for await (const _ of this.streamTopLevelLinks(url, rootURL, regex)) { /* drain */ }
253
574
  }
254
575
  /**
255
576
  * Simple check to see if the URL is at the highest level domain.
@@ -288,6 +609,31 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
288
609
  throw e;
289
610
  }
290
611
  }
612
+ /**
613
+ * Normalize a URL for use as a dedup key in `visitedURLs`. Conservative
614
+ * normalization that catches the common variations without risking the merge
615
+ * of two semantically distinct pages:
616
+ * - drops the fragment (always client-side per RFC 3986)
617
+ * - collapses trailing slash on the path (except the root "/")
618
+ * - sorts query parameters for stable equality
619
+ * - host is already lower-cased by URL parser
620
+ * Path case is intentionally preserved — RFC 3986 says paths are case-sensitive
621
+ * and some servers (wikis, certain Linux file fronts) actually treat them that way.
622
+ */
623
+ normalizeURL(href) {
624
+ try {
625
+ const u = new URL(href);
626
+ u.hash = '';
627
+ if (u.pathname.length > 1 && u.pathname.endsWith('/')) {
628
+ u.pathname = u.pathname.slice(0, -1);
629
+ }
630
+ u.searchParams.sort();
631
+ return u.href;
632
+ }
633
+ catch {
634
+ return href;
635
+ }
636
+ }
291
637
  async urlIsValid(url) {
292
638
  try {
293
639
  const response = await axios.head(url);
@@ -299,53 +645,66 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
299
645
  }
300
646
  }
301
647
  /**
302
- * For a given URL, retrieves all links at lower level domains up to the specified crawl depth.
303
- * @param url
304
- * @param rootURL
305
- * @param crawlDepth
306
- * @param visitedURLs
307
- * @returns
648
+ * Streaming variant of getLowerLevelLinks. Yields each newly-discovered URL
649
+ * once its parent page has been parsed (after the crawl delay), then recurses depth-first into
650
+ * children. This is the canonical implementation — the LLM batcher gets
651
+ * fed in real time during crawl instead of having to wait for the entire
652
+ * recursive discovery to complete.
653
+ *
654
+ * `getLowerLevelLinks` below is a thin backwards-compatible wrapper that
655
+ * drains the stream into a Set.
308
656
  */
309
- async getLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
657
+ async *streamLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
310
658
  try {
311
659
  console.log(`Scraping ${url}`);
312
- // If we have already visited this URL, return an empty array
313
- if (scrapedURLs.has(url) || await this.urlIsValid(url) === false || crawlDepth < 0 || !this.CrawlSitesInLowerLevelDomain) {
314
- return new Set();
660
+ // The Number.isFinite guard protects against accidental NaN/undefined
661
+ // arriving as crawlDepth; without it, `undefined < 0` is false and the
662
+ // recursion runs without a depth ceiling.
663
+ if (scrapedURLs.has(url) || await this.urlIsValid(url) === false || !Number.isFinite(crawlDepth) || crawlDepth < 0 || !this.CrawlSitesInLowerLevelDomain) {
664
+ return;
315
665
  }
316
- let combinedLinks = new Set(); // Combined links from the current URL and all lower level URLs
317
- const extractedLinks = new Set(); // Links extracted from the input URL
666
+ const extractedLinks = [];
318
667
  const { data } = await axios.get(url);
319
668
  const $ = cheerio.load(data);
320
- // Get all links on the page for the current URL
321
669
  $('a').each((_, element) => {
322
670
  const link = $(element).attr('href');
323
671
  if (link) {
324
- const newURL = new URL(link, url).href;
672
+ const newURL = this.normalizeURL(new URL(link, url).href);
325
673
  if (newURL.startsWith(rootURL) && newURL !== url && !this.visitedURLs.has(newURL) && regex.test(newURL)) {
326
- extractedLinks.add(newURL);
674
+ extractedLinks.push(newURL);
327
675
  this.visitedURLs.add(newURL);
328
676
  }
329
677
  }
330
678
  });
331
679
  await this.delay(1000); // Delay to prevent rate limiting
332
680
  scrapedURLs.add(url);
333
- // If we are at the depth limit, return the current set of URLs and don't recurse
334
- if (crawlDepth === 0) {
335
- return extractedLinks;
681
+ // Yield each newly-discovered URL outside the (sync) cheerio callback.
682
+ // Consumers start processing these immediately while we recurse.
683
+ for (const newURL of extractedLinks) {
684
+ yield newURL;
336
685
  }
686
+ // Depth limit — discover this page's links but don't recurse.
687
+ if (crawlDepth === 0)
688
+ return;
337
689
  for (const subLink of extractedLinks) {
338
- //console.log(`Adding ${subLink}`);
339
- const lowerLevelLinks = await this.getLowerLevelLinks(subLink, rootURL, crawlDepth - 1, scrapedURLs, regex);
340
- combinedLinks = new Set([...extractedLinks, ...lowerLevelLinks]);
690
+ yield* this.streamLowerLevelLinks(subLink, rootURL, crawlDepth - 1, scrapedURLs, regex);
341
691
  }
342
- return combinedLinks;
343
692
  }
344
693
  catch (e) {
345
694
  console.error(`Failed to get links from ${url}`);
346
- return new Set();
347
695
  }
348
696
  }
697
+ /**
698
+ * Backwards-compatible Set form. Drains the streaming variant; URLs end up
699
+ * in `this.visitedURLs` as a side effect of streamLowerLevelLinks.
700
+ */
701
+ async getLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
702
+ const out = new Set();
703
+ for await (const link of this.streamLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex)) {
704
+ out.add(link);
705
+ }
706
+ return out;
707
+ }
349
708
  async delay(ms) {
350
709
  return new Promise(resolve => setTimeout(resolve, ms));
351
710
  }