npm - @memberjunction/content-autotagging - Versions diffs - 5.37.0 → 5.39.0 - Mend

@memberjunction/content-autotagging 5.37.0 → 5.39.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/README.md +64 -6
package/dist/Engine/generic/AutotagBaseEngine.d.ts +1 -1
package/dist/Engine/generic/AutotagBaseEngine.d.ts.map +1 -1
package/dist/Engine/generic/AutotagBaseEngine.js +81 -22
package/dist/Engine/generic/AutotagBaseEngine.js.map +1 -1
package/dist/Engine/generic/RunBudget.d.ts +21 -2
package/dist/Engine/generic/RunBudget.d.ts.map +1 -1
package/dist/Engine/generic/RunBudget.js +23 -1
package/dist/Engine/generic/RunBudget.js.map +1 -1
package/dist/Entity/generic/AutotagEntity.d.ts.map +1 -1
package/dist/Entity/generic/AutotagEntity.js +9 -4
package/dist/Entity/generic/AutotagEntity.js.map +1 -1
package/dist/Websites/generic/AutotagWebsite.d.ts +165 -26
package/dist/Websites/generic/AutotagWebsite.d.ts.map +1 -1
package/dist/Websites/generic/AutotagWebsite.js +489 -130
package/dist/Websites/generic/AutotagWebsite.js.map +1 -1
package/package.json +17 -16

package/dist/Websites/generic/AutotagWebsite.js CHANGED Viewed

@@ -9,8 +9,9 @@ var __metadata = (this && this.__metadata) || function (k, v) {
 };
 import { AutotagBase } from '../../Core/index.js';
 import { AutotagBaseEngine } from '../../Engine/index.js';
-import { RegisterClass } from '@memberjunction/global';
-import { RunView } from '@memberjunction/core';
+import { RunBudget } from '../../Engine/generic/RunBudget.js';
+import { RegisterClass, NormalizeUUID } from '@memberjunction/global';
+import { RunView, LogStatus } from '@memberjunction/core';
 import * as cheerio from 'cheerio';
 import axios from 'axios';
 import { URL } from 'url';
@@ -19,9 +20,187 @@ dotenv.config({ quiet: true });
 let AutotagWebsite = class AutotagWebsite extends AutotagBase {
     constructor() {
         super();
+        // Sensible defaults — overridable per content source via ContentSourceParam rows.
+        // CrawlSitesInLowerLevelDomain=true + MaxDepth=2 means we crawl the start URL plus
+        // two levels of in-domain links by default (~root + sections + content pages).
+        // CrawlOtherSitesInTopLevelDomain stays false to avoid accidentally fanning out across
+        // sibling paths of the seed URL unless opted in. applyDefaultCrawlSettings() re-applies all three.
+        this.CrawlOtherSitesInTopLevelDomain = false;
+        this.CrawlSitesInLowerLevelDomain = true;
+        this.MaxDepth = 2;
+        /**
+         * Per-source RunBudget tracker, keyed by normalized source ID. Items
+         * processed in each batch are tallied against the budget of the source
+         * they belong to; when any budget exhausts, the engine's OnAfterBatch
+         * gate returns `continue:false` and the run pauses gracefully. Next
+         * invocation will re-crawl, change-detection will skip the already-
+         * processed pages, and the remaining ones get processed.
+         *
+         * The empty Map assigned here is only a placeholder: setupRunBudgets()
+         * replaces it with a freshly built Map every time it is called.
+         */
+        this.sourceBudgetMap = new Map();
         this.engine = AutotagBaseEngine.Instance;
         this.visitedURLs = new Set();
     }
+    /**
+     * Reset crawl-related instance fields back to the class defaults. Called at the
+     * start of each content source so prior-source overrides don't leak forward.
+     * URLPattern and RootURL default to undefined — derived later if unset.
+     */
+    applyDefaultCrawlSettings() {
+        this.CrawlOtherSitesInTopLevelDomain = false;
+        this.CrawlSitesInLowerLevelDomain = true;
+        this.MaxDepth = 2;
+        this.URLPattern = undefined;
+        this.RootURL = undefined;
+    }
+    /**
+     * NOTE: this comment documents `applyWebsiteConfigFromSource` (defined further
+     * below), not the method that immediately follows it.
+     *
+     * Apply the typed `Configuration.Website` sub-object (if present) to this
+     * crawler instance. Each field is optional — unset values leave the existing
+     * (default) instance value intact.
+     *
+     * NOTE: pluggability — today this is hard-coded for AutotagWebsite. Once we
+     * have more source types that need typed per-instance settings (RSS, Cloud
+     * Storage, etc.), this pattern should be promoted to an
+     * `IConfigurableContentSource<TConfig>` interface where each subclass declares
+     * its typed config sub-object key and shape. For now this is the canonical
+     * shape; other autotaggers can copy it when they need typed knobs.
+     */
+    /**
+     * Apply per-source `ContentSourceParam` rows to this crawler instance. Values
+     * stored in the DB are strings, so we coerce per-key to the right runtime type
+     * instead of bulk-assigning (which previously stuffed strings into number /
+     * boolean fields and relied on JS coercion at use sites).
+     *
+     * Unknown keys are silently ignored — same gate as the prior `if (key in this)`
+     * check, just made explicit.
+     */
+    overlayCrawlParamsFromMap(params) {
+        const maxDepth = params.get('MaxDepth');
+        if (maxDepth != null) {
+            const n = this.coerceNumber(maxDepth);
+            if (n != null)
+                this.MaxDepth = n;
+        }
+        const lower = params.get('CrawlSitesInLowerLevelDomain');
+        if (lower != null)
+            this.CrawlSitesInLowerLevelDomain = this.coerceBoolean(lower);
+        const other = params.get('CrawlOtherSitesInTopLevelDomain');
+        if (other != null)
+            this.CrawlOtherSitesInTopLevelDomain = this.coerceBoolean(other);
+        const pattern = params.get('URLPattern');
+        // RegExp values come pre-compiled by the engine — store the source text
+        // so the crawler's `new RegExp(this.URLPattern)` call stays consistent.
+        if (pattern instanceof RegExp) {
+            this.URLPattern = pattern.source;
+        }
+        else if (typeof pattern === 'string' && pattern.length > 0) {
+            this.URLPattern = pattern;
+        }
+        const root = params.get('RootURL');
+        if (typeof root === 'string' && root.length > 0)
+            this.RootURL = root;
+    }
+    coerceBoolean(value) {
+        if (typeof value === 'boolean')
+            return value;
+        if (typeof value === 'string') {
+            const trimmed = value.trim().toLowerCase();
+            return trimmed === 'true' || trimmed === '1' || trimmed === 'yes';
+        }
+        return false;
+    }
+    applyWebsiteConfigFromSource(source) {
+        const w = source.ConfigurationObject?.Website;
+        if (!w)
+            return;
+        if (typeof w.MaxDepth === 'number' && Number.isFinite(w.MaxDepth))
+            this.MaxDepth = w.MaxDepth;
+        if (typeof w.CrawlSitesInLowerLevelDomain === 'boolean')
+            this.CrawlSitesInLowerLevelDomain = w.CrawlSitesInLowerLevelDomain;
+        if (typeof w.CrawlOtherSitesInTopLevelDomain === 'boolean')
+            this.CrawlOtherSitesInTopLevelDomain = w.CrawlOtherSitesInTopLevelDomain;
+        if (typeof w.URLPattern === 'string' && w.URLPattern.length > 0)
+            this.URLPattern = w.URLPattern;
+        if (typeof w.RootURL === 'string' && w.RootURL.length > 0)
+            this.RootURL = w.RootURL;
+    }
+    /**
+     * Build a per-source RunBudget map from each source's ConfigurationObject.
+     * Sources with no budget knobs set still get a RunBudget entry (with all
+     * limits = null) so the OnAfterBatch hook can update item counts uniformly.
+     *
+     * Per-source overrides via ContentSourceParam rows (e.g., MaxItemsPerRun
+     * stored as a param) take precedence over the ConfigurationObject value.
+     */
+    async setupRunBudgets(contentSources) {
+        this.sourceBudgetMap = new Map();
+        for (const source of contentSources) {
+            const id = NormalizeUUID(source.ID);
+            const cfg = source.ConfigurationObject;
+            const params = await this.engine.getContentSourceParams(source, this.contextUser);
+            // ContentSourceParam override beats ConfigurationObject — that
+            // lets the per-source-instance UI knob win over the global
+            // ContentSource defaults.
+            const paramMaxItems = params?.get('MaxItemsPerRun');
+            const paramMaxTokens = params?.get('MaxTokensPerRun');
+            const paramMaxCost = params?.get('MaxCostPerRun');
+            this.sourceBudgetMap.set(id, new RunBudget({
+                MaxItemsPerRun: this.coerceNumber(paramMaxItems) ?? this.readConfigNumber(cfg, 'MaxItemsPerRun'),
+                MaxNewTagsPerRun: this.readConfigNumber(cfg, 'MaxNewTagsPerRun'),
+                MaxNewTagsPerItem: this.readConfigNumber(cfg, 'MaxNewTagsPerItem'),
+                MaxTokensPerRun: this.coerceNumber(paramMaxTokens) ?? this.readConfigNumber(cfg, 'MaxTokensPerRun'),
+                MaxCostPerRun: this.coerceNumber(paramMaxCost) ?? this.readConfigNumber(cfg, 'MaxCostPerRun'),
+            }));
+        }
+    }
+    coerceNumber(value) {
+        if (value == null)
+            return null;
+        if (typeof value === 'number')
+            return Number.isFinite(value) ? value : null;
+        if (typeof value === 'string') {
+            const n = Number(value);
+            return Number.isFinite(n) ? n : null;
+        }
+        return null;
+    }
+    /**
+     * Safely read a numeric budget knob from the typed configuration object.
+     * Returns null when the field is unset or not a finite number.
+     */
+    readConfigNumber(cfg, key) {
+        if (!cfg)
+            return null;
+        return this.coerceNumber(cfg[key]);
+    }
+    /**
+     * Install the engine's OnAfterBatch hook so each batch's items are
+     * counted against the budget of the source they belong to. Returns
+     * `continue:false` from the gate when any source's budget exhausts,
+     * which the engine then translates into a graceful pause.
+     */
+    installBudgetGate() {
+        this.engine.OnAfterBatch = async (batch, _totalProcessed) => {
+            // Tally items per source within this batch.
+            const perSourceCounts = new Map();
+            for (const item of batch) {
+                if (!item.ContentSourceID)
+                    continue;
+                const id = NormalizeUUID(item.ContentSourceID);
+                perSourceCounts.set(id, (perSourceCounts.get(id) ?? 0) + 1);
+            }
+            for (const [id, count] of perSourceCounts) {
+                const budget = this.sourceBudgetMap.get(id);
+                if (!budget)
+                    continue;
+                budget.recordItemsProcessed(count);
+                const verdict = budget.checkBudgets();
+                if (!verdict.ok) {
+                    return { continue: false, reason: `${verdict.reason}: ${verdict.details ?? ''}` };
+                }
+            }
+            return { continue: true };
+        };
+    }
     getContextUser() {
         return this.contextUser;
     }
@@ -36,28 +215,69 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
         this.contextUser = contextUser;
         this.contentSourceTypeID = this.engine.SetSubclassContentSourceType('Website');
         const contentSources = await this.engine.getAllContentSources(this.contextUser, this.contentSourceTypeID);
-        const contentItemsToProcess = await this.SetContentItemsToProcess(contentSources);
-        await this.engine.ExtractTextAndProcessWithLLM(contentItemsToProcess, this.contextUser, undefined, undefined, onProgress);
-        return contentItemsToProcess.length;
+        // Per-source budget setup — produces a RunBudget for each content
+        // source and installs the OnAfterBatch gate on the engine so the
+        // run pauses gracefully when any source exhausts its MaxItemsPerRun /
+        // tokens / cost / tag budget.
+        await this.setupRunBudgets(contentSources);
+        this.installBudgetGate();
+        // Stream content items source-by-source into the LLM batcher. The
+        // crawl phase produces items as soon as they pass change-detection,
+        // and the LLM phase consumes them in batches without waiting for the
+        // last source to finish crawling. Wall-clock time becomes
+        // max(crawl, classify) + a small buffer instead of crawl + classify.
+        let itemsYielded = 0;
+        const streamSource = this;
+        const itemStream = (async function* () {
+            for await (const item of streamSource.streamContentItemsToProcess(contentSources)) {
+                itemsYielded++;
+                yield item;
+            }
+        })();
+        try {
+            await this.engine.ExtractTextAndProcessWithLLM(itemStream, this.contextUser, undefined, undefined, onProgress);
+        }
+        finally {
+            // Clean up engine state — leaving stale hooks around would leak
+            // budget state into the next Autotag invocation on a shared
+            // engine singleton.
+            this.engine.OnAfterBatch = null;
+        }
+        // Surface per-source budget pause reasons in the log so operators can
+        // see why a run stopped short.
+        for (const [sourceID, budget] of this.sourceBudgetMap) {
+            const verdict = budget.checkBudgets();
+            if (!verdict.ok) {
+                LogStatus(`[autotag-website] Source ${sourceID} reached budget: ${verdict.reason} — ${verdict.details ?? ''}`);
+            }
+        }
+        return itemsYielded;
     }
     /**
-     * Given a content source, retrieve all content items associated with the content sources.
-     * The content items are then processed to determine if they have been modified since the last time they were processed or if they are new content items.
-     * @param contentSource
-     * @returns
+     * Streaming variant: yields each new/changed content item as soon as it
+     * passes change detection. Lets the crawl and LLM phases overlap so total
+     * wall-clock time is roughly max(crawl, classify) instead of crawl + classify.
+     *
+     * The canonical implementation lives here; the array-returning
+     * `SetContentItemsToProcess` is a thin collector wrapper around this.
      */
-    async SetContentItemsToProcess(contentSources) {
-        const contentItemsToProcess = [];
-        // If content source parameters were provided, set them. Otherwise, use the default values.
+    async *streamContentItemsToProcess(contentSources) {
         for (const contentSource of contentSources) {
+            // Reset instance state to defaults before applying per-source overrides.
+            // Without this, knobs set on the previous source would leak into the next.
+            this.applyDefaultCrawlSettings();
+            // First overlay: typed Configuration.Website sub-object (the structured editor
+            // in the form writes here). This is the canonical storage for new sources.
+            this.applyWebsiteConfigFromSource(contentSource);
+            // Second overlay: per-source ContentSourceParam rows. These win — legacy
+            // sources configured via the params grid (or anyone who wants a sharper
+            // per-instance override) keep working. We handle each known crawler key
+            // explicitly so the DB-stored string values get the right runtime type
+            // (the prior "bulk dynamic assign" path silently stuffed strings into
+            // number/boolean fields).
             const contentSourceParamsMap = await this.engine.getContentSourceParams(contentSource, this.contextUser);
             if (contentSourceParamsMap) {
-                // Override defaults with content source specific params
-                contentSourceParamsMap.forEach((value, key) => {
-                    if (key in this) {
-                        this[key] = value;
-                    }
-                });
+                this.overlayCrawlParamsFromMap(contentSourceParamsMap);
             }
             const contentSourceParams = {
                 contentSourceID: contentSource.ID,
@@ -68,19 +288,30 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
                 URL: contentSource.URL
             };
             try {
-                // All content items associated with the content source
                 const startURL = contentSourceParams.URL;
-                // root url should be set to this.RootURL if it exists, otherwise it should be set to the base path of the startURL.
                 const rootURL = this.RootURL ? this.RootURL : this.getBasePath(startURL);
-                // regex should be set to this.URLPattern if it exists, otherwise it should be set to match any URL.
-                const regex = this.URLPattern && new RegExp(this.URLPattern) || new RegExp('.*');
-                const allContentItemLinks = await this.getAllLinksFromContentSource(startURL, rootURL, regex);
-                const contentItems = await this.SetNewAndModifiedContentItems(allContentItemLinks, contentSourceParams, this.contextUser);
-                if (contentItems && contentItems.length > 0) {
-                    contentItemsToProcess.push(...contentItems);
+                const regex = (this.URLPattern && new RegExp(this.URLPattern)) || new RegExp('.*');
+                // Consume the URL stream lazily — each `link` arrives as the crawler
+                // discovers it, NOT after the full recursive crawl completes. The
+                // engine's LLM batcher accumulates items as they're yielded here, so
+                // tagging starts firing as soon as the first BatchSize items are ready
+                // (instead of waiting for the entire source to finish crawling).
+                let yieldedForSource = 0;
+                for await (const link of this.streamAllLinksFromContentSource(startURL, rootURL, regex)) {
+                    try {
+                        const item = await this.processSingleURL(link, contentSourceParams);
+                        if (item) {
+                            yieldedForSource++;
+                            yield item;
+                        }
+                    }
+                    catch (e) {
+                        // Per-URL failures are isolated — log and keep going so a single
+                        // bad page doesn't poison the rest of the source.
+                        console.error(`[autotag-website] Failed to process URL ${link}:`, e);
+                    }
                 }
-                else {
-                    // No content items found to process
+                if (yieldedForSource === 0) {
                     console.log(`No content items found to process for content source: ${contentSource.Get('Name')}`);
                 }
             }
@@ -88,6 +319,20 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
                 console.error(`Failed to process content source: ${contentSource.Get('Name')}`);
             }
         }
+    }
+    /**
+     * Given a content source, retrieve all content items associated with the content sources.
+     * The content items are then processed to determine if they have been modified since the
+     * last time they were processed or if they are new content items.
+     *
+     * Backwards-compatible array form. Internally drains the streaming variant
+     * so there is exactly one implementation of the change-detection logic.
+     *
+     * @param contentSources Content sources, forwarded unchanged to
+     *        `streamContentItemsToProcess`.
+     * @returns Array holding every item the stream yielded, in yield order.
+     */
+    async SetContentItemsToProcess(contentSources) {
+        const contentItemsToProcess = [];
+        for await (const item of this.streamContentItemsToProcess(contentSources)) {
+            contentItemsToProcess.push(item);
+        }
         return contentItemsToProcess;
     }
     /**
@@ -99,61 +344,18 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
      * @param contextUser
      * @returns
      */
+    /**
+     * Backwards-compatible batch form: process an explicit list of URLs and
+     * return all new/changed content items as an array. New code should prefer
+     * `streamContentItemsToProcess` which pipelines into the LLM batcher.
+     */
     async SetNewAndModifiedContentItems(contentItemLinks, contentSourceParams, contextUser) {
         const addedContentItems = [];
-        for (const contentItemLink of contentItemLinks) {
+        for (const link of contentItemLinks) {
             try {
-                const newHash = await this.engine.getChecksumFromURL(contentItemLink);
-                const rv = new RunView();
-                const results = await rv.RunViews([
-                    {
-                        EntityName: 'MJ: Content Items',
-                        ExtraFilter: `Checksum = '${newHash}'`,
-                        ResultType: 'entity_object'
-                    },
-                    {
-                        EntityName: 'MJ: Content Items',
-                        ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND URL = '${contentItemLink}'`,
-                        ResultType: 'entity_object'
-                    }
-                ], this.contextUser);
-                const contentItemResultsWithChecksum = results[0];
-                const contentItemResultsWithURL = results[1];
-                if (contentItemResultsWithChecksum.Success && contentItemResultsWithChecksum.Results.length) {
-                    // We found the checksum so this content item has not changed since we last accessed it, do nothing
-                    continue;
-                }
-                else if (contentItemResultsWithURL.Success && contentItemResultsWithURL.Results.length) {
-                    // This content item already exists, update the hash and last updated date
-                    const contentItemResult = contentItemResultsWithURL.Results[0];
-                    const lastStoredHash = contentItemResult.Checksum;
-                    if (lastStoredHash !== newHash) {
-                        // This content item has changed since we last access it, update the hash and last updated date
-                        const md = this.ProviderToUse;
-                        const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
-                        contentItem.Load(contentItemResult.ID);
-                        contentItem.Checksum = newHash;
-                        contentItem.Text = await this.parseWebPage(contentItemLink);
-                        await contentItem.Save();
-                        addedContentItems.push(contentItem); // Content item was modified, add to list
-                    }
-                }
-                else {
-                    // This content item does not exist, add it
-                    const md = this.ProviderToUse;
-                    const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
-                    contentItem.ContentSourceID = contentSourceParams.contentSourceID;
-                    contentItem.Name = this.getPathName(contentItemLink); // Will get overwritten by title later if it exists
-                    contentItem.Description = this.engine.GetContentItemDescription(contentSourceParams);
-                    contentItem.ContentTypeID = contentSourceParams.ContentTypeID;
-                    contentItem.ContentFileTypeID = contentSourceParams.ContentFileTypeID;
-                    contentItem.ContentSourceTypeID = contentSourceParams.ContentSourceTypeID;
-                    contentItem.Checksum = await this.engine.getChecksumFromURL(contentItemLink);
-                    contentItem.URL = contentItemLink;
-                    contentItem.Text = await this.parseWebPage(contentItemLink);
-                    await contentItem.Save();
-                    addedContentItems.push(contentItem); // Content item was added, add to list
-                }
+                const item = await this.processSingleURL(link, contentSourceParams);
+                if (item)
+                    addedContentItems.push(item);
             }
             catch (e) {
                 console.log(e);
@@ -161,6 +363,67 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
         }
         return addedContentItems;
     }
+    /**
+     * Process one URL through the change-detection pipeline. Returns the
+     * MJContentItem if the page is new or changed (caller should hand it off
+     * to the LLM stage), or `null` if the page is unchanged.
+     *
+     * One axios.get per URL: the same response body provides both the
+     * change-detection hash and the page text. Compare with `byChecksum`
+     * scoped to the current ContentSource so identical boilerplate (404 pages,
+     * shared error templates) from a *different* source can't silently mask
+     * legitimate pages here.
+     */
+    async processSingleURL(url, contentSourceParams) {
+        const { text, checksum: newHash } = await this.fetchAndExtract(url);
+        const rv = new RunView();
+        const results = await rv.RunViews([
+            {
+                EntityName: 'MJ: Content Items',
+                ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND Checksum = '${newHash}'`,
+                ResultType: 'entity_object'
+            },
+            {
+                EntityName: 'MJ: Content Items',
+                ExtraFilter: `ContentSourceID = '${contentSourceParams.contentSourceID}' AND URL = '${url}'`,
+                ResultType: 'entity_object'
+            }
+        ], this.contextUser);
+        const byChecksum = results[0];
+        const byURL = results[1];
+        // Same content already in DB for this source — unchanged, skip.
+        if (byChecksum.Success && byChecksum.Results.length) {
+            return null;
+        }
+        // URL exists for this source but content has drifted — update in place.
+        if (byURL.Success && byURL.Results.length) {
+            const existing = byURL.Results[0];
+            if (existing.Checksum === newHash) {
+                return null;
+            }
+            const md = this.ProviderToUse;
+            const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
+            await contentItem.Load(existing.ID);
+            contentItem.Checksum = newHash;
+            contentItem.Text = text;
+            await contentItem.Save();
+            return contentItem;
+        }
+        // New URL — create the content item, reusing the already-fetched body.
+        const md = this.ProviderToUse;
+        const contentItem = await md.GetEntityObject('MJ: Content Items', this.contextUser);
+        contentItem.ContentSourceID = contentSourceParams.contentSourceID;
+        contentItem.Name = this.getPathName(url); // Will get overwritten by title later if it exists
+        contentItem.Description = this.engine.GetContentItemDescription(contentSourceParams);
+        contentItem.ContentTypeID = contentSourceParams.ContentTypeID;
+        contentItem.ContentFileTypeID = contentSourceParams.ContentFileTypeID;
+        contentItem.ContentSourceTypeID = contentSourceParams.ContentSourceTypeID;
+        contentItem.Checksum = newHash;
+        contentItem.URL = url;
+        contentItem.Text = text;
+        await contentItem.Save();
+        return contentItem;
+    }
     async fetchPageContent(url) {
         const { data } = await axios.get(url);
         return data;
@@ -180,16 +443,44 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
         return text;
     }
     /**
-     * Given a URL, this function extracts text from a webpage.
-     * @param url
-     * @returns The text extracted from the webpage
+     * Pure helper: extract clean body text from raw HTML. No IO. Exposed as
+     * a protected method so subclasses and unit tests can exercise it without
+     * monkey-patching axios.
+     */
+    extractTextFromHTML(html) {
+        const $ = cheerio.load(html);
+        const body = $('body')[0];
+        if (!body)
+            return '';
+        return this.getTextWithLineBreaks(body, $);
+    }
+    /**
+     * Fetch a URL once, extract clean text, and compute a stable checksum
+     * over that text. Returns both so callers don't have to fetch twice for
+     * "is this changed?" + "what's the content?".
+     *
+     * The checksum is computed over the EXTRACTED body text, NOT the raw
+     * HTML, because raw HTML routinely contains incidental changes (server
+     * timestamps, CSRF tokens, build hashes, ad rotators) that would
+     * falsely report a page as "changed" on every crawl. Hashing the
+     * extracted text is what users actually mean by "did the content
+     * change?"
+     */
+    async fetchAndExtract(url) {
+        const { data } = await axios.get(url);
+        const text = this.extractTextFromHTML(String(data));
+        const checksum = await this.engine.getChecksumFromText(text);
+        return { text, checksum };
+    }
+    /**
+     * Given a URL, extracts text from a webpage. Kept for external callers
+     * that just want the text — internal change-detection now uses
+     * `fetchAndExtract` to avoid redundant fetches.
      */
     async parseWebPage(url) {
         try {
             const pageContent = await this.fetchPageContent(url);
-            const $ = cheerio.load(pageContent);
-            const text = this.getTextWithLineBreaks($('body')[0], $);
-            return text;
+            return this.extractTextFromHTML(pageContent);
         }
         catch (error) {
             console.error(`Error processing ${url}:`, error);
@@ -197,50 +488,67 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
         }
     }
     /**
-     * Given a root URL that corresponds to a content source, retrieve all the links in accordance to the crawl settings.
-     * If the crawl settings are set to crawl other sites in the top level domain, then all links in the top level domain will be retrieved.
-     * If the crawl settings are set to crawl sites in lower level domains, then function is recursively called to retrieve all links in the lower level domains.
-     * @param url
-     * @returns
+     * Streaming variant: yields each newly-discovered URL as the crawler finds it,
+     * so downstream consumers (the content-item streamer that feeds the LLM
+     * batcher) can start working before discovery completes. This is the
+     * canonical implementation; `getAllLinksFromContentSource` below is a
+     * backwards-compatible array-collecting wrapper.
      */
-    async getAllLinksFromContentSource(url, rootURL, regex) {
+    async *streamAllLinksFromContentSource(url, rootURL, regex) {
+        // Start each content source with a clean visited set — otherwise URLs
+        // found for one source silently get deduped away when the next source
+        // is crawled.
+        this.visitedURLs = new Set();
+        // Normalize the seed URL once so all downstream comparisons share the same form.
+        const seedURL = this.normalizeURL(url);
         try {
-            await this.getLowerLevelLinks(url, rootURL, this.MaxDepth, new Set(), regex);
-            await this.getTopLevelLinks(url, this.getBasePath(url));
-            return Array.from(this.visitedURLs);
+            yield* this.streamLowerLevelLinks(seedURL, rootURL, this.MaxDepth, new Set(), regex);
+            yield* this.streamTopLevelLinks(seedURL, this.getBasePath(seedURL), regex);
         }
         catch (e) {
             console.error(`Failed to get links from ${url}`);
-            return [];
         }
     }
     /**
-     * For a given URL, retrieves all other links at that top level domain.
-     * @param url
-     * @param rootURL
-     * @param visitedURLs
-     * @returns
+     * Backwards-compatible array form. Drains the streaming variant.
      */
-    async getTopLevelLinks(url, rootURL) {
+    async getAllLinksFromContentSource(url, rootURL, regex) {
+        const collected = [];
+        for await (const link of this.streamAllLinksFromContentSource(url, rootURL, regex)) {
+            collected.push(link);
+        }
+        return collected;
+    }
+    /**
+     * Streaming variant of getTopLevelLinks — yields each URL it adds to the
+     * visited set so the LLM batcher gets fed in real time.
+     */
+    async *streamTopLevelLinks(url, rootURL, regex) {
         if (!this.CrawlOtherSitesInTopLevelDomain) {
-            this.visitedURLs.add(url);
+            // Seed URL still gets yielded so the processSingleURL pipeline runs on it.
+            if (!this.visitedURLs.has(url)) {
+                this.visitedURLs.add(url);
+                yield url;
+            }
             return;
         }
-        // If we have already visited this URL, return an empty array
+        // Nothing to do if the URL was already visited, fails validation, or is the highest-level domain.
         if (this.visitedURLs.has(url) || !await this.urlIsValid(url) || this.isHighestDomain(url)) {
             return;
         }
         this.visitedURLs.add(url);
+        yield url;
+        const discovered = [];
         try {
             const { data } = await axios.get(url);
             const $ = cheerio.load(data);
-            // Get all links on the page for the current URL
             $('a').each((_, element) => {
                 const link = $(element).attr('href');
                 if (link) {
-                    const newURL = new URL(link, url).href;
-                    if (newURL.startsWith(rootURL) && !this.visitedURLs.has(newURL)) {
+                    const newURL = this.normalizeURL(new URL(link, url).href);
+                    if (newURL.startsWith(rootURL) && !this.visitedURLs.has(newURL) && regex.test(newURL)) {
                         this.visitedURLs.add(newURL);
+                        discovered.push(newURL);
                     }
                 }
             });
@@ -250,6 +558,19 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
             console.error(`Failed to get links from ${url}`);
             return;
         }
+        // Yield the page's links AFTER the await/delay completes so they're emitted
+        // outside the cheerio sync callback (which can't yield).
+        for (const newURL of discovered) {
+            yield newURL;
+        }
+    }
+    /**
+     * Backwards-compatible void form. Drains the streaming variant (links go
+     * into `visitedURLs` as a side effect of streamTopLevelLinks).
+     */
+    async getTopLevelLinks(url, rootURL, regex) {
+        // eslint-disable-next-line @typescript-eslint/no-unused-vars
+        for await (const _ of this.streamTopLevelLinks(url, rootURL, regex)) { /* drain */ }
     }
     /**
      * Simple check to see if the URL is at the highest level domain.
@@ -288,6 +609,31 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
             throw e;
         }
     }
+    /**
+     * Normalize a URL for use as a dedup key in `visitedURLs`. Conservative
+     * normalization that catches the common variations without risking the merge
+     * of two semantically distinct pages:
+     *   - drops the fragment (always client-side per RFC 3986)
+     *   - collapses trailing slash on the path (except the root "/")
+     *   - sorts query parameters for stable equality
+     *   - host is already lower-cased by URL parser
+     * Path case is intentionally preserved — RFC 3986 says paths are case-sensitive
+     * and some servers (wikis, certain Linux file fronts) actually treat them that way.
+     */
+    normalizeURL(href) {
+        try {
+            const u = new URL(href);
+            u.hash = '';
+            if (u.pathname.length > 1 && u.pathname.endsWith('/')) {
+                u.pathname = u.pathname.slice(0, -1);
+            }
+            u.searchParams.sort();
+            return u.href;
+        }
+        catch {
+            return href;
+        }
+    }
     async urlIsValid(url) {
         try {
             const response = await axios.head(url);
@@ -299,53 +645,66 @@ let AutotagWebsite = class AutotagWebsite extends AutotagBase {
         }
     }
     /**
-     * For a given URL, retrieves all links at lower level domains up to the specified crawl depth.
-     * @param url
-     * @param rootURL
-     * @param crawlDepth
-     * @param visitedURLs
-     * @returns
+     * Streaming variant of getLowerLevelLinks. Yields a page's newly-discovered
+     * URLs once that page has been parsed and the rate-limit delay has elapsed,
+     * then recurses depth-first into them. This is the canonical implementation —
+     * the LLM batcher gets fed during the crawl instead of having to wait for the
+     * entire recursive discovery to complete.
+     *
+     * `getLowerLevelLinks` below is a thin backwards-compatible wrapper that
+     * drains the stream into a Set.
      */
-    async getLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
+    async *streamLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
         try {
             console.log(`Scraping ${url}`);
-            // If we have already visited this URL, return an empty array
-            if (scrapedURLs.has(url) || await this.urlIsValid(url) === false || crawlDepth < 0 || !this.CrawlSitesInLowerLevelDomain) {
-                return new Set();
+            // The Number.isFinite guard protects against accidental NaN/undefined
+            // arriving as crawlDepth — without it, `undefined < 0` is false and the
+            // recursion runs without a depth ceiling.
+            if (scrapedURLs.has(url) || await this.urlIsValid(url) === false || !Number.isFinite(crawlDepth) || crawlDepth < 0 || !this.CrawlSitesInLowerLevelDomain) {
+                return;
             }
-            let combinedLinks = new Set(); // Combined links from the current URL and all lower level URLs
-            const extractedLinks = new Set(); // Links extracted from the input URL
+            const extractedLinks = [];
             const { data } = await axios.get(url);
             const $ = cheerio.load(data);
-            // Get all links on the page for the current URL
             $('a').each((_, element) => {
                 const link = $(element).attr('href');
                 if (link) {
-                    const newURL = new URL(link, url).href;
+                    const newURL = this.normalizeURL(new URL(link, url).href);
                     if (newURL.startsWith(rootURL) && newURL !== url && !this.visitedURLs.has(newURL) && regex.test(newURL)) {
-                        extractedLinks.add(newURL);
+                        extractedLinks.push(newURL);
                         this.visitedURLs.add(newURL);
                     }
                 }
             });
             await this.delay(1000); // Delay to prevent rate limiting
             scrapedURLs.add(url);
-            // If we are at the depth limit, return the current set of URLs and don't recurse
-            if (crawlDepth === 0) {
-                return extractedLinks;
+            // Yield each newly-discovered URL outside the (sync) cheerio callback.
+            // Consumers start processing these immediately while we recurse.
+            for (const newURL of extractedLinks) {
+                yield newURL;
             }
+            // Depth limit — discover this page's links but don't recurse.
+            if (crawlDepth === 0)
+                return;
             for (const subLink of extractedLinks) {
-                //console.log(`Adding ${subLink}`);
-                const lowerLevelLinks = await this.getLowerLevelLinks(subLink, rootURL, crawlDepth - 1, scrapedURLs, regex);
-                combinedLinks = new Set([...extractedLinks, ...lowerLevelLinks]);
+                yield* this.streamLowerLevelLinks(subLink, rootURL, crawlDepth - 1, scrapedURLs, regex);
             }
-            return combinedLinks;
         }
         catch (e) {
             console.error(`Failed to get links from ${url}`);
-            return new Set();
         }
     }
+    /**
+     * Backwards-compatible Set form. Drains the streaming variant; URLs end up
+     * in `this.visitedURLs` as a side effect of streamLowerLevelLinks.
+     */
+    async getLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex) {
+        const out = new Set();
+        for await (const link of this.streamLowerLevelLinks(url, rootURL, crawlDepth, scrapedURLs, regex)) {
+            out.add(link);
+        }
+        return out;
+    }
     async delay(ms) {
         return new Promise(resolve => setTimeout(resolve, ms));
     }