@dpopsuev/web-spider 0.10.4 → 0.10.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batch.js.map +1 -0
- package/dist/cache.js.map +1 -0
- package/dist/convert.js.map +1 -0
- package/dist/crawl.js.map +1 -0
- package/dist/disk-cache.js.map +1 -0
- package/dist/graph.js.map +1 -0
- package/dist/index.js.map +1 -0
- package/dist/parse.js.map +1 -0
- package/dist/playwright.js.map +1 -0
- package/dist/ports.js.map +1 -0
- package/dist/robots.js.map +1 -0
- package/dist/search.js.map +1 -0
- package/dist/sitemap.js.map +1 -0
- package/dist/spider.js.map +1 -0
- package/dist/throttle.js.map +1 -0
- package/dist/tree.js.map +1 -0
- package/dist/types.js.map +1 -0
- package/dist/views.js.map +1 -0
- package/dist/web-search.js.map +1 -0
- package/package.json +2 -1
- package/fixtures/article-with-images.html +0 -94
- package/fixtures/gh-shell.html +0 -32
- package/fixtures/guide-ai-agents-web-scraping.json +0 -552
- package/fixtures/images/large.jpg +0 -0
- package/fixtures/images/small.jpg +0 -0
- package/fixtures/images/tiny.png +0 -0
- package/fixtures/quotes-index.json +0 -40
- package/scripts/fetch-guide.mjs +0 -25
- package/src/cache.ts +0 -99
- package/src/convert.ts +0 -161
- package/src/crawl.ts +0 -186
- package/src/disk-cache.ts +0 -228
- package/src/graph.ts +0 -189
- package/src/index.ts +0 -74
- package/src/parse.ts +0 -154
- package/src/playwright.ts +0 -193
- package/src/ports.ts +0 -131
- package/src/robots.ts +0 -121
- package/src/search.ts +0 -173
- package/src/sitemap.ts +0 -67
- package/src/spider.ts +0 -475
- package/src/throttle.ts +0 -118
- package/src/tree.ts +0 -379
- package/src/types.ts +0 -225
- package/src/views.ts +0 -42
- package/src/web-search.ts +0 -548
- package/test/convert-images.test.ts +0 -69
- package/test/disk-cache-images.test.ts +0 -193
- package/test/engine-registry.test.ts +0 -114
- package/test/exports.test.ts +0 -124
- package/test/get-chunk.test.ts +0 -115
- package/test/images-integration.test.ts +0 -359
- package/test/improvements.test.ts +0 -279
- package/test/inbound-count.test.ts +0 -111
- package/test/lean.test.ts +0 -105
- package/test/playwright.test.ts +0 -128
- package/test/ports.test.ts +0 -161
- package/test/search.test.ts +0 -219
- package/test/spider-images.test.ts +0 -180
- package/test/spider-unit.test.ts +0 -610
- package/test/tree.test.ts +0 -272
- package/test/types.test.ts +0 -169
- package/test/web-search-integration.test.ts +0 -180
- package/test/web-search.test.ts +0 -305
- package/tsconfig.json +0 -9
- package/tsconfig.test.json +0 -7
- package/vitest.config.ts +0 -8
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"batch.js","sourceRoot":"","sources":["../src/batch.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAcrC;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,IAAc,EAAE,OAAqB,EAAE;IACxE,sEAAsE;IACtE,sEAAsE;IACtE,8DAA8D;IAC9D,MAAM,EAAE,WAAW,GAAG,CAAC,EAAE,OAAO,GAAG,GAAG,EAAE,KAAK,EAAE,UAAU,EACxD,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,YAAY,EAAE,+BAA+B;IAC/E,GAAG,UAAU,EAAE,GAAG,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG,IAAI,GAAG,EAAgC,CAAC;IACxD,MAAM,MAAM,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC;IAClC,IAAI,IAAI,GAAG,CAAC,CAAC;IAEb,+DAA+D;IAC/D,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QAC1B,MAAM,MAAM,GAAG,KAAK,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;QAC/B,IAAI,MAAM,EAAE,CAAC;YACZ,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;YACzB,IAAI,EAAE,CAAC;YACP,UAAU,EAAE,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QACxC,CAAC;aAAM,CAAC;YACP,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACF,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,OAAO,CAAC;IAEzC,qDAAqD;IACrD,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;QACnC,MAAM,OAAO,GAAG,GAAS,EAAE;YAC1B,OAAO,QAAQ,GAAG,WAAW,IAAI,KAAK,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;gBACzD,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC7B,QAAQ,EAAE,CAAC;gBAEX,MAAM,KAAK,GACV,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,OAAO,CAAO,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,OAAO,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;gBAElG,KAAK;qBACH,IAAI,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;qBACnC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE;oBACd,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;oBACvB,KAAK,EAAE,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;gBACvB,CAAC,CAAC;qBACD,KAAK,CAAC,CAAC,GAAY,EAAE,EAAE;oBACvB,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;gBACvE,CAAC,CAAC;qBACD,OAAO,CAAC,GAAG,EAAE;oBACb,IAAI,EAAE,CAAC;oBACP,UAAU,EAAE,CACX,IAAI,EACJ,MAAM,CAAC,MAAM,EACb,GAAG,EACH,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAW,CAAC,CAAC,CAAC,SAAS,CAC3E,CAAC;oBACF,QAAQ,EAAE,CAAC;oBACX,IAAI,IAAI,KAAK,MAAM,CAAC,MAAM;wBAAE,OAAO,EAAE,CAAC;;wBACjC,OAAO,EAAE,CAAC;gBAChB,CAAC,CAAC,CAAC;YACL,CAAC;QACF,CAAC,CAAC;QACF,OAAO,EAAE,CAAC;IACX,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AAChB,CAAC","sourcesContent":["import type { SpiderCache } from \"./cache.js\";\nimport type { SpiderOptions } from \"./spider.js\";\nimport { spider } from \"./spider.js\";\nimport type { SpideredPage } from \"./types.js\";\n\nexport interface BatchOptions extends SpiderOptions {\n\t/** Max concurrent fetches (default 3 — be polite) */\n\tconcurrency?: number;\n\t/** Fixed delay in ms between each fetch start (default 300) */\n\tdelayMs?: number;\n\t/** Optional cache — already-cached URLs are skipped */\n\tcache?: SpiderCache;\n\t/** Called after each URL completes (success or failure) */\n\tonProgress?: (done: number, total: number, url: string, error?: Error) => void;\n}\n\n/**\n * Spider multiple URLs concurrently with a bounded semaphore.\n *\n * Returns a Map keyed by URL. Value is either a SpideredPage (success)\n * or an Error (failure). Errors do not poison the batch.\n *\n * Cache integration: if `opts.cache` is provided, cached pages are\n * returned immediately and do not count toward concurrency.\n */\nexport async function batchSpider(urls: string[], opts: BatchOptions = {}): Promise<Map<string, SpideredPage | Error>> {\n\t// Strip crawl-only options that batchSpider doesn't use so they don't\n\t// confuse callers and don't get forwarded to spider() where they'd be\n\t// applied per-call rather than shared (use crawl() for that).\n\tconst { concurrency = 3, delayMs = 300, cache, onProgress,\n\t\tthrottle: _throttle, robotsCache: _robotsCache, // consumed here, not forwarded\n\t\t...spiderOpts } = opts;\n\n\tconst results = new Map<string, SpideredPage | Error>();\n\tconst unique = [...new Set(urls)];\n\tlet done = 0;\n\n\t// Satisfy cache hits synchronously before touching the network\n\tconst toFetch: string[] = [];\n\tfor (const url of unique) {\n\t\tconst cached = cache?.get(url);\n\t\tif (cached) {\n\t\t\tresults.set(url, cached);\n\t\t\tdone++;\n\t\t\tonProgress?.(done, unique.length, url);\n\t\t} else {\n\t\t\ttoFetch.push(url);\n\t\t}\n\t}\n\n\tif (toFetch.length === 0) return results;\n\n\t// Semaphore: at most `concurrency` in-flight at once\n\tlet inFlight = 0;\n\tlet index = 0;\n\n\tawait new Promise<void>((resolve) => {\n\t\tconst tryNext = (): void => {\n\t\t\twhile (inFlight < concurrency && index < toFetch.length) {\n\t\t\t\tconst url = toFetch[index++];\n\t\t\t\tinFlight++;\n\n\t\t\t\tconst delay =\n\t\t\t\t\tdelayMs > 0 ? new Promise<void>((r) => setTimeout(r, delayMs * (index - 1))) : Promise.resolve();\n\n\t\t\t\tdelay\n\t\t\t\t\t.then(() => spider(url, spiderOpts))\n\t\t\t\t\t.then((page) => {\n\t\t\t\t\t\tresults.set(url, page);\n\t\t\t\t\t\tcache?.set(url, page);\n\t\t\t\t\t})\n\t\t\t\t\t.catch((err: unknown) => {\n\t\t\t\t\t\tresults.set(url, err instanceof Error ? err : new Error(String(err)));\n\t\t\t\t\t})\n\t\t\t\t\t.finally(() => {\n\t\t\t\t\t\tdone++;\n\t\t\t\t\t\tonProgress?.(\n\t\t\t\t\t\t\tdone,\n\t\t\t\t\t\t\tunique.length,\n\t\t\t\t\t\t\turl,\n\t\t\t\t\t\t\tresults.get(url) instanceof Error ? (results.get(url) as Error) : undefined,\n\t\t\t\t\t\t);\n\t\t\t\t\t\tinFlight--;\n\t\t\t\t\t\tif (done === unique.length) resolve();\n\t\t\t\t\t\telse tryNext();\n\t\t\t\t\t});\n\t\t\t}\n\t\t};\n\t\ttryNext();\n\t});\n\n\treturn results;\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache.js","sourceRoot":"","sources":["../src/cache.ts"],"names":[],"mappings":"AAeA;;;;;;;;;;;;;;GAcG;AACH,MAAM,OAAO,WAAW;IAKvB,YAAY,OAA2B,EAAE;QAJxB,UAAK,GAA2C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAKpF,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC;QACnC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;IAC3C,CAAC;IAED,6EAA6E;IACrE,GAAG,CAAC,GAAW;QACtB,IAAI,CAAC;YACJ,MAAM,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YACvB,CAAC,CAAC,IAAI,GAAG,EAAE,CAAC;YACZ,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACR,OAAO,GAAG,CAAC;QACZ,CAAC;IACF,CAAC;IAED,GAAG,CAAC,GAAW;QACd,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACxB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,CAAC,KAAK;YAAE,OAAO,SAAS,CAAC;QAC7B,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;YAClC,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YACrB,OAAO,SAAS,CAAC;QAClB,CAAC;QACD,6DAA6D;QAC7D,kEAAkE;QAClE,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACrB,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC;QACtB,OAAO,KAAK,CAAC,IAAI,CAAC;IACnB,CAAC;IAED,GAAG,CAAC,GAAW,EAAE,IAAkB;QAClC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACxB,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1E,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1C,IAAI,MAAM,KAAK,SAAS;gBAAE,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACrD,CAAC;QACD,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;IAC9D,CAAC;IAED,GAAG,CAAC,GAAW;QACd,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,SAAS,CAAC;IACpC,CAAC;IAED,MAAM,CAAC,GAAW;QACjB,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;IAClC,CAAC;IAED,KAAK;QACJ,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAC/D,CAAC;IAED,IAAI,IAAI;QACP,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;IACvC,CAAC;IAED,6DAA6D;IAC7D,MAAM;QACL,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,OAAO,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC;aAC9B,MAAM,CAAC,CAAC,CAAC,EAAmB,EAAE,CAAC,CAAC,KAAK,SAAS,IAAI,CAAC,CAAC,SAAS,GAAG,GAAG,CAAC;aACpE,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC;CACD","sourcesContent":["import type { ICache } from \"./ports.js\";\nimport type { SpideredPage } from \"./types.js\";\n\ninterface CacheEntry {\n\tpage: SpideredPage;\n\texpiresAt: number;\n}\n\nexport interface SpiderCacheOptions {\n\t/** Maximum number of pages to hold (default 500) */\n\tmaxSize?: number;\n\t/** Time-to-live in milliseconds (default 30 min) */\n\tttlMs?: number;\n}\n\n/**\n * LRU cache for spidered pages.\n *\n * Implements the Identity Map pattern from Local Materialized View:\n * exactly one entry per normalised URL — duplicate fetches never happen.\n *\n * Uses a plain object (Object.create(null)) for storage rather than a Map.\n * Plain objects carry no realm-specific internal slots, so they are safe\n * across V8 context (realm) boundaries — e.g. when the cache is constructed\n * in an ESM module realm but called from a jiti VM-sandbox realm.\n *\n * JavaScript objects maintain insertion order for string keys (ES2015+),\n * so delete-then-reinsert gives the same LRU-tail promotion semantics as a\n * Map without any cross-realm risk.\n */\nexport class SpiderCache implements ICache<string, SpideredPage> {\n\tprivate readonly store: Record<string, CacheEntry | undefined> = Object.create(null);\n\tprivate readonly maxSize: number;\n\tprivate readonly ttlMs: number;\n\n\tconstructor(opts: SpiderCacheOptions = {}) {\n\t\tthis.maxSize = opts.maxSize ?? 500;\n\t\tthis.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;\n\t}\n\n\t/** Normalise a URL so http/https and trailing slashes don't cause misses. */\n\tprivate key(url: string): string {\n\t\ttry {\n\t\t\tconst u = new URL(url);\n\t\t\tu.hash = \"\";\n\t\t\treturn u.toString().replace(/\\/$/, \"\");\n\t\t} catch {\n\t\t\treturn url;\n\t\t}\n\t}\n\n\tget(url: string): SpideredPage | undefined {\n\t\tconst k = this.key(url);\n\t\tconst entry = this.store[k];\n\t\tif (!entry) return undefined;\n\t\tif (Date.now() > entry.expiresAt) {\n\t\t\tdelete this.store[k];\n\t\t\treturn undefined;\n\t\t}\n\t\t// Promote to tail (most-recently-used) by delete + reinsert.\n\t\t// Object insertion order is preserved for string keys in ES2015+.\n\t\tdelete this.store[k];\n\t\tthis.store[k] = entry;\n\t\treturn entry.page;\n\t}\n\n\tset(url: string, page: SpideredPage): void {\n\t\tconst k = this.key(url);\n\t\tif (Object.keys(this.store).length >= this.maxSize && !(k in this.store)) {\n\t\t\tconst lruKey = Object.keys(this.store)[0];\n\t\t\tif (lruKey !== undefined) delete this.store[lruKey];\n\t\t}\n\t\tthis.store[k] = { page, expiresAt: Date.now() + this.ttlMs };\n\t}\n\n\thas(url: string): boolean {\n\t\treturn this.get(url) !== undefined;\n\t}\n\n\tdelete(url: string): void {\n\t\tdelete this.store[this.key(url)];\n\t}\n\n\tclear(): void {\n\t\tfor (const k of Object.keys(this.store)) delete this.store[k];\n\t}\n\n\tget size(): number {\n\t\treturn Object.keys(this.store).length;\n\t}\n\n\t/** All currently valid pages (does not update LRU order). */\n\tvalues(): SpideredPage[] {\n\t\tconst now = Date.now();\n\t\treturn Object.values(this.store)\n\t\t\t.filter((e): e is CacheEntry => e !== undefined && e.expiresAt > now)\n\t\t\t.map((e) => e.page);\n\t}\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"convert.js","sourceRoot":"","sources":["../src/convert.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,eAAe,MAAM,UAAU,CAAC;AAUvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAExF,uEAAuE;AACvE,qEAAqE;AACpE,QAAyC,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;AAE7D,0EAA0E;AAC1E,4DAA4D;AAC5D,QAAQ,CAAC,OAAO,CAAC,cAAc,EAAE;IAChC,MAAM,EAAE,KAAK;IACb,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;CACrB,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AACjG,kBAAmD,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;AAgBvE,0DAA0D;AAC1D,MAAM,UAAU,UAAU,CAAC,IAAY,EAAE,IAAwB;IAChE,IAAI,IAAI,EAAE,UAAU;QAAE,OAAO,kBAAkB,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC/D,OAAO,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;AAChC,CAAC;AAED,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAE/B,+DAA+D;AAC/D,MAAM,UAAU,iBAAiB,CAAC,KAAe;IAChD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QACtB,IAAI,CAAC,CAAC;YAAE,SAAS;QACjB,IAAI,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC;YAAE,OAAO,MAAM,CAAC;QACvC,IAAI,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,OAAO,OAAO,CAAC;QACtC,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;YAAE,OAAO,MAAM,CAAC;QAC1D,IAAI,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,OAAO,YAAY,CAAC;QAC3C,OAAO,MAAM,CAAC;IACf,CAAC;IACD,OAAO,MAAM,CAAC;AACf,CAAC;AAED,8EAA8E;AAC9E,WAAW;AACX,8EAA8E;AAE9E;;;;;;;GAOG;AACH,MAAM,UAAU,KAAK,CAAC,QAAgB,EAAE,OAAe;IACtD,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEnC,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,IAAI,MAAM,GAAa,EAAE,CAAC;IAC1B,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,OAAO,GAAG,KAAK,CAAC;IAEpB,MAAM,KAAK,GAAG,GAAS,EAAE;QACxB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,CAAC,IAAI;YAAE,OAAO;QAClB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAC3D,IAAI,SAAS,GAAG,EAAE;YAAE,OAAO;QAC3B,MAAM,WAAW,GAAG,iBAAiB,CAAC,MAAM,CAAC,CAAC;QAC9C,MAAM,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,GAAG,OAAO,UAAU,KAAK,EAAE,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC,CAAC;QAC/F,KAAK,EAAE,CAAC;QACR,MAAM,GAAG,EAAE,CAAC;IACb,CAAC,CAAC;IAEF,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAE5B,yEAAyE;QACzE,IAAI,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/B,MAAM,GAAG,CAAC,MAAM,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,SAAS;QACV,CAAC;QACD,IAAI,MAAM,EAAE,CAAC;YACZ,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,SAAS;QACV,CAAC;QAED,yEAAyE;QACzE,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;QAE3C,IAAI,UAAU,EAAE,CAAC;YAChB,IAAI,CAAC,OAAO,EAAE,CAAC;gBACd,6DAA6D;gBAC7D,+BAA+B;gBAC/B,KAAK,EAAE,CAAC;gBACR,OAAO,GAAG,IAAI,CAAC;YAChB,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,SAAS;QACV,CAAC;QAED,IAAI,OAAO,EAAE,CAAC;YACb,+DAA+D;YAC/D,KAAK,EAAE,CAAC;YACR,OAAO,GAAG,KAAK,CAAC;QACjB,CAAC;QAED,yEAAyE;QACzE,IAAI,CAAC,OAAO,EAAE,CAAC;YACd,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,SAAS;QACV,CAAC;QAED,MAAM,YAAY,GAAG,cAAc,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAClD,IAAI,YAAY,EAAE,CAAC;YAClB,MAAM,YAAY,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;YAC1E,IAAI,YAAY,IAAI,kBAAkB;gBAAE,KAAK,EAAE,CAAC;YAChD,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;aAAM,CAAC;YACP,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,MAAM,YAAY,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;YAC1E,IAAI,YAAY,IAAI,kBAAkB;gBAAE,KAAK,EAAE,CAAC;QACjD,CAAC;IACF,CAAC;IACD,KAAK,EAAE,CAAC;IACR,OAAO,MAAM,CAAC;AACf,CAAC","sourcesContent":["/**\n * Markdown conversion and chunk splitting.\n *\n * Owns the Turndown dependency. spider.ts calls toMarkdown() and chunk();\n * it never imports Turndown directly.\n */\n\nimport TurndownService from \"turndown\";\nimport type { Chunk, ChunkType } from \"./types.js\";\n\n// ---------------------------------------------------------------------------\n// Turndown setup\n// ---------------------------------------------------------------------------\n\n// TurndownService exposes .escape as a mutable internal — not in @types/turndown.\ninterface PatchableTurndown { escape: (s: string) => string }\n\nconst turndown = new TurndownService({ headingStyle: \"atx\", codeBlockStyle: \"fenced\" });\n\n// Disable escape — Turndown escapes markdown-special chars by default,\n// producing backslash noise that is unnatural for agent consumption.\n(turndown as unknown as PatchableTurndown).escape = (s) => s;\n\n// Strip images by default — agents cannot see them and alt-text is noise.\n// Disabled when keepImages: true is passed to toMarkdown().\nturndown.addRule(\"strip-images\", {\n\tfilter: \"img\",\n\treplacement: () => \"\",\n});\n\nconst turndownWithImages = new TurndownService({ headingStyle: \"atx\", codeBlockStyle: \"fenced\" });\n(turndownWithImages as unknown as PatchableTurndown).escape = (s) => s;\n// Default Turndown behaviour already renders <img> as  — no extra rule needed.\n\n// ---------------------------------------------------------------------------\n// Markdown conversion\n// ---------------------------------------------------------------------------\n\nexport interface ToMarkdownOptions {\n\t/**\n\t * When true, <img> tags are rendered as  instead of being stripped.\n\t * Use when captureImages is enabled so image references appear in the markdown.\n\t * Default: false.\n\t */\n\tkeepImages?: boolean;\n}\n\n/** Convert Readability article HTML to clean markdown. */\nexport function toMarkdown(html: string, opts?: ToMarkdownOptions): string {\n\tif (opts?.keepImages) return turndownWithImages.turndown(html);\n\treturn turndown.turndown(html);\n}\n\n// ---------------------------------------------------------------------------\n// Content type detection\n// ---------------------------------------------------------------------------\n\nconst CHUNK_TARGET_WORDS = 150;\n\n/** Detect the dominant content type from a markdown buffer. */\nexport function detectContentType(lines: string[]): ChunkType {\n\tfor (const line of lines) {\n\t\tconst t = line.trim();\n\t\tif (!t) continue;\n\t\tif (t.startsWith(\"```\")) return \"code\";\n\t\tif (t.startsWith(\"|\")) return \"table\";\n\t\tif (/^[-*+] /.test(t) || /^\\d+\\. /.test(t)) return \"list\";\n\t\tif (t.startsWith(\">\")) return \"blockquote\";\n\t\treturn \"text\";\n\t}\n\treturn \"text\";\n}\n\n// ---------------------------------------------------------------------------\n// Chunking\n// ---------------------------------------------------------------------------\n\n/**\n * Split markdown into RAG-ready chunks at heading boundaries.\n *\n * Atomicity guarantees:\n * - Fenced code blocks (``` ... ```) are never split.\n * - Markdown tables (lines starting with |) are always flushed as a single\n * chunk. Prose before the table is flushed first so the table is isolated.\n */\nexport function chunk(markdown: string, baseUrl: string): Chunk[] {\n\tconst chunks: Chunk[] = [];\n\tconst lines = markdown.split(\"\\n\");\n\n\tlet heading = \"\";\n\tlet buffer: string[] = [];\n\tlet index = 0;\n\tlet inCode = false;\n\tlet inTable = false;\n\n\tconst flush = (): void => {\n\t\tconst text = buffer.join(\"\\n\").trim();\n\t\tif (!text) return;\n\t\tconst wordCount = text.split(/\\s+/).filter(Boolean).length;\n\t\tif (wordCount < 10) return;\n\t\tconst contentType = detectContentType(buffer);\n\t\tchunks.push({ id: `${baseUrl}#chunk-${index}`, index, heading, text, wordCount, contentType });\n\t\tindex++;\n\t\tbuffer = [];\n\t};\n\n\tfor (const line of lines) {\n\t\tconst trimmed = line.trim();\n\n\t\t// ── Fenced code block toggle ──────────────────────────────────────────\n\t\tif (trimmed.startsWith(\"```\")) {\n\t\t\tinCode = !inCode;\n\t\t\tbuffer.push(line);\n\t\t\tcontinue;\n\t\t}\n\t\tif (inCode) {\n\t\t\tbuffer.push(line);\n\t\t\tcontinue;\n\t\t}\n\n\t\t// ── Table rows ────────────────────────────────────────────────────────\n\t\tconst isTableRow = trimmed.startsWith(\"|\");\n\n\t\tif (isTableRow) {\n\t\t\tif (!inTable) {\n\t\t\t\t// Table is starting — flush any preceding prose so the table\n\t\t\t\t// gets its own isolated chunk.\n\t\t\t\tflush();\n\t\t\t\tinTable = true;\n\t\t\t}\n\t\t\tbuffer.push(line);\n\t\t\tcontinue;\n\t\t}\n\n\t\tif (inTable) {\n\t\t\t// Table just ended — flush it before processing the next line.\n\t\t\tflush();\n\t\t\tinTable = false;\n\t\t}\n\n\t\t// ── Normal prose / headings ───────────────────────────────────────────\n\t\tif (!trimmed) {\n\t\t\tbuffer.push(line);\n\t\t\tcontinue;\n\t\t}\n\n\t\tconst headingMatch = /^#{1,3} (.+)/.exec(trimmed);\n\t\tif (headingMatch) {\n\t\t\tconst currentWords = buffer.join(\" \").split(/\\s+/).filter(Boolean).length;\n\t\t\tif (currentWords >= CHUNK_TARGET_WORDS) flush();\n\t\t\theading = headingMatch[1];\n\t\t\tbuffer.push(line);\n\t\t} else {\n\t\t\tbuffer.push(line);\n\t\t\tconst currentWords = buffer.join(\" \").split(/\\s+/).filter(Boolean).length;\n\t\t\tif (currentWords >= CHUNK_TARGET_WORDS) flush();\n\t\t}\n\t}\n\tflush();\n\treturn chunks;\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawl.js","sourceRoot":"","sources":["../src/crawl.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAEvC,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhD,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AA4C/C;;;;;;;;;;GAUG;AACH,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,QAAgB,EAAE,OAAqB,EAAE;IACpE,MAAM,EACL,QAAQ,GAAG,CAAC,EACZ,QAAQ,GAAG,EAAE,EACb,cAAc,GAAG,IAAI,EACrB,WAAW,GAAG,CAAC,EACf,OAAO,GAAG,GAAG,EACb,KAAK,GAAG,IAAI,WAAW,EAAkC,EACzD,KAAK,GAAG,IAAI,SAAS,EAAE,EACvB,MAAM,EACN,SAAS,EACT,aAAa,GAAG,IAAI,EACpB,UAAU,GAAG,IAAI,EACjB,GAAG,UAAU,EACb,GAAG,IAAI,CAAC;IAET,MAAM,QAAQ,GAAG,UAAU,CAAC,QAAQ,IAAI,IAAI,cAAc,CAAC,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,CAAC;IACpF,MAAM,WAAW,GAAG,UAAU,CAAC,WAAW,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;IAClH,MAAM,UAAU,GAAG,UAAU,CAAC,UAAU,CAAC;IAEzC,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,QAAQ,CAAC;IAC/C,MAAM,KAAK,GAAG,IAAI,GAAG,EAAwB,CAAC;IAC9C,MAAM,MAAM,GAAG,IAAI,GAAG,EAAiB,CAAC;IACxC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,MAAM,WAAW,GAAG,CAAC,GAAW,EAAW,EAAE;QAC5C,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC;QAChC,IAAI,KAAK,CAAC,IAAI,GAAG,MAAM,CAAC,IAAI,IAAI,QAAQ;YAAE,OAAO,KAAK,CAAC;QACvD,IAAI,CAAC;YACJ,MAAM,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YACvB,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC;gBAAE,OAAO,KAAK,CAAC;YAC5D,IAAI,cAAc,IAAI,CAAC,CAAC,QAAQ,KAAK,WAAW;gBAAE,OAAO,KAAK,CAAC;QAChE,CAAC;QAAC,MAAM,CAAC;YACR,OAAO,KAAK,CAAC;QACd,CAAC;QACD,IAAI,SAAS,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC;QAC/C,OAAO,IAAI,CAAC;IACb,CAAC,CAAC;IAEF,4EAA4E;IAC5E,MAAM,UAAU,GAAG,KAAK,EAAE,IAAc,EAAE,KAAa,EAAiB,EAAE;QACzE,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;YACnC,MAAM,OAAO,GAAG,GAAS,EAAE;gBAC1B,OAAO,QAAQ,GAAG,WAAW,IAAI,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;oBACtD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC1B,QAAQ,EAAE,CAAC;oBAEX,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC;wBAC5B,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC;wBAClC,CAAC,CAAC,MAAM,CAAC,GAAG,EAAE,EAAE,GAAG,UAAU,EAAE,QAAQ,EAAE,WAAW,EAAE,CAAC,CAAC;oBAEzD,MAAM;yBACJ,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE;wBACd,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;wBACrB,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;wBACrB,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;wBACpB,MAAM,EAAE,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;oBACvB,CAAC,CAAC;yBACD,KAAK,CAAC,CAAC,GAAY,EAAE,EAAE;wBACvB,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;oBACtE,CAAC,CAAC;yBACD,OAAO,CAAC,GAAG,EAAE;wBACb,SAAS,EAAE,CAAC;wBACZ,QAAQ,EAAE,CAAC;wBACX,IAAI,SAAS,KAAK,IAAI,CAAC,MAAM;4BAAE,OAAO,EAAE,CAAC;;4BACpC,OAAO,EAAE,CAAC;oBAChB,CAAC,CAAC,CAAC;gBACL,CAAC;YACF,CAAC,CAAC;YACF,OAAO,EAAE,CAAC;QACX,CAAC,CAAC,CAAC;IACJ,CAAC,CAAC;IAEF,IAAI,QAAQ,GAAG,CAAC,QAAQ,CAAC,CAAC;IAC1B,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IAEnB,IAAI,UAAU,EAAE,CAAC;QAChB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC;QACxC,wDAAwD;QACxD,MAAM,MAAM,GAAG,UAAU,IAAI;YAC5B,KAAK,CAAC,KAAK,CAAC,GAAsD;gBACjE,OAAO,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;YAC5D,CAAC;SACD,CAAC;QACF,MAAM,WAAW,GAAG,MAAM,gBAAgB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3D,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;YAC7B,IAAI,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC;gBACpB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;gBACZ,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;QACF,CAAC;IACF,CAAC;IAED,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,QAAQ,EAAE,KAAK,EAAE,EAAE,CAAC;QAChD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,MAAM;QACjC,IAAI,KAAK,CAAC,IAAI,GAAG,MAAM,CAAC,IAAI,IAAI,QAAQ;YAAE,MAAM;QAEhD,MAAM,SAAS,GAAG,QAAQ,GAAG,KAAK,CAAC,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC;QACtD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAE3C,MAAM,UAAU,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAE/B,IAAI,KAAK,KAAK,QAAQ;YAAE,MAAM;QAE9B,MAAM,YAAY,GAAa,EAAE,CAAC;QAClC,KAAK,MAAM,GAAG,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC5B,IAAI,CAAC,IAAI;gBAAE,SAAS;YACpB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC/B,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC5B,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACpB,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAC9B,CAAC;YACF,CAAC;QACF,CAAC;QACD,QAAQ,GAAG,YAAY,CAAC;IACzB,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;AACjC,CAAC","sourcesContent":["import { SpiderCache } from \"./cache.js\";\nimport { PageGraph } from \"./graph.js\";\nimport type { ICache } from \"./ports.js\";\nimport { RobotsCache } from \"./robots.js\";\nimport { fetchSitemapUrls } from \"./sitemap.js\";\nimport type { SpiderOptions } from \"./spider.js\";\nimport { spider } from \"./spider.js\";\nimport { DomainThrottle } from \"./throttle.js\";\nimport type { SpideredPage } from \"./types.js\";\n\nexport interface CrawlOptions extends SpiderOptions {\n\t/** How many link hops from the start URL (default 2) */\n\tmaxDepth?: number;\n\t/** Hard cap on total pages spidered (default 50) */\n\tmaxPages?: number;\n\t/** Only follow links on the same domain as the start URL (default true) */\n\tsameDomainOnly?: boolean;\n\t/** Max concurrent fetches (default 3) */\n\tconcurrency?: number;\n\t/**\n\t * Minimum delay between requests to the same domain (ms).\n\t * When a throttle is provided this sets its minDelayMs.\n\t * Default 500.\n\t */\n\tdelayMs?: number;\n\t/** Bring your own cache — already-spidered URLs are skipped */\n\tcache?: ICache<string, SpideredPage>;\n\t/** Bring your own graph — nodes/edges added as pages are spidered */\n\tgraph?: PageGraph;\n\t/** Called with each successfully spidered page */\n\tonPage?: (page: SpideredPage, depth: number) => void;\n\t/** Return false to skip a URL before fetching it */\n\turlFilter?: (url: string) => boolean;\n\t/**\n\t * Whether to check and respect robots.txt for each domain (default true).\n\t * Automatically creates a RobotsCache if not provided via SpiderOptions.\n\t */\n\trespectRobots?: boolean;\n\t/**\n\t * Attempt to fetch /sitemap.xml before BFS to seed the frontier with\n\t * all known URLs. Falls back to normal BFS on any error (default true).\n\t */\n\tuseSitemap?: boolean;\n}\n\nexport interface CrawlResult {\n\tpages: Map<string, SpideredPage>;\n\tgraph: PageGraph;\n\terrors: Map<string, Error>;\n}\n\n/**\n * Recursive BFS crawler.\n *\n * Starts at `startUrl`, spiders it, extracts links, filters them, then\n * recurses up to `maxDepth` hops. Respects `maxPages`, `sameDomainOnly`,\n * and `urlFilter`. Populates the provided (or freshly created) cache and\n * graph as it goes.\n *\n * Concurrency is bounded per depth level — we fully finish each level\n * before proceeding, giving BFS ordering and predictable memory use.\n */\nexport async function crawl(startUrl: string, opts: CrawlOptions = {}): Promise<CrawlResult> {\n\tconst {\n\t\tmaxDepth = 2,\n\t\tmaxPages = 50,\n\t\tsameDomainOnly = true,\n\t\tconcurrency = 3,\n\t\tdelayMs = 500,\n\t\tcache = new SpiderCache() as ICache<string, SpideredPage>,\n\t\tgraph = new PageGraph(),\n\t\tonPage,\n\t\turlFilter,\n\t\trespectRobots = true,\n\t\tuseSitemap = true,\n\t\t...spiderOpts\n\t} = opts;\n\n\tconst throttle = spiderOpts.throttle ?? new DomainThrottle({ minDelayMs: delayMs });\n\tconst robotsCache = spiderOpts.robotsCache ?? (respectRobots ? new RobotsCache(spiderOpts.userAgent) : undefined);\n\tconst httpClient = spiderOpts.httpClient;\n\n\tconst startDomain = new URL(startUrl).hostname;\n\tconst pages = new Map<string, SpideredPage>();\n\tconst errors = new Map<string, Error>();\n\tconst seen = new Set<string>();\n\n\tconst shouldVisit = (url: string): boolean => {\n\t\tif (seen.has(url)) return false;\n\t\tif (pages.size + errors.size >= maxPages) return false;\n\t\ttry {\n\t\t\tconst u = new URL(url);\n\t\t\tif (![\"http:\", \"https:\"].includes(u.protocol)) return false;\n\t\t\tif (sameDomainOnly && u.hostname !== startDomain) return false;\n\t\t} catch {\n\t\t\treturn false;\n\t\t}\n\t\tif (urlFilter && !urlFilter(url)) return false;\n\t\treturn true;\n\t};\n\n\t// Throttle and robots.txt are handled inside spider() via shared instances.\n\tconst fetchBatch = async (urls: string[], depth: number): Promise<void> => {\n\t\tlet index = 0;\n\t\tlet inFlight = 0;\n\t\tlet completed = 0;\n\n\t\tawait new Promise<void>((resolve) => {\n\t\t\tconst tryNext = (): void => {\n\t\t\t\twhile (inFlight < concurrency && index < urls.length) {\n\t\t\t\t\tconst url = urls[index++];\n\t\t\t\t\tinFlight++;\n\n\t\t\t\t\tconst fetch_ = cache.has(url)\n\t\t\t\t\t\t? Promise.resolve(cache.get(url)!)\n\t\t\t\t\t\t: spider(url, { ...spiderOpts, throttle, robotsCache });\n\n\t\t\t\t\tfetch_\n\t\t\t\t\t\t.then((page) => {\n\t\t\t\t\t\t\tpages.set(url, page);\n\t\t\t\t\t\t\tcache.set(url, page);\n\t\t\t\t\t\t\tgraph.addPage(page);\n\t\t\t\t\t\t\tonPage?.(page, depth);\n\t\t\t\t\t\t})\n\t\t\t\t\t\t.catch((err: unknown) => {\n\t\t\t\t\t\t\terrors.set(url, err instanceof Error ? err : new Error(String(err)));\n\t\t\t\t\t\t})\n\t\t\t\t\t\t.finally(() => {\n\t\t\t\t\t\t\tcompleted++;\n\t\t\t\t\t\t\tinFlight--;\n\t\t\t\t\t\t\tif (completed === urls.length) resolve();\n\t\t\t\t\t\t\telse tryNext();\n\t\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t};\n\t\t\ttryNext();\n\t\t});\n\t};\n\n\tlet frontier = [startUrl];\n\tseen.add(startUrl);\n\n\tif (useSitemap) {\n\t\tconst origin = new URL(startUrl).origin;\n\t\t// Use a minimal default httpClient if none was injected\n\t\tconst client = httpClient ?? {\n\t\t\tasync fetch(req: { url: string; headers?: Record<string, string> }) {\n\t\t\t\treturn globalThis.fetch(req.url, { headers: req.headers });\n\t\t\t},\n\t\t};\n\t\tconst sitemapUrls = await fetchSitemapUrls(origin, client);\n\t\tfor (const u of sitemapUrls) {\n\t\t\tif (shouldVisit(u)) {\n\t\t\t\tseen.add(u);\n\t\t\t\tfrontier.push(u);\n\t\t\t}\n\t\t}\n\t}\n\n\tfor (let depth = 0; depth <= maxDepth; depth++) {\n\t\tif (frontier.length === 0) break;\n\t\tif (pages.size + errors.size >= maxPages) break;\n\n\t\tconst remaining = maxPages - pages.size - errors.size;\n\t\tconst batch = frontier.slice(0, remaining);\n\n\t\tawait fetchBatch(batch, depth);\n\n\t\tif (depth === maxDepth) break;\n\n\t\tconst nextFrontier: string[] = [];\n\t\tfor (const url of batch) {\n\t\t\tconst page = pages.get(url);\n\t\t\tif (!page) continue;\n\t\t\tfor (const link of page.links) {\n\t\t\t\tif (shouldVisit(link.href)) {\n\t\t\t\t\tseen.add(link.href);\n\t\t\t\t\tnextFrontier.push(link.href);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tfrontier = nextFrontier;\n\t}\n\n\treturn { pages, graph, errors };\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"disk-cache.js","sourceRoot":"","sources":["../src/disk-cache.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAInD,8DAA8D;AAC9D,MAAM,cAAc,GAAG,CAAC,CAAC;AA6BzB,MAAM,OAAO,SAAS;IAUrB,YAAY,IAAY,EAAE,OAAyB,EAAE;QATpC,UAAK,GAAsC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAU/E,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;QACjB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;QAC1C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC;QACnC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC;QACxC,IAAI,CAAC,oBAAoB,GAAG,IAAI,CAAC,oBAAoB,IAAI,EAAE,GAAG,IAAI,CAAC;QACnE,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,CAAC,CAAC;QAC/C,IAAI,CAAC,IAAI,EAAE,CAAC;IACb,CAAC;IAEO,GAAG,CAAC,GAAW;QACtB,IAAI,CAAC;YACJ,MAAM,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YACvB,CAAC,CAAC,IAAI,GAAG,EAAE,CAAC;YACZ,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACR,OAAO,GAAG,CAAC;QACZ,CAAC;IACF,CAAC;IAED,GAAG,CAAC,GAAW,EAAE,IAAkB;QAClC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACxB,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1E,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1C,IAAI,MAAM,KAAK,SAAS;gBAAE,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACrD,CAAC;QACD,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;QAC7D,IAAI,IAAI,CAAC,SAAS;YAAE,IAAI,CAAC,KAAK,EAAE,CAAC;IAClC,CAAC;IAED,GAAG,CAAC,GAAW;QACd,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,SAAS,CAAC;IACpC,CAAC;IAED,MAAM,CAAC,GAAW;QACjB,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;QACjC,IAAI,IAAI,CAAC,SAAS;YAAE,IAAI,CAAC,KAAK,EAAE,CAAC;IAClC,CAAC;IAED,8EAA8E;IAC9E,gBAAgB;IAChB,8EAA8E;IAE9E,qEAAqE;IAC7D,aAAa,CAAC,GAAW;QAChC,MAAM,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAC1D,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC;QACjD,OAAO,GAAG,IAAI,GAAG,GAAG,EAAE,CAAC;IACxB,CAAC;IAED;;;;;OAKG;IACK,KAAK,CAAC,MAAkB;QAC/B,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;YACjC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAChD,CAAC;QACD,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;YACzB,IAAI,CAAC,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,MAAM,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;gBACnE,OAAO,GAAG,CAAC;YACZ,CAAC;YACD,MAAM,QAAQ,GAAG,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC7C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;YAChD,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC;YAC3D,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,IAAI,EAAE,GAAG,GAAG,CAAC;YACvC,OAAO,EAAE,GAAG,IAAI,EAAE,QAAQ,EAAE,CAAC;QAC9B,CAAC,CAAC,CAAC;IACJ,CAAC;IAED;;;OAGG;IACK,OAAO,CAAC,MAAkB;QACjC,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;YACzB,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ;gBAAE,OAAO,GAAG,CAAC;YAC5C,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC;gBAAE,OAAO,GAAG,CAAC;YAC1C,IAAI,CAAC;gBACJ,MAAM,MAAM,GAAG,YAAY,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;gBAC7D,OAAO,EAAE,GAAG,GAAG,EAAE,MAAM,EAAE,CAAC;YAC3B,CAAC;YAAC,MAAM,CAAC;gBACR,OAAO,GAAG,CAAC;YACZ,CAAC;QACF,CAAC,CAAC,CAAC;IACJ,CAAC;IAED,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E,6EAA6E;IAC7E,KAAK;QACJ,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,MAAM,OAAO,GAA0B,EAAE,CAAC;QAC1C,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YACjD,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,SAAS,IAAI,GAAG;gBAAE,SAAS;YACvC,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;YACpB,MAAM,UAAU,GAAiB,IAAI,CAAC,MAAM;gBAC3C,CAAC,CAAC,EAAE,GAAG,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE;gBAC9C,CAAC,CAAC,IAAI,CAAC;YACR,OAAO,CAAC,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC;QAC3D,CAAC;QACD,MAAM,OAAO,GAAgB,EAAE,CAAC,EAAE,cAAc,EAAE,OAAO,EAAE,CAAC;QAC5D,aAAa,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC,CAAC;IAC3D,CAAC;IAEO,IAAI;QACX,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,OAAO;QACnC,IAAI,CAAC;YACJ,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,CAAY,CAAC;YAEnE,gEAAgE;YAChE,uDAAuD;YACvD,IACC,OAAO,GAAG,KAAK,QAAQ;gBACvB,GAAG,KAAK,IAAI;gBACX,GAAuB,CAAC,CAAC,KAAK,cAAc,EAC5C,CAAC;gBACF,OAAO,CAAC,2CAA2C;YACpD,CAAC;YAED,MAAM,OAAO,GAAG,GAAkB,CAAC;YACnC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACvB,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;gBACtD,IAAI,CAAC,CAAC,SAAS,GAAG,GAAG;oBAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YAC1C,CAAC;QACF,CAAC;QAAC,MAAM,CAAC;YACR,4CAA4C;QAC7C,CAAC;IACF,CAAC;IAED,oEAAoE;IACpE,MAAM;QACL,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,OAAO,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC;aAC9B,MAAM,CAAC,CAAC,CAAC,EAAc,EAAE,CAAC,CAAC,KAAK,SAAS,IAAI,CAAC,CAAC,SAAS,GAAG,GAAG,CAAC;aAC/D,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC;aACzC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;YACV,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;YACpB,OAAO,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,GAAG,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAC5E,CAAC,CAAC,CAAC;IACL,CAAC;IAED,mEAAmE;IACnE,GAAG,CAAC,GAAW;QACd,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACxB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,CAAC,KAAK;YAAE,OAAO,SAAS,CAAC;QAC7B,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;YAClC,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YACrB,OAAO,SAAS,CAAC;QAClB,CAAC;QACD,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC;QACxB,IAAI,IAAI,CAAC,MAAM;YAAE,OAAO,EAAE,GAAG,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;QACvE,OAAO,IAAI,CAAC;IACb,CAAC;CACD","sourcesContent":["/**\n * Disk-backed cache implementing ICache<string, SpideredPage>.\n *\n * Persists to a JSON file so the cache survives extension reloads and\n * pi restarts. Call flush() to write — set() auto-flushes by default.\n *\n * The images directory is derived automatically from `dirname(path)/images`.\n * Callers do not need to create it — DiskCache creates it on first large-image\n * flush. Pre-creating it at startup (e.g. in the extension boot path) is\n * harmless and avoids a first-write delay.\n *\n * Internal storage uses a plain object (Object.create(null)) rather than a\n * Map. Plain objects carry no realm-specific internal slots, making them safe\n * across V8 context (realm) boundaries — e.g. when DiskCache is constructed\n * in an ESM module realm but called from a jiti VM-sandbox realm (Bun binary\n * mode). The Map-backed version threw \"Map operation called on non-Map object\"\n * in that scenario.\n *\n * A schema version field in the persisted JSON guards against stale cache\n * files from previous major versions being silently loaded with wrong shapes.\n */\n\nimport { createHash } from \"node:crypto\";\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from \"node:fs\";\nimport { dirname, extname, join } from \"node:path\";\nimport type { ICache } from \"./ports.js\";\nimport type { ImageRef, SpideredPage } from \"./types.js\";\n\n/** Bump when the on-disk entry shape changes incompatibly. */\nconst SCHEMA_VERSION = 2;\n\nexport interface DiskCacheOptions {\n\t/** Time-to-live in ms. Default 30 min. */\n\tttlMs?: number;\n\t/** Max entries. Default 500. */\n\tmaxSize?: number;\n\t/** Auto-flush to disk on every set(). Default true. */\n\tautoFlush?: boolean;\n\t/**\n\t * Base64 byte threshold for inline vs. file storage of images.\n\t * Images whose base64 string length exceeds this are written as binary\n\t * files to <cache-dir>/images/ instead of being stored inline in the JSON.\n\t * Default: 32 * 1024 (32 KB of base64 ≈ 24 KB binary).\n\t */\n\tinlineImageThreshold?: number;\n}\n\ninterface Entry {\n\tpage: SpideredPage;\n\texpiresAt: number;\n}\n\n/** Versioned wrapper written to disk. */\ninterface DiskPayload {\n\tv: number;\n\tentries: Record<string, Entry>;\n}\n\nexport class DiskCache implements ICache<string, SpideredPage> {\n\tprivate readonly store: Record<string, Entry | undefined> = Object.create(null);\n\tprivate readonly path: string;\n\tprivate readonly ttlMs: number;\n\tprivate readonly maxSize: number;\n\tprivate readonly autoFlush: boolean;\n\tprivate readonly inlineImageThreshold: number;\n\t/** Directory where large image binaries are stored. */\n\tprivate readonly imagesDir: string;\n\n\tconstructor(path: string, opts: DiskCacheOptions = {}) {\n\t\tthis.path = path;\n\t\tthis.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;\n\t\tthis.maxSize = opts.maxSize ?? 500;\n\t\tthis.autoFlush = opts.autoFlush ?? true;\n\t\tthis.inlineImageThreshold = opts.inlineImageThreshold ?? 32 * 1024;\n\t\tthis.imagesDir = join(dirname(path), \"images\");\n\t\tthis.load();\n\t}\n\n\tprivate key(url: string): string {\n\t\ttry {\n\t\t\tconst u = new URL(url);\n\t\t\tu.hash = \"\";\n\t\t\treturn u.toString().replace(/\\/$/, \"\");\n\t\t} catch {\n\t\t\treturn url;\n\t\t}\n\t}\n\n\tset(url: string, page: SpideredPage): void {\n\t\tconst k = this.key(url);\n\t\tif (Object.keys(this.store).length >= this.maxSize && !(k in this.store)) {\n\t\t\tconst oldest = Object.keys(this.store)[0];\n\t\t\tif (oldest !== undefined) delete this.store[oldest];\n\t\t}\n\t\tthis.store[k] = { page, expiresAt: Date.now() + this.ttlMs };\n\t\tif (this.autoFlush) this.flush();\n\t}\n\n\thas(url: string): boolean {\n\t\treturn this.get(url) !== undefined;\n\t}\n\n\tdelete(url: string): void {\n\t\tdelete this.store[this.key(url)];\n\t\tif (this.autoFlush) this.flush();\n\t}\n\n\t// ---------------------------------------------------------------------------\n\t// Image helpers\n\t// ---------------------------------------------------------------------------\n\n\t/** Derive a stable filename for an image binary from its src URL. */\n\tprivate imageFilename(src: string): string {\n\t\tconst hash = createHash(\"sha1\").update(src).digest(\"hex\");\n\t\tconst ext = extname(src.split(\"?\")[0]) || \".bin\";\n\t\treturn `${hash}${ext}`;\n\t}\n\n\t/**\n\t * Prepare images for serialisation:\n\t * - Images whose base64 length ≤ threshold are kept inline.\n\t * - Larger images are written to imagesDir as binary files; base64 is\n\t * replaced by filePath in the serialised entry.\n\t */\n\tprivate spill(images: ImageRef[]): ImageRef[] {\n\t\tif (!existsSync(this.imagesDir)) {\n\t\t\tmkdirSync(this.imagesDir, { recursive: true });\n\t\t}\n\t\treturn images.map((img) => {\n\t\t\tif (!img.base64 || img.base64.length <= this.inlineImageThreshold) {\n\t\t\t\treturn img;\n\t\t\t}\n\t\t\tconst filename = this.imageFilename(img.src);\n\t\t\tconst filePath = join(this.imagesDir, filename);\n\t\t\twriteFileSync(filePath, Buffer.from(img.base64, \"base64\"));\n\t\t\tconst { base64: _omit, ...rest } = img;\n\t\t\treturn { ...rest, filePath };\n\t\t});\n\t}\n\n\t/**\n\t * Hydrate images on read: if an image has filePath but no base64,\n\t * load the binary from disk and re-encode.\n\t */\n\tprivate hydrate(images: ImageRef[]): ImageRef[] {\n\t\treturn images.map((img) => {\n\t\t\tif (img.base64 || !img.filePath) return img;\n\t\t\tif (!existsSync(img.filePath)) return img;\n\t\t\ttry {\n\t\t\t\tconst base64 = readFileSync(img.filePath).toString(\"base64\");\n\t\t\t\treturn { ...img, base64 };\n\t\t\t} catch {\n\t\t\t\treturn img;\n\t\t\t}\n\t\t});\n\t}\n\n\t// ---------------------------------------------------------------------------\n\t// Persistence\n\t// ---------------------------------------------------------------------------\n\n\t/** Write current contents to disk. Large images are spilled to imagesDir. */\n\tflush(): void {\n\t\tconst now = Date.now();\n\t\tconst entries: Record<string, Entry> = {};\n\t\tfor (const [k, v] of Object.entries(this.store)) {\n\t\t\tif (!v || v.expiresAt <= now) continue;\n\t\t\tconst page = v.page;\n\t\t\tconst serialised: SpideredPage = page.images\n\t\t\t\t? { ...page, images: this.spill(page.images) }\n\t\t\t\t: page;\n\t\t\tentries[k] = { page: serialised, expiresAt: v.expiresAt };\n\t\t}\n\t\tconst payload: DiskPayload = { v: SCHEMA_VERSION, entries };\n\t\twriteFileSync(this.path, JSON.stringify(payload), \"utf8\");\n\t}\n\n\tprivate load(): void {\n\t\tif (!existsSync(this.path)) return;\n\t\ttry {\n\t\t\tconst raw = JSON.parse(readFileSync(this.path, \"utf8\")) as unknown;\n\n\t\t\t// Reject files from incompatible schema versions (including old\n\t\t\t// unversioned files that lack the \"v\" field entirely).\n\t\t\tif (\n\t\t\t\ttypeof raw !== \"object\" ||\n\t\t\t\traw === null ||\n\t\t\t\t(raw as { v?: unknown }).v !== SCHEMA_VERSION\n\t\t\t) {\n\t\t\t\treturn; // stale schema — start fresh, do not throw\n\t\t\t}\n\n\t\t\tconst payload = raw as DiskPayload;\n\t\t\tconst now = Date.now();\n\t\t\tfor (const [k, v] of Object.entries(payload.entries)) {\n\t\t\t\tif (v.expiresAt > now) this.store[k] = v;\n\t\t\t}\n\t\t} catch {\n\t\t\t// Corrupt or unreadable file — start fresh.\n\t\t}\n\t}\n\n\t/** All currently valid (non-expired) pages, sorted newest-first. */\n\tvalues(): SpideredPage[] {\n\t\tconst now = Date.now();\n\t\treturn Object.values(this.store)\n\t\t\t.filter((e): e is Entry => e !== undefined && e.expiresAt > now)\n\t\t\t.sort((a, b) => b.expiresAt - a.expiresAt)\n\t\t\t.map((e) => {\n\t\t\t\tconst page = e.page;\n\t\t\t\treturn page.images ? { ...page, images: this.hydrate(page.images) } : page;\n\t\t\t});\n\t}\n\n\t/** Retrieve a page, hydrating any file-backed images from disk. */\n\tget(url: string): SpideredPage | undefined {\n\t\tconst k = this.key(url);\n\t\tconst entry = this.store[k];\n\t\tif (!entry) return undefined;\n\t\tif (Date.now() > entry.expiresAt) {\n\t\t\tdelete this.store[k];\n\t\t\treturn undefined;\n\t\t}\n\t\tconst page = entry.page;\n\t\tif (page.images) return { ...page, images: this.hydrate(page.images) };\n\t\treturn page;\n\t}\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"graph.js","sourceRoot":"","sources":["../src/graph.ts"],"names":[],"mappings":"AA4BA;;;;;;;;;;;;;GAaG;AACH,MAAM,OAAO,SAAS;IAAtB;QACkB,UAAK,GAAyC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QACnF,2BAA2B;QACV,QAAG,GAA2C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QACnF,gCAAgC;QACf,QAAG,GAAyC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IA6IlF,CAAC;IA3IA,iDAAiD;IACjD,OAAO,CAAC,IAAkB;QACzB,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG;YACtB,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM;SAC9B,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI,CAAC,IAAI;gBAAE,SAAS;YACzB,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAC/D,CAAC;IACF,CAAC;IAED,2EAA2E;IAC3E,OAAO,CAAC,IAAY,EAAE,EAAU,EAAE,IAAY,EAAE,UAAmB;QAClE,MAAM,IAAI,GAAa,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;QACtD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;YACxC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,QAAQ,EAAE,IAAI,CAAC,CAAC;QACtC,CAAC;QACD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC;QACnC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,OAAO,EAAE,IAAI,CAAC,CAAC;QACnC,CAAC;IACF,CAAC;IAED,IAAI,CAAC,GAAW;QACf,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IACxB,CAAC;IAED,kCAAkC;IAClC,QAAQ,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;IAC5B,CAAC;IAED,mCAAmC;IACnC,OAAO,CAAC,GAAW;QAClB,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;IAC5B,CAAC;IAED,+DAA+D;IAC/D,KAAK;QACJ,OAAO,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC;aAC9B,MAAM,CAAC,CAAC,CAAC,EAAiB,EAAE,CAAC,CAAC,KAAK,SAAS,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC;IACzF,CAAC;IAED,4DAA4D;IAC5D,KAAK;QACJ,OAAO,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC;aAC9B,MAAM,CAAC,CAAC,CAAC,EAAiB,EAAE;YAC5B,IAAI,CAAC,CAAC;gBAAE,OAAO,KAAK,CAAC;YACrB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;YACpC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;IACL,CAAC;IAED,4EAA4E;IAC5E,QAAQ,CAAC,IAAY,EAAE,EAAU;QAChC,IAAI,IAAI,KAAK,EAAE;YAAE,OAAO,CAAC,IAAI,CAAC,CAAC;QAC/B,MAAM,OAAO,GAAG,IAAI,GAAG,CAAS,CAAC,IAAI,CAAC,CAAC,CAAC;QACxC,MAAM,KAAK,GAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAExC,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;YAC5B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACtC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;gBAC5C,IAAI,IAAI,CAAC,EAAE,KAAK,EAAE;oBAAE,OAAO,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,CAAC;gBACzC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,EAAE,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;oBACpD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACrB,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;gBAChC,CAAC;YACF,CAAC;QACF,CAAC;QACD,OAAO,IAAI,CAAC;IACb,CAAC;IAED;;;OAGG;IACH,aAAa,CAAC,QAAgB;QAC7B,MAAM,OAAO,GAAG,IAAI,GAAG,CAAS,CAAC,QAAQ,CAAC,CAAC,CAAC;QAC5C,MAAM,KAAK,GAAG,CAAC,QAAQ,CAAC,CAAC;QACzB,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,GAAG,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;YAC3B,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC;gBACxC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,EAAE,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;oBACpD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACrB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACrB,CAAC;YACF,CAAC;QACF,CAAC;QACD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QACzB,OAAO,CAAC,GAAG,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAiB,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC;IAC7F,CAAC;IAED,0DAA0D;IAC1D,UAAU;QACT,OAAO,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC;aAC9B,MAAM,CAAC,CAAC,CAAC,EAAiB,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC;aAC7C,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;aACvE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY,CAAC,CAAC;IACnD,CAAC;IAED,IAAI,SAAS;QACZ,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;IACvC,CAAC;IAED,IAAI,SAAS;QACZ,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YAC7C,IAAI,KAAK;gBAAE,KAAK,IAAI,KAAK,CAAC,MAAM,CAAC;QAClC,CAAC;QACD,OAAO,KAAK,CAAC;IACd,CAAC;IAED,wDAAwD;IACxD,MAAM;QACL,MAAM,KAAK,GAAe,EAAE,CAAC;QAC7B,KAAK,MAAM,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YAChD,IAAI,QAAQ;gBAAE,KAAK,CAAC,IAAI,CAAC,GAAG,QAAQ,CAAC,CAAC;QACvC,CAAC;QACD,OAAO;YACN,KAAK,EAAE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAiB,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC;YAC9E,KAAK;SACL,CAAC;IACH,CAAC;IAED,MAAM,CAAC,QAAQ,CAAC,IAAuB;QACtC,MAAM,CAAC,GAAG,IAAI,SAAS,EAAE,CAAC;QAC1B,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,KAAK;YAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC/C,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,KAAK;YAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,UAAU,CAAC,CAAC;QAC1E,OAAO,CAAC,CAAC;IACV,CAAC;CACD","sourcesContent":["import type { SpideredPage } from \"./types.js\";\n\n/** A node in the knowledge graph — lightweight reference, not the full page. */\nexport interface PageNode {\n\turl: string;\n\tdomain: string;\n\ttitle: string;\n\tdescription: string;\n\twordCount: number;\n\tfetchedAt: string;\n\tchunkCount: number;\n}\n\n/** A directed edge between two pages. */\nexport interface PageEdge {\n\tfrom: string;\n\tto: string;\n\t/** Anchor text of the link */\n\ttext: string;\n\tisExternal: boolean;\n}\n\n/** Serialisable snapshot for storage or embedding. */\nexport interface PageGraphSnapshot {\n\tnodes: PageNode[];\n\tedges: PageEdge[];\n}\n\n/**\n * Directed knowledge graph of spidered pages.\n *\n * Nodes are pages. Edges are outbound links.\n * Maintains a reverse index (inbound links) for O(1) lookup.\n *\n * All graph queries return plain data — no PageNode references —\n * so the graph is trivially serialisable.\n *\n * Internal storage uses plain objects (Object.create(null)) rather than\n * Maps. Plain objects carry no realm-specific internal slots, making them\n * safe across V8 context (realm) boundaries — e.g. when the graph is\n * constructed in an ESM module realm but called from a jiti VM-sandbox.\n */\nexport class PageGraph {\n\tprivate readonly nodes: Record<string, PageNode | undefined> = Object.create(null);\n\t/** url → outbound edges */\n\tprivate readonly out: Record<string, PageEdge[] | undefined> = Object.create(null);\n\t/** url → inbound source urls */\n\tprivate readonly in_: Record<string, string[] | undefined> = Object.create(null);\n\n\t/** Add or update a node from a spidered page. */\n\taddPage(page: SpideredPage): void {\n\t\tthis.nodes[page.url] = {\n\t\t\turl: page.url,\n\t\t\tdomain: page.domain,\n\t\t\ttitle: page.title,\n\t\t\tdescription: page.description,\n\t\t\twordCount: page.wordCount,\n\t\t\tfetchedAt: page.fetchedAt,\n\t\t\tchunkCount: page.chunks.length,\n\t\t};\n\n\t\tfor (const link of page.links) {\n\t\t\tif (!link.href) continue;\n\t\t\tthis.addEdge(page.url, link.href, link.text, link.isExternal);\n\t\t}\n\t}\n\n\t/** Add a directed edge without requiring the target to be spidered yet. */\n\taddEdge(from: string, to: string, text: string, isExternal: boolean): void {\n\t\tconst edge: PageEdge = { from, to, text, isExternal };\n\t\tconst existing = this.out[from] ?? [];\n\t\tif (!existing.some((e) => e.to === to)) {\n\t\t\tthis.out[from] = [...existing, edge];\n\t\t}\n\t\tconst inbound = this.in_[to] ?? [];\n\t\tif (!inbound.includes(from)) {\n\t\t\tthis.in_[to] = [...inbound, from];\n\t\t}\n\t}\n\n\tnode(url: string): PageNode | undefined {\n\t\treturn this.nodes[url];\n\t}\n\n\t/** Outbound edges from a node. */\n\toutbound(url: string): PageEdge[] {\n\t\treturn this.out[url] ?? [];\n\t}\n\n\t/** URLs that link TO this page. */\n\tinbound(url: string): string[] {\n\t\treturn this.in_[url] ?? [];\n\t}\n\n\t/** Pages with no inbound links — entry points to the graph. */\n\troots(): PageNode[] {\n\t\treturn Object.values(this.nodes)\n\t\t\t.filter((n): n is PageNode => n !== undefined && (this.in_[n.url] ?? []).length === 0);\n\t}\n\n\t/** Pages with no outbound links to other spidered nodes. */\n\tsinks(): PageNode[] {\n\t\treturn Object.values(this.nodes)\n\t\t\t.filter((n): n is PageNode => {\n\t\t\t\tif (!n) return false;\n\t\t\t\tconst edges = this.out[n.url] ?? [];\n\t\t\t\treturn !edges.some((e) => e.to in this.nodes);\n\t\t\t});\n\t}\n\n\t/** BFS shortest path between two page URLs. Returns null if unreachable. */\n\tfindPath(from: string, to: string): string[] | null {\n\t\tif (from === to) return [from];\n\t\tconst visited = new Set<string>([from]);\n\t\tconst queue: Array<string[]> = [[from]];\n\n\t\twhile (queue.length > 0) {\n\t\t\tconst path = queue.shift()!;\n\t\t\tconst current = path[path.length - 1];\n\t\t\tfor (const edge of this.out[current] ?? []) {\n\t\t\t\tif (edge.to === to) return [...path, to];\n\t\t\t\tif (!visited.has(edge.to) && edge.to in this.nodes) {\n\t\t\t\t\tvisited.add(edge.to);\n\t\t\t\t\tqueue.push([...path, edge.to]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\treturn null;\n\t}\n\n\t/**\n\t * All pages reachable from `startUrl` via spidered links.\n\t * BFS, bounded by the nodes present in the graph.\n\t */\n\treachableFrom(startUrl: string): PageNode[] {\n\t\tconst visited = new Set<string>([startUrl]);\n\t\tconst queue = [startUrl];\n\t\twhile (queue.length > 0) {\n\t\t\tconst url = queue.shift()!;\n\t\t\tfor (const edge of this.out[url] ?? []) {\n\t\t\t\tif (!visited.has(edge.to) && edge.to in this.nodes) {\n\t\t\t\t\tvisited.add(edge.to);\n\t\t\t\t\tqueue.push(edge.to);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tvisited.delete(startUrl);\n\t\treturn [...visited].map((u) => this.nodes[u]).filter((n): n is PageNode => n !== undefined);\n\t}\n\n\t/** Nodes ranked by inbound link count (highest first). */\n\tbyPageRank(): Array<{ node: PageNode; inboundCount: number }> {\n\t\treturn Object.values(this.nodes)\n\t\t\t.filter((n): n is PageNode => n !== undefined)\n\t\t\t.map((n) => ({ node: n, inboundCount: (this.in_[n.url] ?? []).length }))\n\t\t\t.sort((a, b) => b.inboundCount - a.inboundCount);\n\t}\n\n\tget nodeCount(): number {\n\t\treturn Object.keys(this.nodes).length;\n\t}\n\n\tget edgeCount(): number {\n\t\tlet total = 0;\n\t\tfor (const edges of Object.values(this.out)) {\n\t\t\tif (edges) total += edges.length;\n\t\t}\n\t\treturn total;\n\t}\n\n\t/** Plain snapshot — safe to JSON.stringify or embed. */\n\ttoJSON(): PageGraphSnapshot {\n\t\tconst edges: PageEdge[] = [];\n\t\tfor (const edgeList of Object.values(this.out)) {\n\t\t\tif (edgeList) edges.push(...edgeList);\n\t\t}\n\t\treturn {\n\t\t\tnodes: Object.values(this.nodes).filter((n): n is PageNode => n !== undefined),\n\t\t\tedges,\n\t\t};\n\t}\n\n\tstatic fromJSON(snap: PageGraphSnapshot): PageGraph {\n\t\tconst g = new PageGraph();\n\t\tfor (const n of snap.nodes) g.nodes[n.url] = n;\n\t\tfor (const e of snap.edges) g.addEdge(e.from, e.to, e.text, e.isExternal);\n\t\treturn g;\n\t}\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wCAAwC;AACxC,8EAA8E;AAG9E,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAEzC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAEnC,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAEvC,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC1C,yGAAyG;AACzG,OAAO,EAAE,WAAW,IAAI,WAAW,EAAE,MAAM,aAAa,CAAC;AAEzD,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAErC,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAE/D,OAAO,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AAEpC,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AASxI;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,QAAQ,CACvB,KAAmC,EACnC,GAAW,EACX,KAAa;IAEb,IAAI,KAAK,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAChC,OAAO,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;AACtC,CAAC;AASD,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAE5C,OAAO,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,MAAM,iBAAiB,CAAC;AAC/E,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAC7D,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhD,OAAO,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,eAAe,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAErJ,8EAA8E;AAC9E,sEAAsE;AACtE,oEAAoE;AACpE,qEAAqE;AACrE,qEAAqE;AACrE,8EAA8E","sourcesContent":["// ---------------------------------------------------------------------------\n// Public API — what most consumers need\n// ---------------------------------------------------------------------------\n\nexport type { SpiderCacheOptions } from \"./cache.js\";\nexport { SpiderCache } from \"./cache.js\";\nexport type { CrawlOptions, CrawlResult } from \"./crawl.js\";\nexport { crawl } from \"./crawl.js\";\nexport type { PageEdge, PageGraphSnapshot, PageNode } from \"./graph.js\";\nexport { PageGraph } from \"./graph.js\";\nexport type { FuzzySearchOptions, SearchHit } from \"./search.js\";\nexport { searchPages } from \"./search.js\";\n/** @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking (not fuzzy-only). */\nexport { searchPages as fuzzySearch } from \"./search.js\";\nexport type { SpiderOptions, TreePage } from \"./spider.js\";\nexport { spider } from \"./spider.js\";\nexport type { QueryTreeOptions } from \"./tree.js\";\nexport { buildTree, navigateTree, queryTree } from \"./tree.js\";\nexport type { Chunk, ChunkType, DOMNode, ImageRef, LeanLink, LeanPage, Link, PageView, SpideredPage, TreeHit } from \"./types.js\";\nexport { toLean } from \"./views.js\";\nexport type { BraveSearchOptions, DdgSearchOptions, ExaSearchOptions, FallbackSearchEngineOptions, SearchEngine, TavilySearchOptions, WebSearchResult } from \"./web-search.js\";\nexport { braveSearch, ddgSearch, exaSearch, registerSearchEngine, resolveSearchEngine, tavilySearch, webSearch } from \"./web-search.js\";\n\n// ---------------------------------------------------------------------------\n// Utilities\n// ---------------------------------------------------------------------------\n\nimport type { ICache } from \"./ports.js\";\nimport type { Chunk, SpideredPage } from \"./types.js\";\n\n/**\n * Retrieve a single chunk from a cached page by URL and chunk index.\n *\n * Avoids loading the full page markdown when an agent only needs one\n * specific chunk — e.g. to re-read a section after a highlights hit.\n *\n * Returns undefined when the URL is not cached, the index is out of range,\n * or the index is negative.\n *\n * @example\n * const chunk = getChunk(cache, \"https://example.com/article\", 3)\n * if (chunk) console.log(chunk.text)\n */\nexport function getChunk(\n\tcache: ICache<string, SpideredPage>,\n\turl: string,\n\tindex: number,\n): Chunk | undefined {\n\tif (index < 0) return undefined;\n\treturn cache.get(url)?.chunks[index];\n}\n\n// ---------------------------------------------------------------------------\n// Extension / DI — port interfaces and their concrete adapters.\n// Import these when you need to inject custom implementations.\n// ---------------------------------------------------------------------------\n\nexport type { HttpRequest, HttpResponse, ICache, IHttpClient, IRobotsChecker, ISearchEngine, IThrottle, RobotsResult, SearchQuery } from \"./ports.js\";\nexport type { DiskCacheOptions } from \"./disk-cache.js\";\nexport { DiskCache } from \"./disk-cache.js\";\nexport type { PlaywrightClientOptions } from \"./playwright.js\";\nexport { PlaywrightHttpClient, createPlaywrightClient } from \"./playwright.js\";\nexport { RobotsCache, createRobotsCache } from \"./robots.js\";\nexport { fetchSitemapUrls } from \"./sitemap.js\";\nexport type { ThrottleOptions } from \"./throttle.js\";\nexport { DomainThrottle, createThrottle } from \"./throttle.js\";\nexport { BraveSearchEngine, DdgSearchEngine, ExaSearchEngine, FallbackSearchEngine, TavilySearchEngine, defaultSearchEngine } from \"./web-search.js\";\n\n// ---------------------------------------------------------------------------\n// parse.ts, convert.ts, views.ts are internal implementation modules.\n// They are NOT exported here — they are consumed only by spider.ts.\n// If you need lower-level DOM or markdown utilities, import from the\n// sub-modules directly (not covered by semver stability guarantees).\n// ---------------------------------------------------------------------------\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parse.js","sourceRoot":"","sources":["../src/parse.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC,8EAA8E;AAC9E,eAAe;AACf,8EAA8E;AAE9E;;;;;GAKG;AACH,MAAM,UAAU,QAAQ,CAAC,IAAY,EAAE,GAAW;IACjD,OAAO,SAAS,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,QAA+B,CAAC;AACjE,CAAC;AAED,8EAA8E;AAC9E,qBAAqB;AACrB,8EAA8E;AAE9E,MAAM,YAAY,GACjB,kIAAkI,CAAC;AAEpI,8EAA8E;AAC9E,MAAM,UAAU,YAAY,CAAC,EAAW;IACvC,IAAI,EAAE,CAAC,OAAO,CAAC,4BAA4B,CAAC;QAAE,OAAO,IAAI,CAAC;IAC1D,IACC,EAAE,CAAC,OAAO,CACT,iFAAiF,CACjF;QAED,OAAO,IAAI,CAAC;IAEb,IAAI,IAAI,GAAmB,EAAE,CAAC;IAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,IAAI,CAAC,IAAI;YAAE,MAAM;QACjB,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YAClC,IAAI,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC;gBAAE,OAAO,IAAI,CAAC;QACzC,CAAC;QACD,IAAI,GAAG,IAAI,CAAC,aAAa,CAAC;IAC3B,CAAC;IACD,OAAO,KAAK,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,uBAAuB;AACvB,8EAA8E;AAE9E,kEAAkE;AAClE,MAAM,UAAU,UAAU,CAAC,CAAU;IACpC,IAAI,CAAC,CAAC,CAAC,aAAa,CAAC,KAAK,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1D,CAAC;IACD,MAAM,KAAK,GAAG,CAAC,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;IAC3C,KAAK,MAAM,GAAG,IAAI,CAAC,GAAG,KAAK,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC;QAAE,GAAG,CAAC,MAAM,EAAE,CAAC;IACnE,OAAO,CAAC,KAAK,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AAC9D,CAAC;AAED,8EAA8E;AAC9E,kBAAkB;AAClB,8EAA8E;AAE9E,sEAAsE;AACtE,MAAM,UAAU,YAAY,CAAC,GAAa,EAAE,OAAe;IAC1D,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;IACvC,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;SAChD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QACV,MAAM,IAAI,GAAI,CAAuB,CAAC,IAAI,CAAC;QAC3C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC;aACxB,OAAO,CACP,mIAAmI,EACnI,EAAE,CACF;aACA,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;aACpB,IAAI,EAAE,CAAC;QACT,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;YAAE,OAAO,IAAI,CAAC;QAElE,OAAO;YACN,IAAI;YACJ,IAAI;YACJ,UAAU,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;YACpC,GAAG,EAAE,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAE,KAAe,CAAC,CAAC,CAAE,MAAgB;SAC5C,CAAC;IAClB,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,CAAC,EAAa,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC;SACpC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACjB,CAAC;AAED,8EAA8E;AAC9E,qBAAqB;AACrB,8EAA8E;AAE9E,+DAA+D;AAC/D,MAAM,UAAU,eAAe,CAAC,IAAY;IAC3C,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,eAAe,IAAI,gBAAgB,CAAC,CAAC;IACpE,MAAM,QAAQ,GAA6B,EAAE,CAAC;IAC9C,QAAQ,CAAC,gBAAgB,CAAC,YAAY,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;QACtD,MAAM,KAAK,GAAG,QAAQ,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAc,CAAC;QACvD,MAAM,IAAI,GAAG,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QAC3C,IAAI,IAAI;YAAE,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC;AACjB,CAAC;AAED,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E,6DAA6D;AAC7D,MAAM,UAAU,WAAW,CAAC,GAAa;IACxC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,MAAM,QAAQ,GAAG,GAAG,CAAC,aAAa,CAAC,uBAAuB,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;IAC3F,KAAK,MAAM,CAAC,IAAI,QAAQ;SACtB,KAAK,CAAC,MAAM,CAAC;SACb,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;SAClC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QACnB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IACb,CAAC;IAED,GAAG,CAAC,gBAAgB,CAAC,wDAAwD,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE;QAC7F,MAAM,CAAC,GAAG,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC3D,IAAI,CAAC;YAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC,CAAC,CAAC;IAEH,MAAM,OAAO,GACZ,GAAG,CAAC,aAAa,CAAC,kCAAkC,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC;QAC9E,GAAG,CAAC,aAAa,CAAC,qCAAqC,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;IACnF,IAAI,OAAO;QAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;IAEpD,OAAO,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AAC/B,CAAC;AAED,8EAA8E;AAC9E,2BAA2B;AAC3B,8EAA8E;AAE9E,gEAAgE;AAChE,MAAM,UAAU,mBAAmB,CAAC,GAAa,EAAE,UAAkB;IACpE,MAAM,SAAS,GACd,GAAG,CAAC,aAAa,CAAC,uBAAuB,CAAC,EAAE,YAAY,CAAC,MAAM,CAAC;QAChE,GAAG,CAAC,aAAa,CAAC,yBAAyB,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;IACvE,IAAI,CAAC,SAAS;QAAE,OAAO,SAAS,CAAC;IACjC,MAAM,IAAI,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACjD,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC;AACrE,CAAC","sourcesContent":["/**\n * DOM parsing helpers.\n *\n * Owns the DOM parsing dependency. spider.ts calls these after fetching HTML;\n * it never touches the DOM library directly.\n */\n\nimport { parseHTML } from \"linkedom\";\nimport type { Link, SpideredPage } from \"./types.js\";\n\n// ---------------------------------------------------------------------------\n// DOM creation\n// ---------------------------------------------------------------------------\n\n/**\n * Parse raw HTML into a DOM Document.\n * Uses linkedom — a lightweight server-side DOM that has no CSS engine,\n * no module-level Maps, and a flat CJS dependency tree. Safe to load\n * through jiti's transform pipeline without nativeModules workarounds.\n */\nexport function parseDom(html: string, url: string): Document {\n\treturn parseHTML(html, { url }).document as unknown as Document;\n}\n\n// ---------------------------------------------------------------------------\n// Nav classification\n// ---------------------------------------------------------------------------\n\nconst NAV_CLASS_RE =\n\t/^(nav|navbar|navigation|menu|menubar|header|footer|sidebar|breadcrumb|topbar|toolbar|site-nav|main-nav|primary-nav|global-nav)$/i;\n\n/** True if el or any ancestor up to 5 levels looks like navigation chrome. */\nexport function isNavElement(el: Element): boolean {\n\tif (el.closest(\"nav, header, footer, aside\")) return true;\n\tif (\n\t\tel.closest(\n\t\t\t\"[role='navigation'],[role='banner'],[role='contentinfo'],[role='complementary']\",\n\t\t)\n\t)\n\t\treturn true;\n\n\tlet node: Element | null = el;\n\tfor (let i = 0; i < 5; i++) {\n\t\tif (!node) break;\n\t\tfor (const cls of node.classList) {\n\t\t\tif (NAV_CLASS_RE.test(cls)) return true;\n\t\t}\n\t\tnode = node.parentElement;\n\t}\n\treturn false;\n}\n\n// ---------------------------------------------------------------------------\n// Link text extraction\n// ---------------------------------------------------------------------------\n\n/** Extract visible text from an anchor, skipping SVG subtrees. */\nexport function anchorText(a: Element): string {\n\tif (!a.querySelector(\"svg\")) {\n\t\treturn (a.textContent ?? \"\").replace(/\\s+/g, \" \").trim();\n\t}\n\tconst clone = a.cloneNode(true) as Element;\n\tfor (const svg of [...clone.querySelectorAll(\"svg\")]) svg.remove();\n\treturn (clone.textContent ?? \"\").replace(/\\s+/g, \" \").trim();\n}\n\n// ---------------------------------------------------------------------------\n// Link extraction\n// ---------------------------------------------------------------------------\n\n/** Extract outbound links from the DOM, classified as body or nav. */\nexport function extractLinks(doc: Document, baseUrl: string): Link[] {\n\tconst origin = new URL(baseUrl).origin;\n\treturn Array.from(doc.querySelectorAll(\"a[href]\"))\n\t\t.map((a) => {\n\t\t\tconst href = (a as HTMLAnchorElement).href;\n\t\t\tconst text = anchorText(a)\n\t\t\t\t.replace(\n\t\t\t\t\t/\\b(open_in_new|navigate_next|navigate_before|arrow_drop_down|arrow_drop_up|chevron_right|chevron_left|expand_more|expand_less)\\b/g,\n\t\t\t\t\t\"\",\n\t\t\t\t)\n\t\t\t\t.replace(/\\s+/g, \" \")\n\t\t\t\t.trim();\n\t\t\tif (!href || !text || href.startsWith(\"javascript:\")) return null;\n\n\t\t\treturn {\n\t\t\t\thref,\n\t\t\t\ttext,\n\t\t\t\tisExternal: !href.startsWith(origin),\n\t\t\t\trel: isNavElement(a) ? (\"nav\" as const) : (\"body\" as const),\n\t\t\t} satisfies Link;\n\t\t})\n\t\t.filter((l): l is Link => l !== null)\n\t\t.slice(0, 200);\n}\n\n// ---------------------------------------------------------------------------\n// Heading extraction\n// ---------------------------------------------------------------------------\n\n/** Extract h1/h2/h3 headings from Readability article HTML. */\nexport function extractHeadings(html: string): SpideredPage[\"headings\"] {\n\tconst { document } = parseHTML(`<html><body>${html}</body></html>`);\n\tconst headings: SpideredPage[\"headings\"] = [];\n\tdocument.querySelectorAll(\"h1, h2, h3\").forEach((el) => {\n\t\tconst level = parseInt(el.tagName[1], 10) as 1 | 2 | 3;\n\t\tconst text = (el.textContent ?? \"\").trim();\n\t\tif (text) headings.push({ level, text });\n\t});\n\treturn headings;\n}\n\n// ---------------------------------------------------------------------------\n// Tag extraction\n// ---------------------------------------------------------------------------\n\n/** Extract topic tags from meta keywords and article:tag. */\nexport function extractTags(doc: Document): string[] {\n\tconst tags = new Set<string>();\n\n\tconst keywords = doc.querySelector('meta[name=\"keywords\"]')?.getAttribute(\"content\") ?? \"\";\n\tfor (const k of keywords\n\t\t.split(/[,;]/)\n\t\t.map((k) => k.trim().toLowerCase())\n\t\t.filter(Boolean)) {\n\t\ttags.add(k);\n\t}\n\n\tdoc.querySelectorAll('meta[property=\"article:tag\"], meta[name=\"article:tag\"]').forEach((el) => {\n\t\tconst t = el.getAttribute(\"content\")?.trim().toLowerCase();\n\t\tif (t) tags.add(t);\n\t});\n\n\tconst section =\n\t\tdoc.querySelector('meta[property=\"article:section\"]')?.getAttribute(\"content\") ??\n\t\tdoc.querySelector('meta[property=\"og:article:section\"]')?.getAttribute(\"content\");\n\tif (section) tags.add(section.trim().toLowerCase());\n\n\treturn [...tags].slice(0, 20);\n}\n\n// ---------------------------------------------------------------------------\n// Canonical URL extraction\n// ---------------------------------------------------------------------------\n\n/** Extract canonical URL from link[rel=canonical] or og:url. */\nexport function extractCanonicalUrl(doc: Document, fetchedUrl: string): string | undefined {\n\tconst canonical =\n\t\tdoc.querySelector('link[rel=\"canonical\"]')?.getAttribute(\"href\") ??\n\t\tdoc.querySelector('meta[property=\"og:url\"]')?.getAttribute(\"content\");\n\tif (!canonical) return undefined;\n\tconst norm = (u: string) => u.replace(/\\/$/, \"\");\n\treturn norm(canonical) !== norm(fetchedUrl) ? canonical : undefined;\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"playwright.js","sourceRoot":"","sources":["../src/playwright.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAsCH,uEAAuE;AACvE,mEAAmE;AACnE,IAAI,cAAc,GAAG,KAAK,CAAC;AAE3B,MAAM,OAAO,oBAAoB;IAShC,YAAY,OAAgC,EAAE;QAR9C,8DAA8D;QACtD,YAAO,GAAe,IAAI,CAAC;QAQlC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,QAAQ,CAAC;QACxC,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,cAAc,IAAI,EAAE,CAAC;QAChD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,MAAM,CAAC;QAC1C,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,aAAa,CAAC;QACjD,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,KAAK,CAAC;IAClD,CAAC;IAEO,KAAK,CAAC,WAAW;QACxB,qEAAqE;QACrE,2EAA2E;QAC3E,IAAI,CAAC;YACJ,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,kBAAkB,CAAC,CAAC;YACtD,IAAI,CAAC,cAAc,EAAE,CAAC;gBACrB,MAAM,EAAE,OAAO,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,gCAAgC,CAAC,CAAC;gBAClF,QAAQ,CAAC,GAAG,CAAC,aAAa,EAAE,CAAC,CAAC;gBAC9B,cAAc,GAAG,IAAI,CAAC;YACvB,CAAC;YACD,OAAO,QAAQ,CAAC;QACjB,CAAC;QAAC,MAAM,CAAC;YACR,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACrD,OAAO,QAAQ,CAAC;QACjB,CAAC;IACF,CAAC;IAEO,KAAK,CAAC,UAAU;QACvB,IAAI,IAAI,CAAC,OAAO,EAAE,WAAW,EAAE;YAAE,OAAO,IAAI,CAAC,OAAO,CAAC;QACrD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,IAAI,CAAC,cAAc;YACrC,CAAC,CAAC,EAAE,cAAc,EAAE,IAAI,CAAC,cAAc,EAAE,QAAQ,EAAE,IAAI,EAAE;YACzD,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC;QAC7C,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QACjD,OAAO,IAAI,CAAC,OAAO,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,GAAgB;QAC3B,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QACxC,8DAA8D;QAC9D,MAAM,IAAI,GAAQ,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QAE1C,oEAAoE;QACpE,4DAA4D;QAC5D,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAC7B,IAAI,CAAC,EAAE,CAAC,WAAW,EAAE,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAE/B,IAAI,CAAC;YACJ,yDAAyD;YACzD,mEAAmE;YACnE,gEAAgE;YAChE,mEAAmE;YACnE,gCAAgC;YAChC,8DAA8D;YAC9D,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAU,EAAE,EAAE;gBACvC,MAAM,IAAI,GAAW,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;gBACpD,MAAM,MAAM,GAAW,KAAK,CAAC,OAAO,EAAE,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACjE,MAAM,YAAY,GAAG,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;gBAEjD,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;oBACrB,KAAK,CAAC,KAAK,EAAE,CAAC;gBACf,CAAC;qBAAM,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,aAAa,IAAI,YAAY,CAAC,EAAE,CAAC;oBACvF,KAAK,CAAC,KAAK,EAAE,CAAC;gBACf,CAAC;qBAAM,CAAC;oBACP,KAAK,CAAC,QAAQ,EAAE,CAAC;gBAClB,CAAC;YACF,CAAC,CAAC,CAAC;YAEH,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE;gBACzC,OAAO,EAAE,IAAI,CAAC,SAAS;gBACvB,SAAS,EAAE,IAAI,CAAC,SAAS;aACzB,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,uCAAuC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;YACnE,CAAC;YAED,MAAM,MAAM,GAAW,QAAQ,CAAC,MAAM,EAAE,CAAC;YACzC,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;gBACnB,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,MAAM,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,qEAAqE;YACrE,MAAM,IAAI,GAAW,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAC1C,MAAM,OAAO,GAA2B,MAAM,QAAQ,CAAC,UAAU,EAAE,CAAC;YAEpE,OAAO;gBACN,EAAE,EAAE,IAAI;gBACR,MAAM;gBACN,UAAU,EAAE,QAAQ,CAAC,UAAU,EAAE;gBACjC,OAAO,EAAE,EAAE,GAAG,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,IAAI,IAAI,EAAE;gBACvE,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC,IAAI;gBACtB,WAAW,EAAE,KAAK,IAAI,EAAE;oBACvB,MAAM,GAAG,GAAW,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;oBAC1C,OAAO,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC,UAAU,CAAgB,CAAC;gBACzF,CAAC;aACD,CAAC;QACH,CAAC;gBAAS,CAAC;YACV,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;QACpB,CAAC;IACF,CAAC;IAED,kFAAkF;IAClF,KAAK,CAAC,KAAK;QACV,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACrB,CAAC;IACF,CAAC;CACD;AAED;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CACrC,IAA8B;IAE9B,IAAI,CAAC;QACJ,OAAO,IAAI,oBAAoB,CAAC,IAAI,CAAC,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,IAAI,CAAC;IACb,CAAC;AACF,CAAC","sourcesContent":["/**\n * Playwright adapter — implements IHttpClient using a headless browser.\n *\n * Uses playwright-extra with the stealth plugin, which patches ~15 headless\n * fingerprint signals (navigator.webdriver, User-Agent, plugins, WebGL, etc.)\n * so the browser is indistinguishable from a real Chrome session.\n *\n * Requires system-installed Chrome (channel:\"chrome\") — no browser binary\n * is downloaded. Falls back gracefully to plain playwright-core if\n * playwright-extra or the stealth plugin are not installed.\n *\n * Browser lifecycle:\n * - Launched lazily on the first fetch() call.\n * - Reused across all subsequent requests (one browser, one tab per request).\n * - Call close() when done to release the browser process.\n *\n * Usage:\n * const client = new PlaywrightHttpClient()\n * const page = await spider(url, { httpClient: client })\n * await client.close()\n */\n\nimport type { HttpRequest, HttpResponse, IHttpClient } from \"./ports.js\";\n\nexport interface PlaywrightClientOptions {\n\t/**\n\t * Browser channel — finds a system-installed browser automatically.\n\t * \"chrome\" — Google Chrome (default)\n\t * \"msedge\" — Microsoft Edge\n\t * \"chromium\" — Playwright's own Chromium (must be installed separately)\n\t */\n\tchannel?: \"chrome\" | \"msedge\" | \"chromium\";\n\t/**\n\t * Explicit path to a browser executable.\n\t * Overrides `channel`. Use when Chrome is not in the standard location.\n\t */\n\texecutablePath?: string;\n\t/**\n\t * Navigation timeout in ms. Default: 30 000.\n\t */\n\ttimeoutMs?: number;\n\t/**\n\t * When to consider navigation complete.\n\t * \"networkidle\" — no network activity for 500ms (best for SPAs, default).\n\t * \"domcontentloaded\" — HTML parsed; faster but may miss lazy-loaded content.\n\t * \"load\" — window load event fired.\n\t */\n\twaitUntil?: \"load\" | \"domcontentloaded\" | \"networkidle\" | \"commit\";\n\t/**\n\t * When true, image and media resource types are allowed through instead of\n\t * being aborted. Required when spider() is called with captureImages: true\n\t * so that individual image fetches via this client succeed.\n\t * Fonts are always blocked regardless of this flag.\n\t * Default: false.\n\t */\n\tcaptureImages?: boolean;\n}\n\n// Module-level flag: stealth is wired to the playwright-extra chromium\n// singleton once and stays active for the lifetime of the process.\nlet stealthApplied = false;\n\nexport class PlaywrightHttpClient implements IHttpClient {\n\t// eslint-disable-next-line @typescript-eslint/no-explicit-any\n\tprivate browser: any | null = null;\n\tprivate readonly channel: string;\n\tprivate readonly executablePath: string;\n\tprivate readonly timeoutMs: number;\n\tprivate readonly waitUntil: string;\n\tprivate readonly captureImages: boolean;\n\n\tconstructor(opts: PlaywrightClientOptions = {}) {\n\t\tthis.channel = opts.channel ?? \"chrome\";\n\t\tthis.executablePath = opts.executablePath ?? \"\";\n\t\tthis.timeoutMs = opts.timeoutMs ?? 30_000;\n\t\tthis.waitUntil = opts.waitUntil ?? \"networkidle\";\n\t\tthis.captureImages = opts.captureImages ?? false;\n\t}\n\n\tprivate async getChromium() {\n\t\t// Prefer playwright-extra + stealth — patches headless fingerprints.\n\t\t// Falls back to plain playwright-core if playwright-extra isn't installed.\n\t\ttry {\n\t\t\tconst { chromium } = await import(\"playwright-extra\");\n\t\t\tif (!stealthApplied) {\n\t\t\t\tconst { default: StealthPlugin } = await import(\"puppeteer-extra-plugin-stealth\");\n\t\t\t\tchromium.use(StealthPlugin());\n\t\t\t\tstealthApplied = true;\n\t\t\t}\n\t\t\treturn chromium;\n\t\t} catch {\n\t\t\tconst { chromium } = await import(\"playwright-core\");\n\t\t\treturn chromium;\n\t\t}\n\t}\n\n\tprivate async getBrowser() {\n\t\tif (this.browser?.isConnected()) return this.browser;\n\t\tconst chromium = await this.getChromium();\n\t\tconst launchOpts = this.executablePath\n\t\t\t? { executablePath: this.executablePath, headless: true }\n\t\t\t: { channel: this.channel, headless: true };\n\t\tthis.browser = await chromium.launch(launchOpts);\n\t\treturn this.browser;\n\t}\n\n\tasync fetch(req: HttpRequest): Promise<HttpResponse> {\n\t\tconst browser = await this.getBrowser();\n\t\t// eslint-disable-next-line @typescript-eslint/no-explicit-any\n\t\tconst page: any = await browser.newPage();\n\n\t\t// Suppress browser-side console output and JS errors — they are not\n\t\t// useful to the caller and would leak into Pi's TUI stream.\n\t\tpage.on(\"console\", () => {});\n\t\tpage.on(\"pageerror\", () => {});\n\n\t\ttry {\n\t\t\t// Block fonts always (never needed for HTML extraction).\n\t\t\t// Block images and media during page navigation for speed — unless\n\t\t\t// this is a direct image fetch (Accept: image/*), in which case\n\t\t\t// captureImages:true lets it through so fetchImages() can retrieve\n\t\t\t// the binary via arrayBuffer().\n\t\t\t// eslint-disable-next-line @typescript-eslint/no-explicit-any\n\t\t\tawait page.route(\"**/*\", (route: any) => {\n\t\t\t\tconst type: string = route.request().resourceType();\n\t\t\t\tconst accept: string = route.request().headers()[\"accept\"] ?? \"\";\n\t\t\t\tconst isImageFetch = accept.startsWith(\"image/\");\n\n\t\t\t\tif (type === \"font\") {\n\t\t\t\t\troute.abort();\n\t\t\t\t} else if ([\"image\", \"media\"].includes(type) && !(this.captureImages && isImageFetch)) {\n\t\t\t\t\troute.abort();\n\t\t\t\t} else {\n\t\t\t\t\troute.continue();\n\t\t\t\t}\n\t\t\t});\n\n\t\t\tconst response = await page.goto(req.url, {\n\t\t\t\ttimeout: this.timeoutMs,\n\t\t\t\twaitUntil: this.waitUntil,\n\t\t\t});\n\n\t\t\tif (!response) {\n\t\t\t\tthrow new Error(`Navigation failed — no response for ${req.url}`);\n\t\t\t}\n\n\t\t\tconst status: number = response.status();\n\t\t\tif (status >= 400) {\n\t\t\t\tthrow new Error(`HTTP ${status} ${response.statusText()} — ${req.url}`);\n\t\t\t}\n\n\t\t\t// page.content() returns the full serialised DOM after JS execution.\n\t\t\tconst html: string = await page.content();\n\t\t\tconst headers: Record<string, string> = await response.allHeaders();\n\n\t\t\treturn {\n\t\t\t\tok: true,\n\t\t\t\tstatus,\n\t\t\t\tstatusText: response.statusText(),\n\t\t\t\theaders: { get: (name: string) => headers[name.toLowerCase()] ?? null },\n\t\t\t\ttext: async () => html,\n\t\t\t\tarrayBuffer: async () => {\n\t\t\t\t\tconst buf: Buffer = await response.body();\n\t\t\t\t\treturn buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) as ArrayBuffer;\n\t\t\t\t},\n\t\t\t};\n\t\t} finally {\n\t\t\tawait page.close();\n\t\t}\n\t}\n\n\t/** Close the shared browser process. Call when the client is no longer needed. */\n\tasync close(): Promise<void> {\n\t\tif (this.browser) {\n\t\t\tawait this.browser.close();\n\t\t\tthis.browser = null;\n\t\t}\n\t}\n}\n\n/**\n * Create a PlaywrightHttpClient, returning null if playwright-core is not\n * installed. Useful for graceful degradation in environments without a browser.\n */\nexport function createPlaywrightClient(\n\topts?: PlaywrightClientOptions,\n): PlaywrightHttpClient | null {\n\ttry {\n\t\treturn new PlaywrightHttpClient(opts);\n\t} catch {\n\t\treturn null;\n\t}\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ports.js","sourceRoot":"","sources":["../src/ports.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG","sourcesContent":["/**\n * Port interfaces — the contracts the core depends on.\n *\n * No concrete imports. Adapters implement these; the core orchestrates them.\n * All ports are optional in SpiderOptions — concrete defaults are wired in\n * spider.ts and crawl.ts so callers need not supply them unless they want\n * to substitute (e.g. inject a mock HTTP client for testing).\n */\n\n// ---------------------------------------------------------------------------\n// IHttpClient\n// ---------------------------------------------------------------------------\n\nexport interface HttpRequest {\n\turl: string;\n\theaders?: Record<string, string>;\n\tsignal?: AbortSignal;\n}\n\nexport interface HttpResponse {\n\tok: boolean;\n\tstatus: number;\n\tstatusText: string;\n\theaders: { get(name: string): string | null };\n\ttext(): Promise<string>;\n\tarrayBuffer(): Promise<ArrayBuffer>;\n}\n\n/**\n * Minimal HTTP client port.\n * Default adapter wraps global fetch().\n * Swap for tests: return fixed HTML without touching the network.\n */\nexport interface IHttpClient {\n\tfetch(req: HttpRequest): Promise<HttpResponse>;\n}\n\n// ---------------------------------------------------------------------------\n// ICache<K, V>\n// ---------------------------------------------------------------------------\n\n/**\n * Generic cache port.\n * Default adapter: SpiderCache (LRU, TTL).\n * Swap for tests or production: in-memory Map, Redis, SQLite, etc.\n */\nexport interface ICache<K, V> {\n\tget(key: K): V | undefined;\n\tset(key: K, value: V): void;\n\thas(key: K): boolean;\n\tdelete(key: K): void;\n\t/** All currently valid (non-expired) values. */\n\tvalues(): V[];\n}\n\n// ---------------------------------------------------------------------------\n// IThrottle\n// ---------------------------------------------------------------------------\n\n/**\n * Per-domain request throttle port.\n * Default adapter: DomainThrottle (token bucket + exponential backoff).\n * Swap for tests: no-op implementation that always resolves immediately.\n */\nexport interface IThrottle {\n\twait(url: string): Promise<void>;\n\tsuccess(url: string): void;\n\trateLimit(url: string, retryAfterHeader: string | null): number;\n\tsetDomainDelay(host: string, ms: number): void;\n\treadonly maxRetries: number;\n}\n\n// ---------------------------------------------------------------------------\n// IRobotsChecker\n// ---------------------------------------------------------------------------\n\nexport interface RobotsResult {\n\tallowed: boolean;\n\tcrawlDelayMs?: number;\n}\n\n/**\n * robots.txt compliance port.\n * Default adapter: RobotsCache (fetches + parses per origin, 1h TTL).\n * Swap for tests: permissive stub that always returns { allowed: true }.\n */\nexport interface IRobotsChecker {\n\tcheck(url: string): Promise<RobotsResult>;\n}\n\n// ---------------------------------------------------------------------------\n// ISearchEngine\n// ---------------------------------------------------------------------------\n\nexport interface SearchQuery {\n\tquery: string;\n\tnumResults?: number;\n\t/**\n\t * Restrict results to content published within this window.\n\t * Supported by Tavily (\"day\"|\"week\"|\"month\"|\"year\") and Brave (\"pd\"|\"pw\"|\"pm\"|\"py\").\n\t * Adapters map this to their engine-specific parameter name.\n\t */\n\ttimeRange?: \"day\" | \"week\" | \"month\" | \"year\";\n\t/**\n\t * Search topic mode. \"news\" prioritises freshly indexed news articles.\n\t * Supported by Tavily. Ignored by engines that don't support it.\n\t */\n\ttopic?: \"news\" | \"general\";\n}\n\n/**\n * A single result from a web search engine.\n * Defined here so port interfaces have no dependency on adapter modules.\n */\nexport interface WebSearchResult {\n\turl: string;\n\ttitle: string;\n\t/** Short description or snippet from the search engine. */\n\tsnippet: string;\n\t/** ISO-8601 or human-readable date, if the engine returned one. */\n\tpublishedAt?: string;\n}\n\n/**\n * Web search engine port.\n * Adapters: BraveSearchEngine, TavilySearchEngine (in web-search.ts).\n * Swap for tests: stub returning fixed results.\n */\nexport interface ISearchEngine {\n\tsearch(req: SearchQuery): Promise<WebSearchResult[]>;\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots.js","sourceRoot":"","sources":["../src/robots.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAaH,SAAS,KAAK,CAAC,IAAY;IAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAClC,MAAM,UAAU,GAAsB,EAAE,CAAC;IACzC,IAAI,YAAgC,CAAC;IACrC,IAAI,OAAO,GAAG,KAAK,CAAC;IAEpB,KAAK,MAAM,GAAG,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,CAAC,IAAI;YAAE,SAAS;QAEpB,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAChC,IAAI,KAAK,KAAK,CAAC,CAAC;YAAE,SAAS;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACtD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAE3C,IAAI,GAAG,KAAK,YAAY,EAAE,CAAC;YAC1B,OAAO,GAAG,KAAK,KAAK,GAAG,CAAC;QACzB,CAAC;aAAM,IAAI,OAAO,EAAE,CAAC;YACpB,IAAI,GAAG,KAAK,UAAU,IAAI,KAAK,EAAE,CAAC;gBACjC,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;YAChD,CAAC;iBAAM,IAAI,GAAG,KAAK,OAAO,IAAI,KAAK,EAAE,CAAC;gBACrC,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;YAC/C,CAAC;iBAAM,IAAI,GAAG,KAAK,aAAa,EAAE,CAAC;gBAClC,MAAM,CAAC,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;gBAC5B,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;oBAAE,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,KAAK,EAAE,MAAM,CAAC,CAAC;YACpE,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,CAAC;AACrC,CAAC;AAED,SAAS,SAAS,CAAC,MAAoB,EAAE,IAAY;IACpD,qCAAqC;IACrC,IAAI,IAAiC,CAAC;IACtC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACnC,IAAI,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7B,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM;gBAAE,IAAI,GAAG,CAAC,CAAC;QACzD,CAAC;IACF,CAAC;IACD,OAAO,IAAI,EAAE,KAAK,IAAI,IAAI,CAAC,CAAC,iBAAiB;AAC9C,CAAC;AAID,MAAM,MAAM,GAAG,EAAE,GAAG,EAAE,GAAG,KAAK,CAAC,CAAC,SAAS;AAEzC,MAAM,OAAO,WAAW;IAIvB,YAAY,SAAS,GAAG,gBAAgB;QAHvB,UAAK,GAAG,IAAI,GAAG,EAAuD,CAAC;QAIvF,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC5B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,KAAK,CAAC,GAAW;QACtB,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAEnC,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;YAC5C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,GAAG,MAAM,aAAa,CAAC,CAAC;YAC9D,KAAK,GAAG,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,MAAM,EAAE,CAAC;YACnD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;QAC/B,CAAC;QAED,OAAO;YACN,OAAO,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC;YAC1C,YAAY,EAAE,KAAK,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,WAAW,CAAC,SAAiB;QAC1C,IAAI,CAAC;YACJ,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;YACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,KAAK,CAAC,CAAC;YAC1D,IAAI,GAAa,CAAC;YAClB,IAAI,CAAC;gBACJ,GAAG,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,SAAS,EAAE;oBACvC,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;iBACzC,CAAC,CAAC;YACJ,CAAC;oBAAS,CAAC;gBACV,YAAY,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;YACD,IAAI,CAAC,GAAG,CAAC,EAAE;gBAAE,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC,CAAC,kBAAkB;YAC1D,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QAChC,CAAC;QAAC,MAAM,CAAC;YACR,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC,CAAC,4BAA4B;QACxD,CAAC;IACF,CAAC;CACD;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,SAAkB;IACnD,OAAO,IAAI,WAAW,CAAC,SAAS,CAAC,CAAC;AACnC,CAAC","sourcesContent":["/**\n * Minimal robots.txt fetcher and per-domain cache.\n * Respects User-agent: * directives (Allow, Disallow, Crawl-delay).\n * Fails open — any fetch/parse error allows all URLs.\n */\n\ninterface RobotsDirective {\n\tallow: boolean;\n\tpath: string;\n}\n\ninterface ParsedRobots {\n\tdirectives: RobotsDirective[];\n\t/** Crawl-delay in ms, if the robots.txt specified one (capped at 60s). */\n\tcrawlDelayMs?: number;\n}\n\nfunction parse(text: string): ParsedRobots {\n\tconst lines = text.split(/\\r?\\n/);\n\tconst directives: RobotsDirective[] = [];\n\tlet crawlDelayMs: number | undefined;\n\tlet inBlock = false;\n\n\tfor (const raw of lines) {\n\t\tconst line = raw.split(\"#\")[0].trim();\n\t\tif (!line) continue;\n\n\t\tconst colon = line.indexOf(\":\");\n\t\tif (colon === -1) continue;\n\t\tconst key = line.slice(0, colon).trim().toLowerCase();\n\t\tconst value = line.slice(colon + 1).trim();\n\n\t\tif (key === \"user-agent\") {\n\t\t\tinBlock = value === \"*\";\n\t\t} else if (inBlock) {\n\t\t\tif (key === \"disallow\" && value) {\n\t\t\t\tdirectives.push({ allow: false, path: value });\n\t\t\t} else if (key === \"allow\" && value) {\n\t\t\t\tdirectives.push({ allow: true, path: value });\n\t\t\t} else if (key === \"crawl-delay\") {\n\t\t\t\tconst s = parseFloat(value);\n\t\t\t\tif (!isNaN(s) && s > 0) crawlDelayMs = Math.min(s * 1_000, 60_000);\n\t\t\t}\n\t\t}\n\t}\n\n\treturn { directives, crawlDelayMs };\n}\n\nfunction isAllowed(robots: ParsedRobots, path: string): boolean {\n\t// Longest matching path prefix wins.\n\tlet best: RobotsDirective | undefined;\n\tfor (const d of robots.directives) {\n\t\tif (path.startsWith(d.path)) {\n\t\t\tif (!best || d.path.length > best.path.length) best = d;\n\t\t}\n\t}\n\treturn best?.allow ?? true; // default: allow\n}\n\nimport type { IRobotsChecker, RobotsResult } from \"./ports.js\";\n\nconst TTL_MS = 60 * 60 * 1_000; // 1 hour\n\nexport class RobotsCache implements IRobotsChecker {\n\tprivate readonly cache = new Map<string, { robots: ParsedRobots; expiresAt: number }>();\n\tprivate readonly userAgent: string;\n\n\tconstructor(userAgent = \"web-spider/0.1\") {\n\t\tthis.userAgent = userAgent;\n\t}\n\n\t/**\n\t * Returns whether the URL is allowed and the crawl-delay if specified.\n\t * Caches per origin for 1 hour. Fails open on any error.\n\t */\n\tasync check(url: string): Promise<RobotsResult> {\n\t\tconst { origin, pathname } = new URL(url);\n\t\tlet entry = this.cache.get(origin);\n\n\t\tif (!entry || Date.now() > entry.expiresAt) {\n\t\t\tconst robots = await this.fetchRobots(`${origin}/robots.txt`);\n\t\t\tentry = { robots, expiresAt: Date.now() + TTL_MS };\n\t\t\tthis.cache.set(origin, entry);\n\t\t}\n\n\t\treturn {\n\t\t\tallowed: isAllowed(entry.robots, pathname),\n\t\t\tcrawlDelayMs: entry.robots.crawlDelayMs,\n\t\t};\n\t}\n\n\tprivate async fetchRobots(robotsUrl: string): Promise<ParsedRobots> {\n\t\ttry {\n\t\t\tconst controller = new AbortController();\n\t\t\tconst timer = setTimeout(() => controller.abort(), 5_000);\n\t\t\tlet res: Response;\n\t\t\ttry {\n\t\t\t\tres = await globalThis.fetch(robotsUrl, {\n\t\t\t\t\tsignal: controller.signal,\n\t\t\t\t\theaders: { \"User-Agent\": this.userAgent },\n\t\t\t\t});\n\t\t\t} finally {\n\t\t\t\tclearTimeout(timer);\n\t\t\t}\n\t\t\tif (!res.ok) return { directives: [] }; // 404 → allow all\n\t\t\treturn parse(await res.text());\n\t\t} catch {\n\t\t\treturn { directives: [] }; // network error → fail open\n\t\t}\n\t}\n}\n\n/**\n * Factory — avoids jiti/Bun CJS re-export interop where class constructors\n * accessed through a re-export chain can appear undefined at call site.\n * Use this in extension code instead of `new RobotsCache()`.\n */\nexport function createRobotsCache(userAgent?: string): RobotsCache {\n\treturn new RobotsCache(userAgent);\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.js","sourceRoot":"","sources":["../src/search.ts"],"names":[],"mappings":"AAAA,OAAO,UAAU,MAAM,YAAY,CAAC;AAiDpC,8EAA8E;AAC9E,wEAAwE;AACxE,8EAA8E;AAE9E;;;GAGG;AACH,SAAS,YAAY,CAAC,IAAY,EAAE,SAAiB,EAAE,WAAqB,EAAE,MAAc;IAC3F,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IAEjC,IAAI,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IACnC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;QAChB,KAAK,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC9B,MAAM,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC;gBACd,GAAG,GAAG,CAAC,CAAC;gBACR,MAAM;YACP,CAAC;QACF,CAAC;IACF,CAAC;IACD,IAAI,GAAG,KAAK,CAAC,CAAC;QAAE,GAAG,GAAG,CAAC,CAAC;IAExB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC;IACxC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IAC1G,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/D,OAAO,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;AACtE,CAAC;AAED,2EAA2E;AAC3E,SAAS,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC;SACN,WAAW,EAAE;SACb,KAAK,CAAC,6BAA6B,CAAC;SACpC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AAC/B,CAAC;AAED,8EAA8E;AAC9E,aAAa;AACb,8EAA8E;AAE9E;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,WAAW,CAAC,KAAqB,EAAE,KAAa,EAAE,OAA2B,EAAE;IAC9F,MAAM,EAAE,IAAI,GAAG,EAAE,EAAE,aAAa,GAAG,GAAG,EAAE,GAAG,IAAI,CAAC;IAEhD,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAE7B,4EAA4E;IAC5E,MAAM,IAAI,GAAgB,EAAE,CAAC;IAE7B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,qBAAqB;QACrB,MAAM,QAAQ,GAAyD;YACtE,EAAE,EAAE,EAAE,GAAG,IAAI,CAAC,GAAG,aAAa,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,KAAK,EAAE;YACpE,GAAG,CAAC,IAAI,CAAC,WAAW;gBACnB,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,GAAG,IAAI,CAAC,GAAG,mBAAmB,EAAE,OAAO,EAAE,aAAa,EAAE,IAAI,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC;gBAC1F,CAAC,CAAC,EAAE,CAAC;YACN,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC/B,EAAE,EAAE,GAAG,IAAI,CAAC,GAAG,UAAU,CAAC,EAAE;gBAC5B,OAAO,EAAE,IAAI,CAAC,CAAC,KAAK,EAAE;gBACtB,IAAI,EAAE,CAAC,CAAC,IAAI;aACZ,CAAC,CAAC;SACH,CAAC;QACF,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YAC1B,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,CAAC;QACvF,CAAC;QAED,kBAAkB;QAClB,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAC7B,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACzF,CAAC;IACF,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,EAAE,GAAG,IAAI,UAAU,CAAY;QACpC,MAAM,EAAE,CAAC,MAAM,EAAE,SAAS,CAAC;QAC3B,WAAW,EAAE,CAAC,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,CAAC;QAClD,aAAa,EAAE;YACd,wDAAwD;YACxD,KAAK,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE;YACrB,+EAA+E;YAC/E,KAAK,EAAE,GAAG;YACV,2DAA2D;YAC3D,MAAM,EAAE,IAAI;SACZ;KACD,CAAC,CAAC;IAEH,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAEhB,MAAM,OAAO,GAAG,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IACjC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEpC,iEAAiE;IACjE,uEAAuE;IACvE,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;IAEhC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC7C,MAAM,WAAW,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC;IAEpC,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACzC,GAAG,EAAE,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAC7B,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAC7B,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,GAAG,MAAM,EAAE,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,GAAG;QAC5D,OAAO,EAAE,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,EAAE,SAAS,EAAE,WAAW,EAAE,aAAa,CAAC;KAC/E,CAAC,CAAC,CAAC;AACL,CAAC;AAED,wFAAwF;AACxF,MAAM,CAAC,MAAM,WAAW,GAAG,WAAW,CAAA","sourcesContent":["import MiniSearch from \"minisearch\";\nimport type { SpideredPage } from \"./types.js\";\n\n/** A single ranked match from fuzzySearch. */\nexport interface SearchHit {\n\t/** URL of the page the match came from. */\n\turl: string;\n\t/**\n\t * Stable chunk ID (\"url#chunk-N\") when the match is in body text.\n\t * Empty string when the match is in page metadata (title, description,\n\t * headings).\n\t */\n\tchunkId: string;\n\t/** Nearest heading for the matched chunk, or the matched field name for\n\t * metadata hits (e.g. \"title\", \"description\"). */\n\theading: string;\n\t/** Normalised score 0–1. Higher is a better match. */\n\tscore: number;\n\t/** Short context window around the best match, ≤ 2×snippetRadius chars.\n\t * Prefixed/suffixed with \"…\" when truncated. */\n\tsnippet: string;\n}\n\nexport interface FuzzySearchOptions {\n\t/** Maximum hits to return (default 10). */\n\ttopN?: number;\n\t/**\n\t * Characters of context on each side of the match in the snippet\n\t * (default 100). Keep low to save tokens; raise when you need more context.\n\t */\n\tsnippetRadius?: number;\n}\n\n// ---------------------------------------------------------------------------\n// Internal types\n// ---------------------------------------------------------------------------\n\ninterface SearchDoc {\n\t/** Unique stable ID used by MiniSearch — chunk id or synthetic meta id. */\n\tid: string;\n\turl: string;\n\t/** Nearest heading or metadata field name (\"title\", \"description\", \"h2\", …). */\n\theading: string;\n\t/** The text that was indexed and will be searched. */\n\ttext: string;\n\t/** Same as id for chunks; empty string for metadata docs. */\n\tchunkId: string;\n}\n\n// ---------------------------------------------------------------------------\n// Snippet builder — kept from v1, MiniSearch doesn't generate snippets.\n// ---------------------------------------------------------------------------\n\n/**\n * Build a short snippet around the best match position.\n * Falls back to the start of the text when no match is found.\n */\nfunction buildSnippet(text: string, fullQuery: string, queryTokens: string[], radius: number): string {\n\tconst lower = text.toLowerCase();\n\n\tlet pos = lower.indexOf(fullQuery);\n\tif (pos === -1) {\n\t\tfor (const qt of queryTokens) {\n\t\t\tconst p = lower.indexOf(qt);\n\t\t\tif (p !== -1) {\n\t\t\t\tpos = p;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n\tif (pos === -1) pos = 0;\n\n\tconst start = Math.max(0, pos - radius);\n\tconst end = Math.min(text.length, pos + Math.max(fullQuery.length, queryTokens[0]?.length ?? 1) + radius);\n\tconst raw = text.slice(start, end).replace(/\\s+/g, \" \").trim();\n\treturn (start > 0 ? \"…\" : \"\") + raw + (end < text.length ? \"…\" : \"\");\n}\n\n/** Tokenise and lower-case a string — used only for snippet generation. */\nfunction tokenise(s: string): string[] {\n\treturn s\n\t\t.toLowerCase()\n\t\t.split(/[\\s\\-_.,;:!?()[\\]{}\"'`/\\\\]+/)\n\t\t.filter((t) => t.length > 1);\n}\n\n// ---------------------------------------------------------------------------\n// Public API\n// ---------------------------------------------------------------------------\n\n/**\n * Full-text search across a set of already-spidered pages using MiniSearch\n * (BM25F ranking, fuzzy edit-distance, prefix search, heading field boost ×2).\n *\n * Searches both body chunks and page metadata (title, description, headings).\n * Returns results ranked by score descending, normalised to 0–1.\n *\n * Designed for agent use: call after fetching pages to locate a specific\n * fact, term, or section without dumping all content into context.\n *\n * @example\n * const hits = searchPages(pages, \"cost optimization selectors\", { topN: 5 })\n * // hits[0].snippet → \"…LLM extraction vs Selectors…\"\n */\nexport function searchPages(pages: SpideredPage[], query: string, opts: FuzzySearchOptions = {}): SearchHit[] {\n\tconst { topN = 10, snippetRadius = 100 } = opts;\n\n\tif (!query.trim()) return [];\n\n\t// Build a flat document list — one entry per chunk, one per metadata field.\n\tconst docs: SearchDoc[] = [];\n\n\tfor (const page of pages) {\n\t\t// Metadata documents\n\t\tconst metaDocs: Array<{ id: string; heading: string; text: string }> = [\n\t\t\t{ id: `${page.url}#meta-title`, heading: \"title\", text: page.title },\n\t\t\t...(page.description\n\t\t\t\t? [{ id: `${page.url}#meta-description`, heading: \"description\", text: page.description }]\n\t\t\t\t: []),\n\t\t\t...page.headings.map((h, i) => ({\n\t\t\t\tid: `${page.url}#meta-h${i}`,\n\t\t\t\theading: `h${h.level}`,\n\t\t\t\ttext: h.text,\n\t\t\t})),\n\t\t];\n\t\tfor (const m of metaDocs) {\n\t\t\tdocs.push({ id: m.id, url: page.url, heading: m.heading, text: m.text, chunkId: \"\" });\n\t\t}\n\n\t\t// Chunk documents\n\t\tfor (const c of page.chunks) {\n\t\t\tdocs.push({ id: c.id, url: page.url, heading: c.heading, text: c.text, chunkId: c.id });\n\t\t}\n\t}\n\n\tif (docs.length === 0) return [];\n\n\tconst ms = new MiniSearch<SearchDoc>({\n\t\tfields: [\"text\", \"heading\"],\n\t\tstoreFields: [\"url\", \"heading\", \"chunkId\", \"text\"],\n\t\tsearchOptions: {\n\t\t\t// BM25F: headings are 2× more important than body text.\n\t\t\tboost: { heading: 2 },\n\t\t\t// Edit-distance fuzzy — 0.2 × term length, rounded (e.g. ≤1 for 5-char terms).\n\t\t\tfuzzy: 0.2,\n\t\t\t// Prefix match: \"automat\" finds \"automation\", \"automated\".\n\t\t\tprefix: true,\n\t\t},\n\t});\n\n\tms.addAll(docs);\n\n\tconst results = ms.search(query);\n\tif (results.length === 0) return [];\n\n\t// Normalise raw BM25 scores to 0–1 by dividing by the top score.\n\t// This preserves relative ranking while keeping values agent-friendly.\n\tconst maxRaw = results[0].score;\n\n\tconst fullQuery = query.trim().toLowerCase();\n\tconst queryTokens = tokenise(query);\n\n\treturn results.slice(0, topN).map((r) => ({\n\t\turl: String(r[\"url\"]),\n\t\tchunkId: String(r[\"chunkId\"]),\n\t\theading: String(r[\"heading\"]),\n\t\tscore: Math.round(Math.min(r.score / maxRaw, 1) * 100) / 100,\n\t\tsnippet: buildSnippet(String(r[\"text\"]), fullQuery, queryTokens, snippetRadius),\n\t}));\n}\n\n/** @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking. */\nexport const fuzzySearch = searchPages\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap.js","sourceRoot":"","sources":["../src/sitemap.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAIH;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACrC,MAAc,EACd,UAAuB;IAEvB,MAAM,UAAU,GAAG,CAAC,GAAG,MAAM,cAAc,EAAE,GAAG,MAAM,oBAAoB,CAAC,CAAC;IAC5E,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,KAAK,MAAM,UAAU,IAAI,UAAU,EAAE,CAAC;QACrC,IAAI,CAAC;YACJ,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC;gBAClC,GAAG,EAAE,UAAU;gBACf,OAAO,EAAE,EAAE,MAAM,EAAE,gCAAgC,EAAE;aACrD,CAAC,CAAC;YACH,IAAI,CAAC,GAAG,CAAC,EAAE;gBAAE,SAAS;YACtB,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;YAC7B,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC;gBACpC,kEAAkE;gBAClE,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;oBAC1B,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;oBACtD,KAAK,MAAM,CAAC,IAAI,MAAM;wBAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;gBACrC,CAAC;qBAAM,CAAC;oBACP,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;gBACf,CAAC;YACF,CAAC;YACD,IAAI,IAAI,CAAC,IAAI,GAAG,CAAC;gBAAE,MAAM,CAAC,0BAA0B;QACrD,CAAC;QAAC,MAAM,CAAC;YACR,SAAS;QACV,CAAC;IACF,CAAC;IAED,OAAO,CAAC,GAAG,IAAI,CAAC,CAAC;AAClB,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,GAAW,EAAE,UAAuB;IAClE,IAAI,CAAC;QACJ,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;QAC5C,IAAI,CAAC,GAAG,CAAC,EAAE;YAAE,OAAO,EAAE,CAAC;QACvB,OAAO,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;IACtC,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,EAAE,CAAC;IACX,CAAC;AACF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW;IAC/B,MAAM,IAAI,GAAa,EAAE,CAAC;IAC1B,MAAM,EAAE,GAAG,0CAA0C,CAAC;IACtD,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACxC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IAC5B,CAAC;IACD,OAAO,IAAI,CAAC;AACb,CAAC","sourcesContent":["/**\n * Sitemap fetcher and parser.\n *\n * Attempts /sitemap.xml and /sitemap_index.xml. Extracts <loc> URLs.\n * Fails open — any error returns an empty array so callers fall back\n * to normal BFS without noise.\n */\n\nimport type { IHttpClient } from \"./ports.js\";\n\n/**\n * Fetch and parse sitemap URLs for the given origin.\n * Supports both standard sitemaps and sitemap index files.\n * Returns deduplicated absolute URLs, empty array on any failure.\n */\nexport async function fetchSitemapUrls(\n\torigin: string,\n\thttpClient: IHttpClient,\n): Promise<string[]> {\n\tconst candidates = [`${origin}/sitemap.xml`, `${origin}/sitemap_index.xml`];\n\tconst urls = new Set<string>();\n\n\tfor (const sitemapUrl of candidates) {\n\t\ttry {\n\t\t\tconst res = await httpClient.fetch({\n\t\t\t\turl: sitemapUrl,\n\t\t\t\theaders: { Accept: \"application/xml, text/xml, */*\" },\n\t\t\t});\n\t\t\tif (!res.ok) continue;\n\t\t\tconst xml = await res.text();\n\t\t\tfor (const loc of extractLocs(xml)) {\n\t\t\t\t// Sitemap index entries point to other sitemaps — fetch those too\n\t\t\t\tif (loc.endsWith(\".xml\")) {\n\t\t\t\t\tconst nested = await fetchSitemapXml(loc, httpClient);\n\t\t\t\t\tfor (const u of nested) urls.add(u);\n\t\t\t\t} else {\n\t\t\t\t\turls.add(loc);\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (urls.size > 0) break; // found a working sitemap\n\t\t} catch {\n\t\t\tcontinue;\n\t\t}\n\t}\n\n\treturn [...urls];\n}\n\nasync function fetchSitemapXml(url: string, httpClient: IHttpClient): Promise<string[]> {\n\ttry {\n\t\tconst res = await httpClient.fetch({ url });\n\t\tif (!res.ok) return [];\n\t\treturn extractLocs(await res.text());\n\t} catch {\n\t\treturn [];\n\t}\n}\n\nfunction extractLocs(xml: string): string[] {\n\tconst urls: string[] = [];\n\tconst re = /<loc>\\s*(https?:\\/\\/[^<\\s]+)\\s*<\\/loc>/gi;\n\tlet match: RegExpExecArray | null;\n\twhile ((match = re.exec(xml)) !== null) {\n\t\turls.push(match[1].trim());\n\t}\n\treturn urls;\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"spider.js","sourceRoot":"","sources":["../src/spider.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAEjD,OAAO,EAAE,mBAAmB,EAAE,eAAe,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAEvG,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAEtC,OAAO,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AAEpC,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E,MAAM,gBAAgB,GAAG,GAAG,CAAC;AAE7B,8EAA8E;AAC9E,8BAA8B;AAC9B,8EAA8E;AAE9E,MAAM,iBAAiB,GAAgB;IACtC,KAAK,CAAC,KAAK,CAAC,GAAG;QACd,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE;YAC3C,MAAM,EAAE,GAAG,CAAC,MAAM;YAClB,OAAO,EAAE,GAAG,CAAC,OAAO;SACpB,CAAC,CAAC;QACH,OAAO;YACN,EAAE,EAAE,GAAG,CAAC,EAAE;YACV,MAAM,EAAE,GAAG,CAAC,MAAM;YAClB,UAAU,EAAE,GAAG,CAAC,UAAU;YAC1B,OAAO,EAAE,EAAE,GAAG,EAAE,CAAC,IAAY,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE;YACzD,IAAI,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE;YACtB,WAAW,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,WAAW,EAAE;SACpC,CAAC;IACH,CAAC;CACD,CAAC;AAkEF;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E,4EAA4E;AAC5E,SAAS,WAAW,CAAC,GAAW;IAC/B,MAAM,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,WAAW,EAAE,CAAC;IAC9D,MAAM,GAAG,GAA2B;QACnC,GAAG,EAAE,YAAY;QACjB,IAAI,EAAE,YAAY;QAClB,GAAG,EAAE,WAAW;QAChB,IAAI,EAAE,YAAY;QAClB,GAAG,EAAE,WAAW;QAChB,GAAG,EAAE,eAAe;QACpB,IAAI,EAAE,YAAY;KAClB,CAAC;IACF,OAAO,GAAG,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,YAAY,CAAC;AACvC,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,WAAW,CACzB,WAAmB,EACnB,OAAe,EACf,UAAuB,EACvB,SAAiB,EACjB,QAAoB;IAEpB,kDAAkD;IAClD,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,GAAG,GAAG,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IAC3C,MAAM,MAAM,GAAG,CAAC,GAAG,GAAG,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAEpE,MAAM,OAAO,GAAe,EAAE,CAAC;IAE/B,KAAK,MAAM,EAAE,IAAI,MAAM,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;QAC5C,IAAI,CAAC,MAAM;YAAE,SAAS;QAEtB,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;QAEzC,yCAAyC;QACzC,IAAI,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAChC,MAAM,KAAK,GAAG,4BAA4B,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACxD,IAAI,KAAK,EAAE,CAAC;gBACX,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC1E,CAAC;YACD,SAAS;QACV,CAAC;QAED,yBAAyB;QACzB,IAAI,WAAmB,CAAC;QACxB,IAAI,CAAC;YACJ,WAAW,GAAG,IAAI,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC;QACnD,CAAC;QAAC,MAAM,CAAC;YACR,SAAS;QACV,CAAC;QAED,IAAI,CAAC;YACJ,IAAI,QAAQ;gBAAE,MAAM,QAAQ,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAC/C,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC;gBAClC,GAAG,EAAE,WAAW;gBAChB,OAAO,EAAE,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,EAAE,SAAS,EAAE;aAC9D,CAAC,CAAC;YACH,IAAI,CAAC,GAAG,CAAC,EAAE;gBAAE,SAAS;YACtB,QAAQ,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;YAE/B,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC;YACpC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;YACnD,MAAM,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;YACpD,MAAM,QAAQ,GAAG,WAAW,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,WAAW,CAAC,WAAW,CAAC,CAAC;YAE/E,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,WAAW,EAAE,QAAQ,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;QAC3D,CAAC;QAAC,MAAM,CAAC;YACR,oEAAoE;YACpE,uCAAuC;QACxC,CAAC;IACF,CAAC;IAED,OAAO,OAAO,CAAC;AAChB,CAAC;AAWD,MAAM,CAAC,KAAK,UAAU,MAAM,CAC3B,GAAW,EACX,IAA0D;IAE1D,MAAM,EACL,SAAS,GAAG,MAAM,EAClB,SAAS,GAAG,uEAAuE,EACnF,IAAI,GAAG,MAAM,EACb,YAAY,EACZ,gBAAgB,EAChB,WAAW,EACX,QAAQ,EACR,WAAW,EACX,UAAU,GAAG,iBAAiB,EAC9B,aAAa,GAAG,KAAK,EACrB,SAAS,GAAG,EAAE,GACd,GAAG,IAAI,IAAI,EAAE,CAAC;IAEf,oEAAoE;IACpE,IAAI,SAAc,CAAC;IACnB,IAAI,CAAC;QACJ,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC1B,CAAC;IAAC,MAAM,CAAC;QACR,MAAM,IAAI,KAAK,CAAC,iBAAiB,GAAG,8CAA8C,CAAC,CAAC;IACrF,CAAC;IACD,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,CAAC,EAAE,CAAC;QACvD,MAAM,IAAI,KAAK,CAAC,yBAAyB,SAAS,CAAC,QAAQ,uCAAuC,CAAC,CAAC;IACrG,CAAC;IAED,oCAAoC;IACpC,IAAI,WAAW,EAAE,CAAC;QACjB,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC/D,IAAI,CAAC,OAAO;YAAE,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;QAC/D,IAAI,YAAY,IAAI,QAAQ,EAAE,CAAC;YAC9B,QAAQ,CAAC,cAAc,CAAC,SAAS,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;QAC3D,CAAC;IACF,CAAC;IAED,mDAAmD;IACnD,MAAM,UAAU,GAAG,QAAQ,EAAE,UAAU,IAAI,CAAC,CAAC;IAC7C,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,IAAI,UAAU,GAAiB,IAAI,CAAC;IAEpC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,UAAU,EAAE,OAAO,EAAE,EAAE,CAAC;QACxD,IAAI,QAAQ;YAAE,MAAM,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEvC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;QAC9D,IAAI,GAA8C,CAAC;QACnD,IAAI,CAAC;YACJ,GAAG,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC;gBAC5B,GAAG;gBACH,MAAM,EAAE,UAAU,CAAC,MAAM;gBACzB,OAAO,EAAE,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,EAAE,WAAW,EAAE;aACzD,CAAC,CAAC;QACJ,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACd,YAAY,CAAC,KAAK,CAAC,CAAC;YACpB,IAAI,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBACvD,MAAM,IAAI,KAAK,CAAC,iBAAiB,SAAS,QAAQ,GAAG,EAAE,CAAC,CAAC;YAC1D,CAAC;YACD,MAAM,GAAG,CAAC;QACX,CAAC;QACD,YAAY,CAAC,KAAK,CAAC,CAAC;QAEpB,IAAI,GAAG,CAAC,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YAC9C,IAAI,QAAQ,IAAI,OAAO,GAAG,UAAU,EAAE,CAAC;gBACtC,QAAQ,CAAC,SAAS,CAAC,GAAG,EAAE,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,CAAC;gBACxD,UAAU,GAAG,IAAI,KAAK,CAAC,QAAQ,GAAG,CAAC,MAAM,wBAAwB,OAAO,GAAG,CAAC,IAAI,UAAU,GAAG,CAAC,CAAC;gBAC/F,SAAS;YACV,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,QAAQ,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,MAAM,GAAG,EAAE,CAAC,CAAC;QAClE,CAAC;QAED,IAAI,CAAC,GAAG,CAAC,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,QAAQ,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,MAAM,GAAG,EAAE,CAAC,CAAC;QAE9E,QAAQ,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC;QACvB,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QACxB,UAAU,GAAG,IAAI,CAAC;QAClB,MAAM;IACP,CAAC;IAED,IAAI,UAAU;QAAE,MAAM,UAAU,CAAC;IAEjC,qEAAqE;IACrE,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAEhC,4DAA4D;IAC5D,IAAI,gBAAgB,EAAE,CAAC;QACtB,KAAK,MAAM,GAAG,IAAI,gBAAgB;aAChC,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aACpB,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;YACnB,KAAK,MAAM,EAAE,IAAI,CAAC,GAAG,GAAG,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;gBAAE,EAAE,CAAC,MAAM,EAAE,CAAC;QAC9D,CAAC;IACF,CAAC;IAED,wEAAwE;IACxE,IAAI,YAAY,EAAE,CAAC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,CAAC,CAAC;QAC7C,IAAI,IAAI,EAAE,CAAC;YACV,GAAG,CAAC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC;QACrC,CAAC;IACF,CAAC;IAED,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IACrC,MAAM,YAAY,GAAG,mBAAmB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAEnD,+DAA+D;IAC/D,MAAM,iBAAiB,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC;IACvD,MAAM,UAAU,GAAG,CAAC,iBAAiB,CAAC;IACtC,4EAA4E;IAC5E,8EAA8E;IAC9E,MAAM,OAAO,GAAG,iBAAiB,IAAI;QACpC,KAAK,EAAE,CAAC,GAAG,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QAC7D,OAAO,EAAE,EAAE;QACX,WAAW,EAAE,EAAE;QACf,MAAM,EAAE,CAAC;QACT,OAAO,EAAE,EAAE;QACX,MAAM,EAAE,EAAE;QACV,GAAG,EAAE,EAAE;QACP,SAAS,EAAE,EAAE;QACb,IAAI,EAAE,EAAE;QACR,aAAa,EAAE,IAAI;QACnB,kBAAkB,EAAE,CAAC;KACrB,CAAC;IAEF,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAC3D,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAE3C,MAAM,IAAI,GAAG,CAAC,IAAY,EAAU,EAAE;QACrC,MAAM,EAAE,GACP,GAAG,CAAC,aAAa,CAAC,cAAc,IAAI,IAAI,CAAC;YACzC,GAAG,CAAC,aAAa,CAAC,qBAAqB,IAAI,IAAI,CAAC;YAChD,GAAG,CAAC,aAAa,CAAC,kBAAkB,IAAI,IAAI,CAAC,CAAC;QAC/C,OAAO,CAAC,EAAE,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IACnD,CAAC,CAAC;IAEF,uEAAuE;IACvE,MAAM,QAAQ,GAAG,eAAe,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC;IACxD,MAAM,IAAI,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;IAE9B,8EAA8E;IAC9E,qDAAqD;IACrD,8EAA8E;IAC9E,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;QACrB,MAAM,WAAW,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACvD,MAAM,SAAS,GAAG,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAClE,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,CAAC;QAE5D,MAAM,IAAI,GAAG;YACZ,GAAG;YACH,MAAM;YACN,SAAS;YACT,GAAG,CAAC,YAAY,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACvD,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC;YACrC,WAAW,EAAE,IAAI,CAAC,aAAa,CAAC;YAChC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,CAAC;YACxC,WAAW,EAAE,IAAI,CAAC,wBAAwB,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC;YAC3D,IAAI,EAAE,GAAG,CAAC,eAAe,CAAC,IAAI,IAAI,IAAI;YACtC,IAAI;YACJ,SAAS;YACT,kBAAkB,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,gBAAgB,CAAC;YAC3D,MAAM,EAAE,EAAE,EAAE,2CAA2C;YACvD,QAAQ;YACR,KAAK;YACL,QAAQ,EAAE,EAAE;SACW,CAAC;QACzB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;QAC1B,OAAO,EAAE,GAAG,IAAI,EAAE,UAAU,EAAE,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC;IAC7E,CAAC;IAED,8EAA8E;IAC9E,uEAAuE;IACvE,8EAA8E;IAC9E,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;QACrB,MAAM,IAAI,GAAG,SAAS,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC;QACnD,MAAM,QAAQ,GAAG,UAAU,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,EAAE,UAAU,EAAE,aAAa,EAAE,CAAC,CAAC;QAClF,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAC/D,MAAM,MAAM,GAAG,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACpC,MAAM,MAAM,GAAG,aAAa;YAC3B,CAAC,CAAC,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,GAAG,EAAE,UAAU,EAAE,SAAS,EAAE,QAAQ,CAAC;YAChF,CAAC,CAAC,SAAS,CAAC;QACb,OAAO;YACN,IAAI,EAAE,MAAM;YACZ,GAAG;YACH,MAAM;YACN,SAAS;YACT,GAAG,CAAC,YAAY,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACvD,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC;YACrC,WAAW,EAAE,IAAI,CAAC,aAAa,CAAC;YAChC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,CAAC;YACxC,WAAW,EAAE,IAAI,CAAC,wBAAwB,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC;YAC3D,IAAI,EAAE,GAAG,CAAC,eAAe,CAAC,IAAI,IAAI,IAAI;YACtC,IAAI;YACJ,SAAS;YACT,kBAAkB,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,gBAAgB,CAAC;YAC3D,QAAQ;YACR,MAAM;YACN,KAAK;YACL,QAAQ;YACR,IAAI;YACJ,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAC7B,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,+BAA+B;IAC/B,8EAA8E;IAC9E,MAAM,QAAQ,GAAG,UAAU,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,EAAE,UAAU,EAAE,aAAa,EAAE,CAAC,CAAC;IAClF,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;IAE/D,uEAAuE;IACvE,qEAAqE;IACrE,kDAAkD;IAClD,IAAI,SAAS,GAAG,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IACrC,IAAI,WAAW,KAAK,SAAS,EAAE,CAAC;QAC/B,MAAM,UAAU,GAAG,WAAW,GAAG,CAAC,CAAC;QACnC,IAAI,SAAS,GAAG,UAAU,CAAC;QAC3B,IAAI,KAAK,GAAG,IAAI,CAAC;QACjB,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YAClC,kEAAkE;YAClE,iCAAiC;YACjC,IAAI,CAAC,KAAK,IAAI,SAAS,IAAI,CAAC;gBAAE,OAAO,KAAK,CAAC;YAC3C,KAAK,GAAG,KAAK,CAAC;YACd,SAAS,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;YAC3B,OAAO,IAAI,CAAC;QACb,CAAC,CAAC,CAAC;IACJ,CAAC;IAED,qEAAqE;IACrE,MAAM,aAAa,GAAG,WAAW,KAAK,SAAS;QAC9C,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;QAC3C,CAAC,CAAC,QAAQ,CAAC;IAEZ,MAAM,MAAM,GAAG,aAAa;QAC3B,CAAC,CAAC,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,GAAG,EAAE,UAAU,EAAE,SAAS,EAAE,QAAQ,CAAC;QAChF,CAAC,CAAC,SAAS,CAAC;IAEb,OAAO;QACN,GAAG;QACH,MAAM;QACN,SAAS;QACT,GAAG,CAAC,YAAY,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACvD,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC;QACrC,WAAW,EAAE,IAAI,CAAC,aAAa,CAAC;QAChC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,CAAC;QACxC,WAAW,EAAE,IAAI,CAAC,wBAAwB,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC;QAC3D,IAAI,EAAE,GAAG,CAAC,eAAe,CAAC,IAAI,IAAI,IAAI;QACtC,IAAI;QACJ,SAAS;QACT,kBAAkB,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,gBAAgB,CAAC;QAC3D,QAAQ;QACR,MAAM,EAAE,SAAS;QACjB,KAAK;QACL,QAAQ,EAAE,aAAa;QACvB,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7B,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAC3C,CAAC;AACH,CAAC","sourcesContent":["import { Readability } from \"@mozilla/readability\";\nimport { chunk, toMarkdown } from \"./convert.js\";\nimport type { ImageRef } from \"./types.js\";\nimport { extractCanonicalUrl, extractHeadings, extractLinks, extractTags, parseDom } from \"./parse.js\";\nimport type { IHttpClient, IRobotsChecker, IThrottle } from \"./ports.js\";\nimport { buildTree } from \"./tree.js\";\nimport type { DOMNode, LeanPage, SpideredPage } from \"./types.js\";\nimport { toLean } from \"./views.js\";\n\n// ---------------------------------------------------------------------------\n// Constants\n// ---------------------------------------------------------------------------\n\nconst WORDS_PER_MINUTE = 200;\n\n// ---------------------------------------------------------------------------\n// Default HTTP client adapter\n// ---------------------------------------------------------------------------\n\nconst defaultHttpClient: IHttpClient = {\n\tasync fetch(req) {\n\t\tconst res = await globalThis.fetch(req.url, {\n\t\t\tsignal: req.signal,\n\t\t\theaders: req.headers,\n\t\t});\n\t\treturn {\n\t\t\tok: res.ok,\n\t\t\tstatus: res.status,\n\t\t\tstatusText: res.statusText,\n\t\t\theaders: { get: (name: string) => res.headers.get(name) },\n\t\t\ttext: () => res.text(),\n\t\t\tarrayBuffer: () => res.arrayBuffer(),\n\t\t};\n\t},\n};\n\n\n\n// ---------------------------------------------------------------------------\n// Public API\n// ---------------------------------------------------------------------------\n\nexport interface SpiderOptions {\n\t/**\n\t * ms before aborting the fetch (default 10 000).\n\t */\n\ttimeoutMs?: number;\n\t/**\n\t * Value sent as User-Agent.\n\t * Default identifies the tool; override for sites that block generic crawlers.\n\t */\n\tuserAgent?: string;\n\t/**\n\t * CSS selector that scopes content extraction to a specific element.\n\t * Everything outside the matched element is discarded before Readability runs.\n\t * Example: \"article\", \".main-content\", \"#post-body\"\n\t */\n\trootSelector?: string;\n\t/**\n\t * Comma-separated CSS selectors whose matched elements are removed before\n\t * extraction. Applied before Readability, so excluded content never reaches\n\t * the chunks or markdown.\n\t * Example: \"nav, footer, .sidebar, #ads\"\n\t */\n\texcludeSelectors?: string;\n\t/**\n\t * Approximate maximum token budget for the returned content.\n\t * Markdown is truncated to fit. Rough estimate: 1 token ≈ 4 characters.\n\t * Does not affect lean view (headings/links are always small).\n\t * Default: unlimited.\n\t */\n\ttokenBudget?: number;\n\t/**\n\t * Per-domain throttle — shared across spider() calls to enforce rate limits\n\t * and exponential backoff on 429/503 responses.\n\t */\n\tthrottle?: IThrottle;\n\t/**\n\t * robots.txt checker — when provided, spider() checks robots.txt before\n\t * fetching and respects Crawl-delay directives.\n\t */\n\trobotsCache?: IRobotsChecker;\n\t/**\n\t * HTTP client — defaults to a global fetch() adapter.\n\t * Inject a stub for testing without real network access.\n\t */\n\thttpClient?: IHttpClient;\n\t/**\n\t * When true, fetch <img> src URLs found in the article content and attach\n\t * them as base64-encoded ImageRef objects to SpideredPage.images.\n\t * Default: false — preserves current behaviour exactly.\n\t */\n\tcaptureImages?: boolean;\n\t/**\n\t * Maximum number of images to fetch per page.\n\t * Default: 10.\n\t */\n\tmaxImages?: number;\n}\n\n/**\n * Spider a single URL and return a fully structured SpideredPage.\n *\n * Pass `view: \"lean\"` to skip chunking and markdown conversion — returns a\n * LeanPage with only identity, metadata, and the heading/link outline.\n * Significantly faster (~3×) and uses far fewer tokens in agent context.\n *\n * Errors are returned as thrown exceptions with a descriptive message rather\n * than crashing silently. Common cases:\n * - Non-HTTP URLs throw immediately with a clear message.\n * - HTTP errors include the status code.\n * - JS-rendered pages (wordCount === 0) include a hint.\n * - Timeouts include the configured limit.\n *\n * @example\n * // Full page — chunks, markdown, all metadata\n * const page = await spider(\"https://example.com\")\n *\n * @example\n * // Lean overview — no body text, ideal for navigation decisions\n * const lean = await spider(\"https://example.com\", { view: \"lean\" })\n */\n// ---------------------------------------------------------------------------\n// Image fetching\n// ---------------------------------------------------------------------------\n\n/** Detect MIME type from a URL path extension, defaulting to image/jpeg. */\nfunction mimeFromUrl(src: string): string {\n\tconst ext = src.split(\"?\")[0].split(\".\").pop()?.toLowerCase();\n\tconst map: Record<string, string> = {\n\t\tjpg: \"image/jpeg\",\n\t\tjpeg: \"image/jpeg\",\n\t\tpng: \"image/png\",\n\t\twebp: \"image/webp\",\n\t\tgif: \"image/gif\",\n\t\tsvg: \"image/svg+xml\",\n\t\tavif: \"image/avif\",\n\t};\n\treturn map[ext ?? \"\"] ?? \"image/jpeg\";\n}\n\n/**\n * Extract <img> elements from article HTML, resolve src URLs, and fetch\n * each as a base64-encoded ImageRef. data: URLs are included without fetching.\n * Failed fetches are silently skipped.\n */\nasync function fetchImages(\n\tarticleHtml: string,\n\tpageUrl: string,\n\thttpClient: IHttpClient,\n\tmaxImages: number,\n\tthrottle?: IThrottle,\n): Promise<ImageRef[]> {\n\t// Parse the article HTML to extract img elements.\n\tconst { parseDom } = await import(\"./parse.js\");\n\tconst doc = parseDom(articleHtml, pageUrl);\n\tconst imgEls = [...doc.querySelectorAll(\"img\")].slice(0, maxImages);\n\n\tconst results: ImageRef[] = [];\n\n\tfor (const el of imgEls) {\n\t\tconst rawSrc = el.getAttribute(\"src\") ?? \"\";\n\t\tif (!rawSrc) continue;\n\n\t\tconst alt = el.getAttribute(\"alt\") ?? \"\";\n\n\t\t// data: URLs — include without fetching.\n\t\tif (rawSrc.startsWith(\"data:\")) {\n\t\t\tconst match = /^data:([^;]+);base64,(.+)$/.exec(rawSrc);\n\t\t\tif (match) {\n\t\t\t\tresults.push({ src: rawSrc, mimeType: match[1], alt, base64: match[2] });\n\t\t\t}\n\t\t\tcontinue;\n\t\t}\n\n\t\t// Resolve relative URLs.\n\t\tlet absoluteSrc: string;\n\t\ttry {\n\t\t\tabsoluteSrc = new URL(rawSrc, pageUrl).toString();\n\t\t} catch {\n\t\t\tcontinue;\n\t\t}\n\n\t\ttry {\n\t\t\tif (throttle) await throttle.wait(absoluteSrc);\n\t\t\tconst res = await httpClient.fetch({\n\t\t\t\turl: absoluteSrc,\n\t\t\t\theaders: { \"User-Agent\": \"web-spider/0.1\", Accept: \"image/*\" },\n\t\t\t});\n\t\t\tif (!res.ok) continue;\n\t\t\tthrottle?.success(absoluteSrc);\n\n\t\t\tconst buf = await res.arrayBuffer();\n\t\t\tconst base64 = Buffer.from(buf).toString(\"base64\");\n\t\t\tconst contentType = res.headers.get(\"content-type\");\n\t\t\tconst mimeType = contentType?.split(\";\")[0].trim() || mimeFromUrl(absoluteSrc);\n\n\t\t\tresults.push({ src: absoluteSrc, mimeType, alt, base64 });\n\t\t} catch {\n\t\t\t// Skip failed image fetches silently — a missing image should never\n\t\t\t// cause the whole page scrape to fail.\n\t\t}\n\t}\n\n\treturn results;\n}\n\n/** A page with its full DOM tree attached. */\nexport interface TreePage extends SpideredPage {\n\treadonly view: \"tree\";\n\ttree: DOMNode;\n}\n\nexport async function spider(url: string, opts: SpiderOptions & { view: \"lean\" }): Promise<LeanPage>;\nexport async function spider(url: string, opts: SpiderOptions & { view: \"tree\" }): Promise<TreePage>;\nexport async function spider(url: string, opts?: SpiderOptions & { view?: \"full\" }): Promise<SpideredPage>;\nexport async function spider(\n\turl: string,\n\topts?: SpiderOptions & { view?: \"lean\" | \"full\" | \"tree\" },\n): Promise<SpideredPage | LeanPage | TreePage> {\n\tconst {\n\t\ttimeoutMs = 30_000,\n\t\tuserAgent = \"web-spider/0.1 (AI agent research tool; +https://github.com/dpopsuev)\",\n\t\tview = \"full\",\n\t\trootSelector,\n\t\texcludeSelectors,\n\t\ttokenBudget,\n\t\tthrottle,\n\t\trobotsCache,\n\t\thttpClient = defaultHttpClient,\n\t\tcaptureImages = false,\n\t\tmaxImages = 10,\n\t} = opts ?? {};\n\n\t// Poka-yoke: reject non-HTTP URLs immediately with a clear message.\n\tlet parsedUrl: URL;\n\ttry {\n\t\tparsedUrl = new URL(url);\n\t} catch {\n\t\tthrow new Error(`Invalid URL: \"${url}\" — must be a fully-qualified http/https URL`);\n\t}\n\tif (![\"http:\", \"https:\"].includes(parsedUrl.protocol)) {\n\t\tthrow new Error(`Unsupported protocol \"${parsedUrl.protocol}\" — only http and https are supported`);\n\t}\n\n\t// Check robots.txt before fetching.\n\tif (robotsCache) {\n\t\tconst { allowed, crawlDelayMs } = await robotsCache.check(url);\n\t\tif (!allowed) throw new Error(`Blocked by robots.txt: ${url}`);\n\t\tif (crawlDelayMs && throttle) {\n\t\t\tthrottle.setDomainDelay(parsedUrl.hostname, crawlDelayMs);\n\t\t}\n\t}\n\n\t// Fetch with optional throttle + retry on 429/503.\n\tconst maxRetries = throttle?.maxRetries ?? 0;\n\tlet html = \"\";\n\tlet fetchError: Error | null = null;\n\n\tfor (let attempt = 0; attempt <= maxRetries; attempt++) {\n\t\tif (throttle) await throttle.wait(url);\n\n\t\tconst controller = new AbortController();\n\t\tconst timer = setTimeout(() => controller.abort(), timeoutMs);\n\t\tlet res: Awaited<ReturnType<IHttpClient[\"fetch\"]>>;\n\t\ttry {\n\t\t\tres = await httpClient.fetch({\n\t\t\t\turl,\n\t\t\t\tsignal: controller.signal,\n\t\t\t\theaders: { \"User-Agent\": userAgent, Accept: \"text/html\" },\n\t\t\t});\n\t\t} catch (err) {\n\t\t\tclearTimeout(timer);\n\t\t\tif (err instanceof Error && err.name === \"AbortError\") {\n\t\t\t\tthrow new Error(`Timeout after ${timeoutMs}ms — ${url}`);\n\t\t\t}\n\t\t\tthrow err;\n\t\t}\n\t\tclearTimeout(timer);\n\n\t\tif (res.status === 429 || res.status === 503) {\n\t\t\tif (throttle && attempt < maxRetries) {\n\t\t\t\tthrottle.rateLimit(url, res.headers.get(\"Retry-After\"));\n\t\t\t\tfetchError = new Error(`HTTP ${res.status} — retrying (attempt ${attempt + 1}/${maxRetries})`);\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tthrow new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);\n\t\t}\n\n\t\tif (!res.ok) throw new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);\n\n\t\tthrottle?.success(url);\n\t\thtml = await res.text();\n\t\tfetchError = null;\n\t\tbreak;\n\t}\n\n\tif (fetchError) throw fetchError;\n\n\t// Parse DOM via parse.ts — keeps the JSDOM dependency in one module.\n\tconst doc = parseDom(html, url);\n\n\t// Apply excludeSelectors before Readability strips the DOM.\n\tif (excludeSelectors) {\n\t\tfor (const sel of excludeSelectors\n\t\t\t.split(\",\")\n\t\t\t.map((s) => s.trim())\n\t\t\t.filter(Boolean)) {\n\t\t\tfor (const el of [...doc.querySelectorAll(sel)]) el.remove();\n\t\t}\n\t}\n\n\t// Scope to rootSelector: replace body content with the matched element.\n\tif (rootSelector) {\n\t\tconst root = doc.querySelector(rootSelector);\n\t\tif (root) {\n\t\t\tdoc.body.innerHTML = root.outerHTML;\n\t\t}\n\t}\n\n\tconst links = extractLinks(doc, url);\n\tconst canonicalUrl = extractCanonicalUrl(doc, url);\n\n\t// Readability content extraction (Firefox Reader View engine).\n\tconst readabilityResult = new Readability(doc).parse();\n\tconst jsRendered = !readabilityResult;\n\t// Graceful degradation: if Readability finds nothing, return a partial page\n\t// with jsRendered:true rather than throwing. The agent can decide what to do.\n\tconst article = readabilityResult ?? {\n\t\ttitle: (doc.querySelector(\"title\")?.textContent ?? \"\").trim(),\n\t\tcontent: \"\",\n\t\ttextContent: \"\",\n\t\tlength: 0,\n\t\texcerpt: \"\",\n\t\tbyline: \"\",\n\t\tdir: \"\",\n\t\tsite_name: \"\",\n\t\tlang: \"\",\n\t\tpublishedTime: null,\n\t\treadingTimeMinutes: 0,\n\t};\n\n\tconst domain = new URL(url).hostname.replace(/^www\\./, \"\");\n\tconst fetchedAt = new Date().toISOString();\n\n\tconst meta = (name: string): string => {\n\t\tconst el =\n\t\t\tdoc.querySelector(`meta[name=\"${name}\"]`) ??\n\t\t\tdoc.querySelector(`meta[property=\"og:${name}\"]`) ??\n\t\t\tdoc.querySelector(`meta[property=\"${name}\"]`);\n\t\treturn (el?.getAttribute(\"content\") ?? \"\").trim();\n\t};\n\n\t// headings must come before tags so the heading fallback is available.\n\tconst headings = extractHeadings(article.content ?? \"\");\n\tconst tags = extractTags(doc);\n\n\t// ---------------------------------------------------------------------------\n\t// Lean fast-path — skip turndown + chunking entirely\n\t// ---------------------------------------------------------------------------\n\tif (view === \"lean\") {\n\t\tconst textContent = (article.textContent ?? \"\").trim();\n\t\tconst wordCount = textContent.split(/\\s+/).filter(Boolean).length;\n\t\tconst chunkCount = Math.max(0, Math.floor(wordCount / 150));\n\n\t\tconst full = {\n\t\t\turl,\n\t\t\tdomain,\n\t\t\tfetchedAt,\n\t\t\t...(canonicalUrl !== undefined ? { canonicalUrl } : {}),\n\t\t\ttitle: article.title ?? meta(\"title\"),\n\t\t\tdescription: meta(\"description\"),\n\t\t\tauthor: article.byline ?? meta(\"author\"),\n\t\t\tpublishedAt: meta(\"article:published_time\") ?? meta(\"date\"),\n\t\t\tlang: doc.documentElement.lang ?? \"en\",\n\t\t\ttags,\n\t\t\twordCount,\n\t\t\treadingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),\n\t\t\tchunks: [], // placeholder — toLean reads chunks.length\n\t\t\theadings,\n\t\t\tlinks,\n\t\t\tmarkdown: \"\",\n\t\t} satisfies SpideredPage;\n\t\tconst lean = toLean(full);\n\t\treturn { ...lean, chunkCount, ...(jsRendered ? { jsRendered: true } : {}) };\n\t}\n\n\t// ---------------------------------------------------------------------------\n\t// Tree path — build semantic DOM tree, then also produce full markdown\n\t// ---------------------------------------------------------------------------\n\tif (view === \"tree\") {\n\t\tconst tree = buildTree(article.content ?? \"\", url);\n\t\tconst markdown = toMarkdown(article.content ?? \"\", { keepImages: captureImages });\n\t\tconst wordCount = markdown.split(/\\s+/).filter(Boolean).length;\n\t\tconst chunks = chunk(markdown, url);\n\t\tconst images = captureImages\n\t\t\t? await fetchImages(article.content ?? \"\", url, httpClient, maxImages, throttle)\n\t\t\t: undefined;\n\t\treturn {\n\t\t\tview: \"tree\",\n\t\t\turl,\n\t\t\tdomain,\n\t\t\tfetchedAt,\n\t\t\t...(canonicalUrl !== undefined ? { canonicalUrl } : {}),\n\t\t\ttitle: article.title ?? meta(\"title\"),\n\t\t\tdescription: meta(\"description\"),\n\t\t\tauthor: article.byline ?? meta(\"author\"),\n\t\t\tpublishedAt: meta(\"article:published_time\") ?? meta(\"date\"),\n\t\t\tlang: doc.documentElement.lang ?? \"en\",\n\t\t\ttags,\n\t\t\twordCount,\n\t\t\treadingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),\n\t\t\theadings,\n\t\t\tchunks,\n\t\t\tlinks,\n\t\t\tmarkdown,\n\t\t\ttree,\n\t\t\t...(images ? { images } : {}),\n\t\t};\n\t}\n\n\t// ---------------------------------------------------------------------------\n\t// Full path — turndown + chunk\n\t// ---------------------------------------------------------------------------\n\tconst markdown = toMarkdown(article.content ?? \"\", { keepImages: captureImages });\n\tconst wordCount = markdown.split(/\\s+/).filter(Boolean).length;\n\n\t// Chunk-aware tokenBudget: select whole chunks up to the budget rather\n\t// than slicing markdown mid-sentence. Preserves chunk boundaries and\n\t// returns the richest complete content that fits.\n\tlet allChunks = chunk(markdown, url);\n\tif (tokenBudget !== undefined) {\n\t\tconst charBudget = tokenBudget * 4;\n\t\tlet remaining = charBudget;\n\t\tlet first = true;\n\t\tallChunks = allChunks.filter((c) => {\n\t\t\t// Always include at least the first chunk — agents need something\n\t\t\t// even if it exceeds the budget.\n\t\t\tif (!first && remaining <= 0) return false;\n\t\t\tfirst = false;\n\t\t\tremaining -= c.text.length;\n\t\t\treturn true;\n\t\t});\n\t}\n\n\t// Reconstruct markdown from selected chunks for full-page consumers.\n\tconst finalMarkdown = tokenBudget !== undefined\n\t\t? allChunks.map((c) => c.text).join(\"\\n\\n\")\n\t\t: markdown;\n\n\tconst images = captureImages\n\t\t? await fetchImages(article.content ?? \"\", url, httpClient, maxImages, throttle)\n\t\t: undefined;\n\n\treturn {\n\t\turl,\n\t\tdomain,\n\t\tfetchedAt,\n\t\t...(canonicalUrl !== undefined ? { canonicalUrl } : {}),\n\t\ttitle: article.title ?? meta(\"title\"),\n\t\tdescription: meta(\"description\"),\n\t\tauthor: article.byline ?? meta(\"author\"),\n\t\tpublishedAt: meta(\"article:published_time\") ?? meta(\"date\"),\n\t\tlang: doc.documentElement.lang ?? \"en\",\n\t\ttags,\n\t\twordCount,\n\t\treadingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),\n\t\theadings,\n\t\tchunks: allChunks,\n\t\tlinks,\n\t\tmarkdown: finalMarkdown,\n\t\t...(images ? { images } : {}),\n\t\t...(jsRendered ? { jsRendered: true } : {}),\n\t};\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"throttle.js","sourceRoot":"","sources":["../src/throttle.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAuBH,SAAS,KAAK,CAAC,EAAU;IACxB,OAAO,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,eAAe,CAAC,MAAqB;IAC7C,IAAI,CAAC,MAAM;QAAE,OAAO,CAAC,CAAC;IACtB,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;IACrC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC;QAAE,OAAO,OAAO,GAAG,KAAK,CAAC;IAC5C,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,CAAC,OAAO,EAAE,CAAC;IACxC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,CAAC;AACV,CAAC;AAED,MAAM,OAAO,cAAc;IAO1B,YAAY,OAAwB,EAAE;QANrB,WAAM,GAAG,IAAI,GAAG,EAAuB,CAAC;QAOxD,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,GAAG,CAAC;QACzC,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,KAAK,CAAC;QACjD,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,MAAM,CAAC;QAChD,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,CAAC,CAAC;IACxC,CAAC;IAEO,KAAK,CAAC,IAAY;QACzB,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAC9B,IAAI,CAAC,CAAC,EAAE,CAAC;YACR,CAAC,GAAG,EAAE,MAAM,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;YAC9C,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAC1B,CAAC;QACD,OAAO,CAAC,CAAC;IACV,CAAC;IAED,mEAAmE;IACnE,KAAK,CAAC,IAAI,CAAC,GAAW;QACrB,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;QAC5C,MAAM,QAAQ,GAAG,CAAC,CAAC,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC;QACjD,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CACrB,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,YAAY,GAAG,GAAG,CAAC,EACjC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,GAAG,QAAQ,GAAG,GAAG,CAAC,CACtC,CAAC;QACF,IAAI,KAAK,GAAG,CAAC;YAAE,MAAM,KAAK,CAAC,KAAK,CAAC,CAAC;QAClC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACvB,CAAC;IAED,mEAAmE;IACnE,OAAO,CAAC,GAAW;QAClB,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;QAC5C,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QACb,CAAC,CAAC,YAAY,GAAG,CAAC,CAAC;IACpB,CAAC;IAED;;;OAGG;IACH,SAAS,CAAC,GAAW,EAAE,gBAA+B;QACrD,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;QAC5C,CAAC,CAAC,MAAM,EAAE,CAAC;QACX,MAAM,YAAY,GAAG,eAAe,CAAC,gBAAgB,CAAC,CAAC;QACvD,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,aAAa,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;QACjG,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QACjD,CAAC,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,MAAM,CAAC;QACrC,OAAO,MAAM,CAAC;IACf,CAAC;IAED;;;OAGG;IACH,cAAc,CAAC,IAAY,EAAE,EAAU;QACtC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,UAAU,GAAG,EAAE,CAAC;IAClC,CAAC;CACD;AAED;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,IAAsB;IACpD,OAAO,IAAI,cAAc,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC","sourcesContent":["/**\n * Per-domain request throttle with exponential backoff and jitter.\n *\n * Enforces a minimum gap between requests to the same hostname.\n * On 429/503, backs off exponentially and respects Retry-After headers.\n * Shared instances should be passed into spider() and crawl() so that\n * all requests to a domain coordinate through one rate limiter.\n */\n\nimport type { IThrottle } from \"./ports.js\";\n\nexport interface ThrottleOptions {\n\t/** Minimum gap between requests to the same domain (ms). Default 500. */\n\tminDelayMs?: number;\n\t/** Base for exponential backoff (ms). Default 1000. */\n\tbackoffBaseMs?: number;\n\t/** Maximum backoff delay (ms). Default 30 000. */\n\tbackoffCapMs?: number;\n\t/** Maximum retry attempts on 429/503 before giving up. Default 3. */\n\tmaxRetries?: number;\n}\n\ninterface DomainState {\n\tlastAt: number;\n\tbackoffUntil: number;\n\terrors: number;\n\t/** Per-domain minimum delay override (e.g. from robots.txt Crawl-delay). */\n\tminDelayMs?: number;\n}\n\nfunction sleep(ms: number): Promise<void> {\n\treturn new Promise((r) => setTimeout(r, ms));\n}\n\nfunction parseRetryAfter(header: string | null): number {\n\tif (!header) return 0;\n\tconst seconds = parseInt(header, 10);\n\tif (!isNaN(seconds)) return seconds * 1_000;\n\tconst date = new Date(header).getTime();\n\tif (!isNaN(date)) return Math.max(0, date - Date.now());\n\treturn 0;\n}\n\nexport class DomainThrottle implements IThrottle {\n\tprivate readonly states = new Map<string, DomainState>();\n\treadonly minDelayMs: number;\n\treadonly backoffBaseMs: number;\n\treadonly backoffCapMs: number;\n\treadonly maxRetries: number;\n\n\tconstructor(opts: ThrottleOptions = {}) {\n\t\tthis.minDelayMs = opts.minDelayMs ?? 500;\n\t\tthis.backoffBaseMs = opts.backoffBaseMs ?? 1_000;\n\t\tthis.backoffCapMs = opts.backoffCapMs ?? 30_000;\n\t\tthis.maxRetries = opts.maxRetries ?? 3;\n\t}\n\n\tprivate state(host: string): DomainState {\n\t\tlet s = this.states.get(host);\n\t\tif (!s) {\n\t\t\ts = { lastAt: 0, backoffUntil: 0, errors: 0 };\n\t\t\tthis.states.set(host, s);\n\t\t}\n\t\treturn s;\n\t}\n\n\t/** Wait until the domain's rate limit and backoff have cleared. */\n\tasync wait(url: string): Promise<void> {\n\t\tconst s = this.state(new URL(url).hostname);\n\t\tconst minDelay = s.minDelayMs ?? this.minDelayMs;\n\t\tconst now = Date.now();\n\t\tconst delay = Math.max(\n\t\t\tMath.max(0, s.backoffUntil - now),\n\t\t\tMath.max(0, s.lastAt + minDelay - now),\n\t\t);\n\t\tif (delay > 0) await sleep(delay);\n\t\ts.lastAt = Date.now();\n\t}\n\n\t/** Record a successful request — resets backoff for the domain. */\n\tsuccess(url: string): void {\n\t\tconst s = this.state(new URL(url).hostname);\n\t\ts.errors = 0;\n\t\ts.backoffUntil = 0;\n\t}\n\n\t/**\n\t * Record a rate-limit hit. Applies exponential backoff with jitter,\n\t * using Retry-After header when present. Returns the wait duration in ms.\n\t */\n\trateLimit(url: string, retryAfterHeader: string | null): number {\n\t\tconst s = this.state(new URL(url).hostname);\n\t\ts.errors++;\n\t\tconst retryAfterMs = parseRetryAfter(retryAfterHeader);\n\t\tconst jitter = Math.random() * this.backoffBaseMs;\n\t\tconst backoffMs = Math.min(this.backoffCapMs, this.backoffBaseMs * 2 ** (s.errors - 1) + jitter);\n\t\tconst waitMs = Math.max(retryAfterMs, backoffMs);\n\t\ts.backoffUntil = Date.now() + waitMs;\n\t\treturn waitMs;\n\t}\n\n\t/**\n\t * Override the minimum delay for a specific domain.\n\t * Used to honour robots.txt Crawl-delay directives.\n\t */\n\tsetDomainDelay(host: string, ms: number): void {\n\t\tthis.state(host).minDelayMs = ms;\n\t}\n}\n\n/**\n * Factory — avoids jiti/Bun CJS re-export interop where class constructors\n * accessed through a re-export chain can appear undefined at call site.\n * Use this in extension code instead of `new DomainThrottle()`.\n */\nexport function createThrottle(opts?: ThrottleOptions): DomainThrottle {\n\treturn new DomainThrottle(opts);\n}\n"]}
|
package/dist/tree.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tree.js","sourceRoot":"","sources":["../src/tree.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC,8EAA8E;AAC9E,oBAAoB;AACpB,8EAA8E;AAE9E;;;GAGG;AACH,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC;IAC1B,SAAS;IACT,SAAS;IACT,MAAM;IACN,OAAO;IACP,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,GAAG;IACH,YAAY;IACZ,KAAK;IACL,QAAQ;IACR,YAAY;IACZ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,SAAS;IACT,SAAS;CACT,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,MAAM,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC;AAEtG,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,UAAU,EAAE,GAAG,WAAW,CAAC,CAAC,CAAC;AAE/D,qEAAqE;AACrE,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,EAAE,SAAS,CAAC,CAAC,CAAC;AAEtH,2DAA2D;AAC3D,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC;IACjC,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,IAAI;IACJ,SAAS;IACT,SAAS;IACT,OAAO;IACP,YAAY;IACZ,SAAS;IACT,QAAQ;CACR,CAAC,CAAC;AAEH,8EAA8E;AAC9E,gBAAgB;AAChB,8EAA8E;AAE9E;;;GAGG;AACH,SAAS,WAAW,CAAC,EAAW;IAC/B,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;IAC3C,MAAM,CAAC,GAAG,4BAA4B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACjD,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;AAC7B,CAAC;AAED,mEAAmE;AACnE,SAAS,WAAW,CAAC,EAAW;IAC/B,OAAO,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AAC3D,CAAC;AAED;;;GAGG;AACH,SAAS,SAAS,CAAC,EAAW,EAAE,UAAkB,EAAE,YAAiC;IACpF,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;IAErC,uBAAuB;IACvB,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACzC,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;IACjC,MAAM,OAAO,GAAG,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,IAAI,KAAK,GAAG,CAAC;IACvD,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,GAAG,UAAU,IAAI,OAAO,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;IAE/D,QAAQ;IACR,MAAM,KAAK,GAA2B,EAAE,CAAC;IACzC,IAAI,GAAG,KAAK,GAAG,EAAE,CAAC;QACjB,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QACrC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;YAAE,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC;IAChE,CAAC;IACD,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;QACrC,MAAM,IAAI,GAAG,WAAW,CAAC,EAAE,CAAC,IAAI,WAAW,CAAC,EAAE,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;QAC5E,IAAI,IAAI;YAAE,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC;IAC7B,CAAC;IACD,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QACpB,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,KAAK;YAAE,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC;IAChC,CAAC;IACD,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QACpB,MAAM,EAAE,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;QACvC,IAAI,EAAE;YAAE,KAAK,CAAC,QAAQ,GAAG,EAAE,CAAC;IAC7B,CAAC;IAED,oCAAoC;IACpC,IAAI,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,WAAW,CAAC,EAAE,CAAC,CAAC;QAC7B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QACvB,OAAO;YACN,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACnD,CAAC;IACH,CAAC;IAED,qEAAqE;IACrE,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;QACnB,MAAM,IAAI,GAAG,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC;QAC9C,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;YAAE,OAAO,IAAI,CAAC;QAC9B,OAAO;YACN,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACnD,CAAC;IACH,CAAC;IAED,4CAA4C;IAC5C,MAAM,QAAQ,GAAc,EAAE,CAAC;IAC/B,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAE7C,KAAK,MAAM,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7C,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAE7C,IAAI,aAAa,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACjC,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC;YAChD,IAAI,IAAI;gBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;aAAM,CAAC;YACP,uEAAuE;YACvE,MAAM,SAAS,GAAG,eAAe,CAAC,KAAK,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC;YAC3D,QAAQ,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;QAC7B,CAAC;IACF,CAAC;IAED,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,sDAAsD;QACtD,MAAM,IAAI,GAAG,WAAW,CAAC,EAAE,CAAC,CAAC;QAC7B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QACvB,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC;IACjF,CAAC;IAED,0DAA0D;IAC1D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QAC1D,iDAAiD;QACjD,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED,OAAO;QACN,GAAG;QACH,IAAI;QACJ,QAAQ;QACR,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACnD,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,EAAW,EAAE,UAAkB,EAAE,YAAiC;IAC1F,MAAM,OAAO,GAAc,EAAE,CAAC;IAC9B,KAAK,MAAM,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7C,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAC7C,IAAI,aAAa,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACjC,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,EAAE,UAAU,EAAE,YAAY,CAAC,CAAC;YACxD,IAAI,IAAI;gBAAE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,CAAC;aAAM,CAAC;YACP,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,KAAK,EAAE,UAAU,EAAE,YAAY,CAAC,CAAC,CAAC;QACnE,CAAC;IACF,CAAC;IACD,mEAAmE;IACnE,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,GAAG,WAAW,CAAC,EAAE,CAAC,CAAC;QAC7B,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YACtB,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACzC,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;YACjC,MAAM,OAAO,GAAG,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,KAAK,GAAG,CAAC;YAClD,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,UAAU,IAAI,OAAO,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;QACpE,CAAC;IACF,CAAC;IACD,OAAO,OAAO,CAAC;AAChB,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,SAAS,CAAC,WAAmB,EAAE,OAAe;IAC7D,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,eAAe,WAAW,gBAAgB,EAAE,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;IAC7F,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;IAE3B,MAAM,QAAQ,GAAc,EAAE,CAAC;IAC/B,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC;IAE/C,KAAK,MAAM,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAC7C,IAAI,aAAa,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACjC,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;YACvD,IAAI,IAAI;gBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;aAAM,CAAC;YACP,MAAM,SAAS,GAAG,eAAe,CAAC,KAAK,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;YAClE,QAAQ,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;QAC7B,CAAC;IACF,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;AACtD,CAAC;AAED,8EAA8E;AAC9E,kBAAkB;AAClB,8EAA8E;AAE9E,kEAAkE;AAClE,SAAS,QAAQ,CAAC,IAAa;IAC9B,MAAM,MAAM,GAAc,CAAC,IAAI,CAAC,CAAC;IACjC,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QACnB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ;YAAE,MAAM,CAAC,IAAI,CAAC,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,MAAM,CAAC;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,IAAa,EAAE,IAAY;IACvD,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC7B,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,IAAI,IAAI,CAAC;AACnD,CAAC;AAED,8EAA8E;AAC9E,oBAAoB;AACpB,8EAA8E;AAE9E,wDAAwD;AACxD,SAAS,QAAQ,CAAC,IAAa;IAC9B,IAAI,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC,IAAI,CAAC;IAChC,IAAI,CAAC,IAAI,CAAC,QAAQ;QAAE,OAAO,EAAE,CAAC;IAC9B,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC9C,CAAC;AAED,2DAA2D;AAC3D,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;IAC9B,SAAS;IACT,SAAS;IACT,OAAO;IACP,YAAY;IACZ,SAAS;IACT,IAAI;IACJ,KAAK;IACL,GAAG;IACH,QAAQ;IACR,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;CACJ,CAAC,CAAC;AAEH;;;GAGG;AACH,SAAS,SAAS,CAAC,IAAY,EAAE,WAAqB,EAAE,SAAiB;IACxE,IAAI,CAAC,IAAI;QAAE,OAAO,CAAC,CAAC;IACpB,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IACjC,IAAI,KAAK,GAAG,KAAK,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACvD,KAAK,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;QAC9B,IAAI,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC;YAAE,KAAK,IAAI,QAAQ,CAAC;IAC3C,CAAC;IACD,OAAO,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAC3B,CAAC;AAED,4DAA4D;AAC5D,SAAS,YAAY,CAAC,IAAY,EAAE,SAAiB,EAAE,WAAqB,EAAE,MAAM,GAAG,GAAG;IACzF,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IACjC,IAAI,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IACnC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;QAChB,KAAK,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC9B,MAAM,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC;gBACd,GAAG,GAAG,CAAC,CAAC;gBACR,MAAM;YACP,CAAC;QACF,CAAC;IACF,CAAC;IACD,IAAI,GAAG,KAAK,CAAC,CAAC;QAAE,GAAG,GAAG,CAAC,CAAC;IACxB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC;IACxC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IAChF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/D,OAAO,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;AACtE,CAAC;AASD;;;;;;;;;;GAUG;AACH,MAAM,UAAU,SAAS,CAAC,IAAa,EAAE,KAAa,EAAE,OAAyB,EAAE;IAClF,MAAM,EAAE,IAAI,GAAG,EAAE,EAAE,aAAa,GAAG,GAAG,EAAE,GAAG,IAAI,CAAC;IAChD,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAE7B,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC7C,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAEvE,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAc,EAAE,CAAC;IAE3B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,wEAAwE;QACxE,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC;YAAE,SAAS;QAC5C,IAAI,IAAI,CAAC,IAAI,KAAK,SAAS;YAAE,SAAS,CAAC,2CAA2C;QAElF,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;QAC5B,MAAM,KAAK,GAAG,SAAS,CAAC,IAAI,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC;QACtD,IAAI,KAAK,KAAK,CAAC;YAAE,SAAS;QAE1B,IAAI,CAAC,IAAI,CAAC;YACT,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,KAAK;YACL,IAAI;YACJ,OAAO,EAAE,YAAY,CAAC,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,aAAa,CAAC;SAClE,CAAC,CAAC;IACJ,CAAC;IAED,6EAA6E;IAC7E,iFAAiF;IACjF,MAAM,OAAO,GAAG,IAAI;SAClB,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC;SACjC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE;QACvB,sEAAsE;QACtE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,KAAK,CAAC,KAAK,IAAI,GAAG,CAAC,KAAK,CAAC,CAAC;IAC5G,CAAC,CAAC,CAAC;IAEJ,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;AAC/B,CAAC","sourcesContent":["import { parseHTML } from \"linkedom\";\nimport type { DOMNode, TreeHit } from \"./types.js\";\n\n// ---------------------------------------------------------------------------\n// Semantic tag sets\n// ---------------------------------------------------------------------------\n\n/**\n * Tags that are kept as-is in the simplified tree.\n * Everything else is either collapsed (single-child wrappers) or stripped.\n */\nconst BLOCK_TAGS = new Set([\n\t\"article\",\n\t\"section\",\n\t\"main\",\n\t\"aside\",\n\t\"h1\",\n\t\"h2\",\n\t\"h3\",\n\t\"h4\",\n\t\"h5\",\n\t\"h6\",\n\t\"p\",\n\t\"blockquote\",\n\t\"pre\",\n\t\"figure\",\n\t\"figcaption\",\n\t\"ul\",\n\t\"ol\",\n\t\"li\",\n\t\"table\",\n\t\"thead\",\n\t\"tbody\",\n\t\"tfoot\",\n\t\"tr\",\n\t\"th\",\n\t\"td\",\n\t\"details\",\n\t\"summary\",\n]);\n\nconst INLINE_TAGS = new Set([\"a\", \"code\", \"strong\", \"em\", \"abbr\", \"time\", \"mark\", \"s\", \"del\", \"ins\"]);\n\nconst SEMANTIC_TAGS = new Set([...BLOCK_TAGS, ...INLINE_TAGS]);\n\n/** Tags whose subtrees should be flattened to a single text node. */\nconst LEAF_CONTAINERS = new Set([\"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\", \"p\", \"li\", \"td\", \"th\", \"figcaption\", \"summary\"]);\n\n/** Tags where we want to preserve full child structure. */\nconst BRANCH_CONTAINERS = new Set([\n\t\"pre\",\n\t\"ul\",\n\t\"ol\",\n\t\"table\",\n\t\"thead\",\n\t\"tbody\",\n\t\"tfoot\",\n\t\"tr\",\n\t\"section\",\n\t\"article\",\n\t\"aside\",\n\t\"blockquote\",\n\t\"details\",\n\t\"figure\",\n]);\n\n// ---------------------------------------------------------------------------\n// Tree building\n// ---------------------------------------------------------------------------\n\n/**\n * Extract the language from a <code> or <pre> element's class attribute.\n * Turndown and most syntax highlighters use class=\"language-typescript\" etc.\n */\nfunction extractLang(el: Element): string | undefined {\n\tconst cls = el.getAttribute(\"class\") ?? \"\";\n\tconst m = /language-([a-zA-Z0-9_+-]+)/.exec(cls);\n\treturn m ? m[1] : undefined;\n}\n\n/** Flatten all descendant text content into one trimmed string. */\nfunction flattenText(el: Element): string {\n\treturn (el.textContent ?? \"\").replace(/\\s+/g, \" \").trim();\n}\n\n/**\n * Recursively build a DOMNode from an Element.\n * Returns null if the element has no meaningful content.\n */\nfunction buildNode(el: Element, pathPrefix: string, siblingIndex: Map<string, number>): DOMNode | null {\n\tconst tag = el.tagName.toLowerCase();\n\n\t// Compute path segment\n\tconst count = siblingIndex.get(tag) ?? 0;\n\tsiblingIndex.set(tag, count + 1);\n\tconst segment = count === 0 ? tag : `${tag}[${count}]`;\n\tconst path = pathPrefix ? `${pathPrefix}.${segment}` : segment;\n\n\t// Attrs\n\tconst attrs: Record<string, string> = {};\n\tif (tag === \"a\") {\n\t\tconst href = el.getAttribute(\"href\");\n\t\tif (href && !href.startsWith(\"javascript:\")) attrs.href = href;\n\t}\n\tif (tag === \"code\" || tag === \"pre\") {\n\t\tconst lang = extractLang(el) ?? extractLang(el.querySelector(\"code\") ?? el);\n\t\tif (lang) attrs.lang = lang;\n\t}\n\tif (tag === \"abbr\") {\n\t\tconst title = el.getAttribute(\"title\");\n\t\tif (title) attrs.title = title;\n\t}\n\tif (tag === \"time\") {\n\t\tconst dt = el.getAttribute(\"datetime\");\n\t\tif (dt) attrs.datetime = dt;\n\t}\n\n\t// Leaf containers — flatten to text\n\tif (LEAF_CONTAINERS.has(tag)) {\n\t\tconst text = flattenText(el);\n\t\tif (!text) return null;\n\t\treturn {\n\t\t\ttag,\n\t\t\tpath,\n\t\t\ttext,\n\t\t\t...(Object.keys(attrs).length > 0 ? { attrs } : {}),\n\t\t};\n\t}\n\n\t// pre — treat the entire block (including nested <code>) as one leaf\n\tif (tag === \"pre\") {\n\t\tconst text = (el.textContent ?? \"\").trimEnd();\n\t\tif (!text.trim()) return null;\n\t\treturn {\n\t\t\ttag,\n\t\t\tpath,\n\t\t\ttext,\n\t\t\t...(Object.keys(attrs).length > 0 ? { attrs } : {}),\n\t\t};\n\t}\n\n\t// Branch containers — recurse into children\n\tconst children: DOMNode[] = [];\n\tconst childIndex = new Map<string, number>();\n\n\tfor (const child of Array.from(el.children)) {\n\t\tconst childTag = child.tagName.toLowerCase();\n\n\t\tif (SEMANTIC_TAGS.has(childTag)) {\n\t\t\tconst node = buildNode(child, path, childIndex);\n\t\t\tif (node) children.push(node);\n\t\t} else {\n\t\t\t// Non-semantic wrapper: collapse by recursing with the same path/index\n\t\t\tconst collapsed = collapseWrapper(child, path, childIndex);\n\t\t\tchildren.push(...collapsed);\n\t\t}\n\t}\n\n\tif (children.length === 0) {\n\t\t// Branch with no semantic children — try as text leaf\n\t\tconst text = flattenText(el);\n\t\tif (!text) return null;\n\t\treturn { tag, path, text, ...(Object.keys(attrs).length > 0 ? { attrs } : {}) };\n\t}\n\n\t// Collapse single-child branches with the same tag family\n\tif (children.length === 1 && !BRANCH_CONTAINERS.has(tag)) {\n\t\t// Promote the child up, but keep the parent path\n\t\treturn children[0];\n\t}\n\n\treturn {\n\t\ttag,\n\t\tpath,\n\t\tchildren,\n\t\t...(Object.keys(attrs).length > 0 ? { attrs } : {}),\n\t};\n}\n\n/**\n * Collapse a non-semantic wrapper element, returning its semantic descendants.\n */\nfunction collapseWrapper(el: Element, pathPrefix: string, siblingIndex: Map<string, number>): DOMNode[] {\n\tconst results: DOMNode[] = [];\n\tfor (const child of Array.from(el.children)) {\n\t\tconst childTag = child.tagName.toLowerCase();\n\t\tif (SEMANTIC_TAGS.has(childTag)) {\n\t\t\tconst node = buildNode(child, pathPrefix, siblingIndex);\n\t\t\tif (node) results.push(node);\n\t\t} else {\n\t\t\tresults.push(...collapseWrapper(child, pathPrefix, siblingIndex));\n\t\t}\n\t}\n\t// If no semantic children found, treat wrapper text as a paragraph\n\tif (results.length === 0) {\n\t\tconst text = flattenText(el);\n\t\tif (text.length > 20) {\n\t\t\tconst count = siblingIndex.get(\"p\") ?? 0;\n\t\t\tsiblingIndex.set(\"p\", count + 1);\n\t\t\tconst segment = count === 0 ? \"p\" : `p[${count}]`;\n\t\t\tresults.push({ tag: \"p\", path: `${pathPrefix}.${segment}`, text });\n\t\t}\n\t}\n\treturn results;\n}\n\n/**\n * Build a simplified semantic DOM tree from Readability article HTML.\n *\n * The root is always an \"article\" node. Presentational wrappers are collapsed,\n * single-child chains are simplified, and only semantic tags survive.\n */\nexport function buildTree(articleHtml: string, baseUrl: string): DOMNode {\n\tconst { document } = parseHTML(`<html><body>${articleHtml}</body></html>`, { url: baseUrl });\n\tconst body = document.body;\n\n\tconst children: DOMNode[] = [];\n\tconst siblingIndex = new Map<string, number>();\n\n\tfor (const child of Array.from(body.children)) {\n\t\tconst childTag = child.tagName.toLowerCase();\n\t\tif (SEMANTIC_TAGS.has(childTag)) {\n\t\t\tconst node = buildNode(child, \"article\", siblingIndex);\n\t\t\tif (node) children.push(node);\n\t\t} else {\n\t\t\tconst collapsed = collapseWrapper(child, \"article\", siblingIndex);\n\t\t\tchildren.push(...collapsed);\n\t\t}\n\t}\n\n\treturn { tag: \"article\", path: \"article\", children };\n}\n\n// ---------------------------------------------------------------------------\n// Tree navigation\n// ---------------------------------------------------------------------------\n\n/** Collect all nodes in the tree as a flat list (depth-first). */\nfunction allNodes(node: DOMNode): DOMNode[] {\n\tconst result: DOMNode[] = [node];\n\tif (node.children) {\n\t\tfor (const child of node.children) result.push(...allNodes(child));\n\t}\n\treturn result;\n}\n\n/**\n * Navigate to a specific node by its dot-bracket path.\n * Returns null if the path does not exist in the tree.\n *\n * @example navigateTree(tree, \"article.section[1].pre[0]\")\n */\nexport function navigateTree(root: DOMNode, path: string): DOMNode | null {\n\tconst nodes = allNodes(root);\n\treturn nodes.find((n) => n.path === path) ?? null;\n}\n\n// ---------------------------------------------------------------------------\n// Tree fuzzy search\n// ---------------------------------------------------------------------------\n\n/** Extract all text content from a node recursively. */\nfunction nodeText(node: DOMNode): string {\n\tif (node.text) return node.text;\n\tif (!node.children) return \"\";\n\treturn node.children.map(nodeText).join(\" \");\n}\n\n/** Semantic \"block\" tags that make good hit containers. */\nconst HIT_CONTAINERS = new Set([\n\t\"section\",\n\t\"article\",\n\t\"aside\",\n\t\"blockquote\",\n\t\"details\",\n\t\"li\",\n\t\"pre\",\n\t\"p\",\n\t\"figure\",\n\t\"h1\",\n\t\"h2\",\n\t\"h3\",\n\t\"h4\",\n\t\"h5\",\n\t\"h6\",\n\t\"tr\",\n]);\n\n/**\n * Score text against a query using token overlap + exact phrase bonus.\n * Returns 0–1.\n */\nfunction scoreText(text: string, queryTokens: string[], fullQuery: string): number {\n\tif (!text) return 0;\n\tconst lower = text.toLowerCase();\n\tlet score = lower.includes(fullQuery) ? 0.6 : 0;\n\tconst perToken = 0.4 / Math.max(queryTokens.length, 1);\n\tfor (const qt of queryTokens) {\n\t\tif (lower.includes(qt)) score += perToken;\n\t}\n\treturn Math.min(score, 1);\n}\n\n/** Build a short snippet around the best match position. */\nfunction buildSnippet(text: string, fullQuery: string, queryTokens: string[], radius = 100): string {\n\tconst lower = text.toLowerCase();\n\tlet pos = lower.indexOf(fullQuery);\n\tif (pos === -1) {\n\t\tfor (const qt of queryTokens) {\n\t\t\tconst p = lower.indexOf(qt);\n\t\t\tif (p !== -1) {\n\t\t\t\tpos = p;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n\tif (pos === -1) pos = 0;\n\tconst start = Math.max(0, pos - radius);\n\tconst end = Math.min(text.length, pos + Math.max(fullQuery.length, 1) + radius);\n\tconst raw = text.slice(start, end).replace(/\\s+/g, \" \").trim();\n\treturn (start > 0 ? \"…\" : \"\") + raw + (end < text.length ? \"…\" : \"\");\n}\n\nexport interface QueryTreeOptions {\n\t/** Max hits to return (default 10). */\n\ttopN?: number;\n\t/** Context chars around match in snippet (default 100). */\n\tsnippetRadius?: number;\n}\n\n/**\n * Fuzzy-search a DOM tree for a query string.\n *\n * Returns hits ranked by score. Each hit is the nearest semantic ancestor\n * that contains the match (a section, li, pre, p — not a raw div). This\n * means code blocks and table rows are always returned whole.\n *\n * @example\n * const hits = queryTree(tree, \"declaration merge\", { topN: 3 })\n * // hits[0].node is the full <section> containing that heading\n */\nexport function queryTree(root: DOMNode, query: string, opts: QueryTreeOptions = {}): TreeHit[] {\n\tconst { topN = 10, snippetRadius = 100 } = opts;\n\tif (!query.trim()) return [];\n\n\tconst fullQuery = query.trim().toLowerCase();\n\tconst queryTokens = fullQuery.split(/\\s+/).filter((t) => t.length > 1);\n\n\tconst nodes = allNodes(root);\n\tconst hits: TreeHit[] = [];\n\n\tfor (const node of nodes) {\n\t\t// Only return hit containers — not intermediate wrappers, not the root.\n\t\tif (!HIT_CONTAINERS.has(node.tag)) continue;\n\t\tif (node.path === \"article\") continue; // root always matches everything — skip it\n\n\t\tconst text = nodeText(node);\n\t\tconst score = scoreText(text, queryTokens, fullQuery);\n\t\tif (score === 0) continue;\n\n\t\thits.push({\n\t\t\tpath: node.path,\n\t\t\tscore,\n\t\t\tnode,\n\t\t\tsnippet: buildSnippet(text, fullQuery, queryTokens, snippetRadius),\n\t\t});\n\t}\n\n\t// Deduplicate: if a parent and child both match, keep only the more specific\n\t// (higher-scoring) one. If scores are equal, prefer the ancestor (more context).\n\tconst deduped = hits\n\t\t.sort((a, b) => b.score - a.score)\n\t\t.filter((hit, i, arr) => {\n\t\t\t// Remove this hit if a better-scoring ancestor is already in the list\n\t\t\treturn !arr.slice(0, i).some((other) => hit.path.startsWith(`${other.path}.`) && other.score >= hit.score);\n\t\t});\n\n\treturn deduped.slice(0, topN);\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"","sourcesContent":["/** Selects how much content spider() returns. */\nexport type PageView = \"lean\" | \"full\" | \"tree\";\n\n// ---------------------------------------------------------------------------\n// DOM tree types\n// ---------------------------------------------------------------------------\n\n/**\n * A single node in the simplified DOM tree.\n *\n * The tree is built from the Readability article HTML with all presentational\n * wrapper elements collapsed. Only semantically meaningful tags survive.\n * Single-child chains (div > div > p) are reduced to the leaf (p).\n *\n * Paths use bracket notation for siblings of the same tag:\n * \"article.section[1].pre[0].code\"\n *\n * Agents can:\n * - Read the tree to understand page structure without fetching full markdown.\n * - Call navigateTree(tree, path) to extract one exact node.\n * - Call queryTree(tree, query) to fuzzy-search and get matching subtrees.\n */\nexport interface DOMNode {\n\t/** HTML tag name, lower-cased. */\n\ttag: string;\n\t/** Stable dot-bracket path from the tree root, e.g. \"article.section[1].pre[0].code\". */\n\tpath: string;\n\t/**\n\t * Text content of this node.\n\t * For leaf nodes: the raw text. For branch nodes: concatenated descendant text.\n\t * Omitted when the node has children to avoid duplication.\n\t */\n\ttext?: string;\n\t/**\n\t * Semantically useful attributes only.\n\t * a → href, code → lang (from class=\"language-*\"), abbr → title.\n\t */\n\tattrs?: Record<string, string>;\n\t/** Child nodes. Present on branch nodes, absent on leaves. */\n\tchildren?: DOMNode[];\n}\n\n/** A hit returned by queryTree — a matching subtree with score and context. */\nexport interface TreeHit {\n\t/** Dot-bracket path of the matching node. */\n\tpath: string;\n\t/** Score 0–1. Higher is a better match. */\n\tscore: number;\n\t/** The matching node (may be a branch — e.g. a whole section). */\n\tnode: DOMNode;\n\t/** Short context around the best match, ≤ 200 chars. */\n\tsnippet: string;\n}\n\n/** Dominant content type of a chunk — detected from the markdown buffer. */\nexport type ChunkType = \"text\" | \"code\" | \"table\" | \"list\" | \"blockquote\";\n\n/** One embeddable, self-contained segment of a page. The unit of RAG. */\nexport interface Chunk {\n\t/** Stable reference: \"<url>#chunk-<index>\" */\n\tid: string;\n\tindex: number;\n\t/** Nearest ancestor heading, empty string if none */\n\theading: string;\n\t/** Clean Markdown text */\n\ttext: string;\n\twordCount: number;\n\t/** Dominant content type — lets agents skip code/table chunks when summarising. */\n\tcontentType: ChunkType;\n}\n\n/**\n * A single image scraped from a page.\n *\n * Storage contract:\n * - base64 is populated when the image is small enough to store inline.\n * - filePath is populated when the image has been spilled to disk.\n * - At least one of base64 or filePath is present on a hydrated ImageRef.\n *\n * LLM wire format (works with OpenAI, Anthropic, Together, Gemini):\n * `data:${mimeType};base64,${base64}`\n */\nexport interface ImageRef {\n\t/** Original absolute src URL of the image. */\n\tsrc: string;\n\t/** Base64-encoded image bytes. Omitted when the image is stored on disk. */\n\tbase64?: string;\n\t/** MIME type detected from Content-Type or src extension, e.g. \"image/jpeg\". */\n\tmimeType: string;\n\t/** Alt text from the <img> tag, empty string when absent. */\n\talt: string;\n\t/** Path to the binary file when the image has been persisted to disk. */\n\tfilePath?: string;\n}\n\n/** An outbound link — one edge in the knowledge graph. */\nexport interface Link {\n\thref: string;\n\ttext: string;\n\tisExternal: boolean;\n\t/**\n\t * Where in the page the link was found.\n\t * \"body\" — inside the article content (strongest signal).\n\t * \"nav\" — inside nav, header, footer, or aside (navigation chrome).\n\t */\n\trel: \"body\" | \"nav\";\n}\n\n/**\n * Minimal link for lean views — isExternal omitted (inferable from the URL).\n * Saves tokens when pages carry hundreds of links.\n */\nexport interface LeanLink {\n\thref: string;\n\ttext: string;\n}\n\n/**\n * Compact page view — identity, metadata, and structural outline only.\n * No chunk text, no markdown body. Use when deciding whether/where to dig\n * deeper. Roughly 5–20× fewer tokens than a full SpideredPage.\n *\n * Headings are flat markdown strings (\"## Section\") rather than objects —\n * same information, ~half the tokens.\n */\nexport interface LeanPage {\n\treadonly view: \"lean\";\n\n\t// --- identity ---\n\turl: string;\n\tdomain: string;\n\t/** Canonical URL when it differs from the fetched URL (og:url / link[rel=canonical]). */\n\tcanonicalUrl?: string;\n\n\t// --- metadata ---\n\ttitle: string;\n\tdescription?: string;\n\tauthor?: string;\n\tpublishedAt?: string;\n\tlang: string;\n\t/** Extracted topic tags — from meta keywords and article:tag. Compact vocabulary for grouping. */\n\ttags: string[];\n\n\t// --- content signals ---\n\twordCount: number;\n\treadingTimeMinutes: number;\n\t/** How many RAG chunks a full view would produce. */\n\tchunkCount: number;\n\n\t// --- structural outline ---\n\t/** Heading outline as flat markdown strings, e.g. \"## Section Name\". */\n\theadings: string[];\n\n\t// --- graph edges ---\n\t/** Outbound links — href + anchor text only. */\n\tlinks: LeanLink[];\n\n\t/** True when the page appears JS-rendered — metadata may be partial. */\n\tjsRendered?: boolean;\n\t/**\n\t * Number of other spidered pages that link to this page.\n\t * Populated when a PageGraph is passed to toLean(). Omitted otherwise.\n\t * Higher = more authoritative within the crawled corpus.\n\t */\n\tinboundCount?: number;\n}\n\n// toLean() moved to views.ts. Import from there or from the package root.\n\n/**\n * A fully spidered page.\n *\n * Follows the Local Materialized View rule: every field is a named,\n * independently readable value — never a serialized blob. Agents read\n * individual fields; RAG embeds individual chunks; graph walkers follow\n * individual links.\n */\nexport interface SpideredPage {\n\t// --- identity ---\n\turl: string;\n\tdomain: string;\n\tfetchedAt: string; // ISO-8601\n\t/** Canonical URL when it differs from the fetched URL (og:url / link[rel=canonical]). */\n\tcanonicalUrl?: string;\n\n\t// --- metadata (readable at a glance) ---\n\ttitle: string;\n\tdescription: string;\n\tauthor: string;\n\tpublishedAt: string;\n\tlang: string;\n\t/** Extracted topic tags — from meta keywords and article:tag. */\n\ttags: string[];\n\n\t// --- content signals ---\n\twordCount: number;\n\treadingTimeMinutes: number;\n\n\t// --- structured content ---\n\t/** Heading outline — h1/h2/h3 only */\n\theadings: Array<{ level: 1 | 2 | 3; text: string }>;\n\t/** RAG-ready chunks */\n\tchunks: Chunk[];\n\n\t// --- graph edges ---\n\t/** Outbound links from this page */\n\tlinks: Link[];\n\n\t// --- images (opt-in, requires captureImages: true) ---\n\t/**\n\t * Images scraped from the article content.\n\t * Only populated when spider() is called with captureImages: true.\n\t */\n\timages?: ImageRef[];\n\n\t// --- full body (fallback / debug) ---\n\tmarkdown: string;\n\n\t/**\n\t * True when the page appears to be JavaScript-rendered (Readability\n\t * found no content). metadata and links are still populated where\n\t * possible; chunks and markdown are empty.\n\t */\n\tjsRendered?: boolean;\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"views.js","sourceRoot":"","sources":["../src/views.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH;;;;;;;GAOG;AACH,MAAM,UAAU,MAAM,CAAC,IAAkB,EAAE,KAAiB;IAC3D,OAAO;QACN,IAAI,EAAE,MAAM;QACZ,GAAG,EAAE,IAAI,CAAC,GAAG;QACb,MAAM,EAAE,IAAI,CAAC,MAAM;QACnB,GAAG,CAAC,IAAI,CAAC,YAAY,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,IAAI,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC/E,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,GAAG,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC9D,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC/C,GAAG,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC9D,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,SAAS,EAAE,IAAI,CAAC,SAAS;QACzB,kBAAkB,EAAE,IAAI,CAAC,kBAAkB;QAC3C,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM;QAC9B,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QACtE,KAAK,EAAE,IAAI,CAAC,KAAK;aACf,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,KAAK,MAAM,CAAC;aAC/B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;aACZ,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC9C,GAAG,CAAC,KAAK,KAAK,SAAS;YACtB,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE;YAClD,CAAC,CAAC,EAAE,CAAC;KACN,CAAC;AACH,CAAC","sourcesContent":["/**\n * View transformations — business logic that converts a SpideredPage into\n * one of the available view shapes. Separated from types.ts which is pure\n * data-shape definitions.\n */\n\nimport type { PageGraph } from \"./graph.js\";\nimport type { LeanPage, SpideredPage } from \"./types.js\";\n\n/**\n * Downgrade a full SpideredPage to a LeanPage.\n *\n * Pass a PageGraph as the second argument to populate `inboundCount` —\n * the number of other spidered pages that link to this one. Agents can\n * use this as a lightweight authority signal when ranking results from\n * a crawl without running a full PageRank pass.\n */\nexport function toLean(page: SpideredPage, graph?: PageGraph): LeanPage {\n\treturn {\n\t\tview: \"lean\",\n\t\turl: page.url,\n\t\tdomain: page.domain,\n\t\t...(page.canonicalUrl !== undefined ? { canonicalUrl: page.canonicalUrl } : {}),\n\t\ttitle: page.title,\n\t\t...(page.description ? { description: page.description } : {}),\n\t\t...(page.author ? { author: page.author } : {}),\n\t\t...(page.publishedAt ? { publishedAt: page.publishedAt } : {}),\n\t\tlang: page.lang,\n\t\ttags: page.tags,\n\t\twordCount: page.wordCount,\n\t\treadingTimeMinutes: page.readingTimeMinutes,\n\t\tchunkCount: page.chunks.length,\n\t\theadings: page.headings.map((h) => `${\"#\".repeat(h.level)} ${h.text}`),\n\t\tlinks: page.links\n\t\t\t.filter((l) => l.rel === \"body\")\n\t\t\t.slice(0, 10)\n\t\t\t.map((l) => ({ href: l.href, text: l.text })),\n\t\t...(graph !== undefined\n\t\t\t? { inboundCount: graph.inbound(page.url).length }\n\t\t\t: {}),\n\t};\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"web-search.js","sourceRoot":"","sources":["../src/web-search.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAmDH;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,KAAa,EAAE,OAAyB,EAAE;IACzE,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;IACzD,IAAI,CAAC,MAAM;QAAE,MAAM,IAAI,KAAK,CAAC,4DAA4D,CAAC,CAAC;IAE3F,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,MAAM,CAAC,CAAC;IAC3D,IAAI,GAAa,CAAC;IAClB,IAAI,CAAC;QACJ,GAAG,GAAG,MAAM,KAAK,CAAC,2BAA2B,EAAE;YAC9C,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE;gBACR,cAAc,EAAE,kBAAkB;gBAClC,WAAW,EAAE,MAAM;aACnB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACpB,KAAK;gBACL,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,EAAE;gBACjC,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,MAAM;gBACzB,QAAQ,EAAE;oBACT,UAAU,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,gBAAgB,EAAE,CAAC,EAAE;iBACpD;aACD,CAAC;SACF,CAAC,CAAC;IACJ,CAAC;YAAS,CAAC;QACV,YAAY,CAAC,KAAK,CAAC,CAAC;IACrB,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,EAAE;QAAE,MAAM,IAAI,KAAK,CAAC,kBAAkB,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;IAE/E,MAAM,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAO7B,CAAC;IAEF,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACvC,GAAG,EAAE,CAAC,CAAC,GAAG;QACV,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,OAAO,EAAE,CAAC,CAAC,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE;QACxC,GAAG,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAC5D,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,KAAa,EAAE,OAA2B,EAAE;IAC7E,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAC;IAClE,IAAI,CAAC,MAAM;QAAE,MAAM,IAAI,KAAK,CAAC,8EAA8E,CAAC,CAAC;IAE7G,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC;QAClC,CAAC,EAAE,KAAK;QACR,KAAK,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;KAClD,CAAC,CAAC;IACH,IAAI,IAAI,CAAC,OAAO;QAAE,MAAM,CAAC,GAAG,CAAC,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;IACtD,IAAI,IAAI,CAAC,SAAS;QAAE,MAAM,CAAC,GAAG,CAAC,WAAW,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;IAE5D,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,MAAM,CAAC,CAAC;IAC3D,IAAI,GAAa,CAAC;IAClB,IAAI,CAAC;QACJ,GAAG,GAAG,MAAM,KAAK,CAAC,kDAAkD,MAAM,EAAE,EAAE;YAC7E,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE;gBACR,MAAM,EAAE,kBAAkB;gBAC1B,iBAAiB,EAAE,MAAM;gBACzB,sBAAsB,EAAE,MAAM;aAC9B;SACD,CAAC,CAAC;IACJ,CAAC;YAAS,CAAC;QACV,YAAY,CAAC,KAAK,CAAC,CAAC;IACrB,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,EAAE;QAAE,MAAM,IAAI,KAAK,CAAC,2BAA2B,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;IAExF,MAAM,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAS7B,CAAC;IAEF,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5C,GAAG,EAAE,CAAC,CAAC,GAAG;QACV,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,OAAO,EAAE,CAAC,CAAC,WAAW,IAAI,EAAE;QAC5B,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACxC,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,KAAa,EAAE,OAA4B,EAAE;IAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IAC5D,IAAI,CAAC,MAAM;QAAE,MAAM,IAAI,KAAK,CAAC,kEAAkE,CAAC,CAAC;IAEjG,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,MAAM,CAAC,CAAC;IAC3D,IAAI,GAAa,CAAC;IAClB,IAAI,CAAC;QACJ,GAAG,GAAG,MAAM,KAAK,CAAC,+BAA+B,EAAE;YAClD,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACpB,KAAK;gBACL,OAAO,EAAE,MAAM;gBACf,WAAW,EAAE,IAAI,CAAC,UAAU,IAAI,CAAC;gBACjC,YAAY,EAAE,IAAI,CAAC,KAAK,IAAI,OAAO;gBACnC,mBAAmB,EAAE,KAAK;gBAC1B,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACzD,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC5C,CAAC;SACF,CAAC,CAAC;IACJ,CAAC;YAAS,CAAC;QACV,YAAY,CAAC,KAAK,CAAC,CAAC;IACrB,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,EAAE;QAAE,MAAM,IAAI,KAAK,CAAC,qBAAqB,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;IAElF,MAAM,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAO7B,CAAC;IAEF,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACvC,GAAG,EAAE,CAAC,CAAC,GAAG;QACV,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,OAAO,EAAE,CAAC,CAAC,OAAO,IAAI,EAAE;QACxB,GAAG,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAC9D,CAAC,CAAC,CAAC;AACL,CAAC;AAcD;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,KAAa,EAAE,OAAyB,EAAE;IACzE,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC;QAClC,CAAC,EAAE,KAAK;QACR,MAAM,EAAE,MAAM;QACd,WAAW,EAAE,GAAG;QAChB,OAAO,EAAE,GAAG;QACZ,aAAa,EAAE,GAAG;KAClB,CAAC,CAAC;IAEH,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,MAAM,CAAC,CAAC;IAC3D,IAAI,GAAa,CAAC;IAClB,IAAI,CAAC;QACJ,GAAG,GAAG,MAAM,KAAK,CAAC,+BAA+B,MAAM,EAAE,EAAE;YAC1D,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE;gBACR,MAAM,EAAE,kBAAkB;gBAC1B,6DAA6D;gBAC7D,4DAA4D;gBAC5D,YAAY,EAAE,gBAAgB;aAC9B;SACD,CAAC,CAAC;IACJ,CAAC;YAAS,CAAC;QACV,YAAY,CAAC,KAAK,CAAC,CAAC;IACrB,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,EAAE;QAAE,MAAM,IAAI,KAAK,CAAC,kBAAkB,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;IAE/E,MAAM,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAW7B,CAAC;IAEF,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,IAAI,EAAE,CAAC;IAEpC,+DAA+D;IAC/D,IAAI,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC;YACZ,GAAG,EAAE,IAAI,CAAC,WAAW;YACrB,KAAK,EAAE,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,IAAI,YAAY;YAC1D,OAAO,EAAE,IAAI,CAAC,QAAQ;SACtB,CAAC,CAAC;IACJ,CAAC;IAED,iDAAiD;IACjD,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QACpC,IAAI,OAAO,CAAC,MAAM,IAAI,KAAK;YAAE,MAAM;QACnC,IAAI,CAAC,CAAC,QAAQ;YAAE,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IACnF,CAAC;IAED,mDAAmD;IACnD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,aAAa,IAAI,EAAE,EAAE,CAAC;QAC9C,IAAI,OAAO,CAAC,MAAM,IAAI,KAAK;YAAE,MAAM;QACnC,IAAI,KAAK,CAAC,QAAQ,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAClC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,KAAK,CAAC,QAAQ,EAAE,KAAK,EAAE,KAAK,CAAC,IAAI,EAAE,OAAO,EAAE,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/E,CAAC;QACD,KAAK,MAAM,GAAG,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YACtC,IAAI,OAAO,CAAC,MAAM,IAAI,KAAK;gBAAE,MAAM;YACnC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,GAAG,CAAC,QAAQ,EAAE,KAAK,EAAE,GAAG,CAAC,IAAI,EAAE,OAAO,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACzE,CAAC;IACF,CAAC;IAED,OAAO,OAAO,CAAC;AAChB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC9B,KAAa,EACb,OAKI,EAAE;IAEN,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM;QACzB,CAAC,CAAC,mBAAmB,CAAC,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;QAC7E,CAAC,CAAC,mBAAmB,EAAE,CAAC;IACzB,OAAO,MAAM,CAAC,MAAM,CAAC;QACpB,KAAK;QACL,UAAU,EAAE,IAAI,CAAC,UAAU;QAC3B,SAAS,EAAE,IAAI,CAAC,SAAS;QACzB,KAAK,EAAE,IAAI,CAAC,KAAK;KACjB,CAAC,CAAC;AACJ,CAAC;AAYD,sEAAsE;AACtE,MAAM,eAAe,GAAG,IAAI,GAAG,EAAyB,CAAC;AAEzD;;;;;;GAMG;AACH,MAAM,UAAU,oBAAoB,CAAC,IAAY,EAAE,OAAsB;IACxE,eAAe,CAAC,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;AACpC,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAAY,EAAE,GAAwB;IACzE,MAAM,OAAO,GAAG,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC1C,IAAI,CAAC,OAAO;QAAE,MAAM,IAAI,KAAK,CAAC,2BAA2B,IAAI,6CAA6C,CAAC,CAAC;IAC5G,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC;AACrB,CAAC;AAED,qFAAqF;AACrF,SAAS,eAAe,CAAC,IAAY;IACpC,MAAM,OAAO,GAA2B;QACvC,KAAK,EAAE,sBAAsB;QAC7B,MAAM,EAAE,gBAAgB;QACxB,GAAG,EAAE,aAAa;KAClB,CAAC;IACF,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;AAC5B,CAAC;AAED,2CAA2C;AAC3C,6EAA6E;AAC7E,oBAAoB,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;IACrC,IAAI,CAAC,GAAG;QAAE,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAC1D,OAAO,IAAI,iBAAiB,CAAC,GAAG,CAAC,CAAC;AACnC,CAAC,CAAC,CAAC;AACH,oBAAoB,CAAC,QAAQ,EAAE,CAAC,GAAG,EAAE,EAAE;IACtC,IAAI,CAAC,GAAG;QAAE,MAAM,IAAI,KAAK,CAAC,wBAAwB,CAAC,CAAC;IACpD,OAAO,IAAI,kBAAkB,CAAC,GAAG,CAAC,CAAC;AACpC,CAAC,CAAC,CAAC;AACH,oBAAoB,CAAC,KAAK,EAAE,CAAC,GAAG,EAAE,EAAE;IACnC,IAAI,CAAC,GAAG;QAAE,MAAM,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACjD,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;AACjC,CAAC,CAAC,CAAC;AACH,oBAAoB,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,IAAI,eAAe,EAAE,CAAC,CAAC;AAEzD,8EAA8E;AAC9E,gEAAgE;AAChE,8EAA8E;AAE9E,0EAA0E;AAC1E,MAAM,eAAe,GAA8C;IAClE,GAAG,EAAE,IAAI;IACT,IAAI,EAAE,IAAI;IACV,KAAK,EAAE,IAAI;IACX,IAAI,EAAE,IAAI;CACV,CAAC;AAEF,uDAAuD;AACvD,MAAM,OAAO,iBAAiB;IAC7B,YAA6B,MAAc,EAAmB,OAAgB;QAAjD,WAAM,GAAN,MAAM,CAAQ;QAAmB,YAAO,GAAP,OAAO,CAAS;IAAG,CAAC;IAElF,MAAM,CAAC,GAAgB;QACtB,MAAM,SAAS,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAC7E,OAAO,WAAW,CAAC,GAAG,CAAC,KAAK,EAAE;YAC7B,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,UAAU,EAAE,GAAG,CAAC,UAAU;YAC1B,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,SAAS;SACT,CAAC,CAAC;IACJ,CAAC;CACD;AAED,iDAAiD;AACjD,MAAM,OAAO,kBAAkB;IAC9B,YAA6B,MAAc;QAAd,WAAM,GAAN,MAAM,CAAQ;IAAG,CAAC;IAE/C,MAAM,CAAC,GAAgB;QACtB,OAAO,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE;YAC9B,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,UAAU,EAAE,GAAG,CAAC,UAAU;YAC1B,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,KAAK,EAAE,GAAG,CAAC,KAAK;SAChB,CAAC,CAAC;IACJ,CAAC;CACD;AAED,8CAA8C;AAC9C,MAAM,OAAO,eAAe;IAC3B,YAA6B,MAAc;QAAd,WAAM,GAAN,MAAM,CAAQ;IAAG,CAAC;IAE/C,MAAM,CAAC,GAAgB;QACtB,OAAO,SAAS,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;IAClF,CAAC;CACD;AAED,+DAA+D;AAC/D,MAAM,OAAO,eAAe;IAC3B,MAAM,CAAC,GAAgB;QACtB,OAAO,SAAS,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,UAAU,EAAE,GAAG,CAAC,UAAU,EAAE,CAAC,CAAC;IAC7D,CAAC;CACD;AAmBD;;;;;;;;;;;;;GAaG;AACH,MAAM,OAAO,oBAAoB;IAIhC,YACkB,OAAwB,EACzC,OAAoC,EAAE;QADrB,YAAO,GAAP,OAAO,CAAiB;QAGzC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,MAAM,IAAI,KAAK,CAAC,mDAAmD,CAAC,CAAC;QAC/F,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,eAAe,IAAI,IAAI,CAAC;QACpD,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,eAAe,IAAI,IAAI,CAAC;IACrD,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,GAAgB;QAC5B,IAAI,SAAkB,CAAC;QAEvB,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACnC,IAAI,CAAC;gBACJ,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;gBACzC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,eAAe;oBAAE,OAAO,OAAO,CAAC;gBAChE,4CAA4C;YAC7C,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACd,IAAI,CAAC,IAAI,CAAC,eAAe;oBAAE,MAAM,GAAG,CAAC;gBACrC,SAAS,GAAG,GAAG,CAAC;gBAChB,4CAA4C;YAC7C,CAAC;QACF,CAAC;QAED,iEAAiE;QACjE,IAAI,SAAS;YAAE,MAAM,SAAS,CAAC;QAC/B,OAAO,EAAE,CAAC;IACX,CAAC;CACD;AAED,8EAA8E;AAC9E,sDAAsD;AACtD,8EAA8E;AAE9E;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB;IAClC,MAAM,OAAO,GAAoB,EAAE,CAAC;IAEpC,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAC;IAClD,IAAI,KAAK;QAAE,OAAO,CAAC,IAAI,CAAC,IAAI,iBAAiB,CAAC,KAAK,CAAC,CAAC,CAAC;IAEtD,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IAC7C,IAAI,MAAM;QAAE,OAAO,CAAC,IAAI,CAAC,IAAI,kBAAkB,CAAC,MAAM,CAAC,CAAC,CAAC;IAEzD,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;IACvC,IAAI,GAAG;QAAE,OAAO,CAAC,IAAI,CAAC,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC;IAEhD,mEAAmE;IACnE,OAAO,CAAC,IAAI,CAAC,IAAI,eAAe,EAAE,CAAC,CAAC;IAEpC,OAAO,IAAI,oBAAoB,CAAC,OAAO,CAAC,CAAC;AAC1C,CAAC","sourcesContent":["/**\n * Web search API integration — Brave Search and Tavily.\n *\n * Both return a normalised WebSearchResult[].\n * API keys are read from environment variables by default:\n * BRAVE_SEARCH_API_KEY\n * TAVILY_API_KEY\n */\n\n// WebSearchResult is defined in ports.ts (the abstraction layer).\n// web-search.ts is an adapter — it imports from the port, not the other way.\nexport type { WebSearchResult } from \"./ports.js\";\nimport type { ISearchEngine, SearchQuery, WebSearchResult } from \"./ports.js\";\n\nexport interface BraveSearchOptions {\n\t/** API key. Defaults to process.env.BRAVE_SEARCH_API_KEY. */\n\tapiKey?: string;\n\t/** Number of results (1–20). Default 10. */\n\tnumResults?: number;\n\t/** ISO 3166-1 alpha-2 country code for localised results, e.g. \"US\". */\n\tcountry?: string;\n\t/**\n\t * Freshness filter. Maps SearchQuery.timeRange to Brave's parameter:\n\t * \"pd\" = past day, \"pw\" = past week, \"pm\" = past month, \"py\" = past year.\n\t * Pass directly when bypassing the adapter, or set timeRange on SearchQuery.\n\t */\n\tfreshness?: \"pd\" | \"pw\" | \"pm\" | \"py\";\n}\n\nexport interface TavilySearchOptions {\n\t/** API key. Defaults to process.env.TAVILY_API_KEY. */\n\tapiKey?: string;\n\t/** Number of results. Default 5. */\n\tnumResults?: number;\n\t/** \"basic\" (1 credit) or \"advanced\" (2 credits). Default \"basic\". */\n\tdepth?: \"basic\" | \"advanced\";\n\t/** Restrict results to content published within this window. */\n\ttimeRange?: \"day\" | \"week\" | \"month\" | \"year\";\n\t/** Topic mode: \"news\" prioritises fresh news articles. */\n\ttopic?: \"news\" | \"general\";\n}\n\nexport type SearchEngine = \"brave\" | \"tavily\" | \"exa\" | \"ddg\";\n\nexport interface ExaSearchOptions {\n\t/** API key. Defaults to process.env.EXA_API_KEY. */\n\tapiKey?: string;\n\t/** Number of results. Default 10. */\n\tnumResults?: number;\n\t/**\n\t * Search type.\n\t * \"auto\" — Exa decides keyword vs neural (default).\n\t * \"neural\" — embedding-based semantic search.\n\t * \"keyword\" — traditional keyword search.\n\t */\n\ttype?: \"auto\" | \"neural\" | \"keyword\";\n}\n\n/**\n * Search the web via the Exa Search API (neural/semantic retrieval).\n * https://exa.ai/docs/reference/search\n *\n * Returns highlights inline per result — richer snippets without extra round-trips.\n */\nexport async function exaSearch(query: string, opts: ExaSearchOptions = {}): Promise<WebSearchResult[]> {\n\tconst apiKey = opts.apiKey ?? process.env[\"EXA_API_KEY\"];\n\tif (!apiKey) throw new Error(\"Exa API key required — set EXA_API_KEY or pass opts.apiKey\");\n\n\tconst controller = new AbortController();\n\tconst timer = setTimeout(() => controller.abort(), 15_000);\n\tlet res: Response;\n\ttry {\n\t\tres = await fetch(\"https://api.exa.ai/search\", {\n\t\t\tmethod: \"POST\",\n\t\t\tsignal: controller.signal,\n\t\t\theaders: {\n\t\t\t\t\"Content-Type\": \"application/json\",\n\t\t\t\t\"x-api-key\": apiKey,\n\t\t\t},\n\t\t\tbody: JSON.stringify({\n\t\t\t\tquery,\n\t\t\t\tnumResults: opts.numResults ?? 10,\n\t\t\t\ttype: opts.type ?? \"auto\",\n\t\t\t\tcontents: {\n\t\t\t\t\thighlights: { numSentences: 2, highlightsPerUrl: 3 },\n\t\t\t\t},\n\t\t\t}),\n\t\t});\n\t} finally {\n\t\tclearTimeout(timer);\n\t}\n\n\tif (!res.ok) throw new Error(`Exa API error: ${res.status} ${res.statusText}`);\n\n\tconst data = (await res.json()) as {\n\t\tresults?: Array<{\n\t\t\turl: string;\n\t\t\ttitle: string;\n\t\t\tpublishedDate?: string;\n\t\t\thighlights?: string[];\n\t\t}>;\n\t};\n\n\treturn (data.results ?? []).map((r) => ({\n\t\turl: r.url,\n\t\ttitle: r.title,\n\t\tsnippet: r.highlights?.join(\" … \") ?? \"\",\n\t\t...(r.publishedDate ? { publishedAt: r.publishedDate } : {}),\n\t}));\n}\n\n/**\n * Search the web via the Brave Search API.\n * https://api.search.brave.com/app/documentation/web-search\n */\nexport async function braveSearch(query: string, opts: BraveSearchOptions = {}): Promise<WebSearchResult[]> {\n\tconst apiKey = opts.apiKey ?? process.env[\"BRAVE_SEARCH_API_KEY\"];\n\tif (!apiKey) throw new Error(\"Brave Search API key required — set BRAVE_SEARCH_API_KEY or pass opts.apiKey\");\n\n\tconst params = new URLSearchParams({\n\t\tq: query,\n\t\tcount: String(Math.min(opts.numResults ?? 10, 20)),\n\t});\n\tif (opts.country) params.set(\"country\", opts.country);\n\tif (opts.freshness) params.set(\"freshness\", opts.freshness);\n\n\tconst controller = new AbortController();\n\tconst timer = setTimeout(() => controller.abort(), 10_000);\n\tlet res: Response;\n\ttry {\n\t\tres = await fetch(`https://api.search.brave.com/res/v1/web/search?${params}`, {\n\t\t\tsignal: controller.signal,\n\t\t\theaders: {\n\t\t\t\tAccept: \"application/json\",\n\t\t\t\t\"Accept-Encoding\": \"gzip\",\n\t\t\t\t\"X-Subscription-Token\": apiKey,\n\t\t\t},\n\t\t});\n\t} finally {\n\t\tclearTimeout(timer);\n\t}\n\n\tif (!res.ok) throw new Error(`Brave Search API error: ${res.status} ${res.statusText}`);\n\n\tconst data = (await res.json()) as {\n\t\tweb?: {\n\t\t\tresults?: Array<{\n\t\t\t\turl: string;\n\t\t\t\ttitle: string;\n\t\t\t\tdescription?: string;\n\t\t\t\tage?: string;\n\t\t\t}>;\n\t\t};\n\t};\n\n\treturn (data.web?.results ?? []).map((r) => ({\n\t\turl: r.url,\n\t\ttitle: r.title,\n\t\tsnippet: r.description ?? \"\",\n\t\t...(r.age ? { publishedAt: r.age } : {}),\n\t}));\n}\n\n/**\n * Search the web via the Tavily API.\n * https://docs.tavily.com/docs/rest-api/api-reference\n */\nexport async function tavilySearch(query: string, opts: TavilySearchOptions = {}): Promise<WebSearchResult[]> {\n\tconst apiKey = opts.apiKey ?? process.env[\"TAVILY_API_KEY\"];\n\tif (!apiKey) throw new Error(\"Tavily API key required — set TAVILY_API_KEY or pass opts.apiKey\");\n\n\tconst controller = new AbortController();\n\tconst timer = setTimeout(() => controller.abort(), 15_000);\n\tlet res: Response;\n\ttry {\n\t\tres = await fetch(\"https://api.tavily.com/search\", {\n\t\t\tmethod: \"POST\",\n\t\t\tsignal: controller.signal,\n\t\t\theaders: { \"Content-Type\": \"application/json\" },\n\t\t\tbody: JSON.stringify({\n\t\t\t\tquery,\n\t\t\t\tapi_key: apiKey,\n\t\t\t\tmax_results: opts.numResults ?? 5,\n\t\t\t\tsearch_depth: opts.depth ?? \"basic\",\n\t\t\t\tinclude_raw_content: false,\n\t\t\t\t...(opts.timeRange ? { time_range: opts.timeRange } : {}),\n\t\t\t\t...(opts.topic ? { topic: opts.topic } : {}),\n\t\t\t}),\n\t\t});\n\t} finally {\n\t\tclearTimeout(timer);\n\t}\n\n\tif (!res.ok) throw new Error(`Tavily API error: ${res.status} ${res.statusText}`);\n\n\tconst data = (await res.json()) as {\n\t\tresults?: Array<{\n\t\t\turl: string;\n\t\t\ttitle: string;\n\t\t\tcontent?: string;\n\t\t\tpublished_date?: string;\n\t\t}>;\n\t};\n\n\treturn (data.results ?? []).map((r) => ({\n\t\turl: r.url,\n\t\ttitle: r.title,\n\t\tsnippet: r.content ?? \"\",\n\t\t...(r.published_date ? { publishedAt: r.published_date } : {}),\n\t}));\n}\n\n// ---------------------------------------------------------------------------\n// DuckDuckGo Instant Answer API — no key required, zero-cost fallback\n// ---------------------------------------------------------------------------\n\nexport interface DdgSearchOptions {\n\t/**\n\t * Maximum results to return. DDG doesn't support a server-side count param;\n\t * this slices the client-side result list. Default: 10.\n\t */\n\tnumResults?: number;\n}\n\n/**\n * Search via the DuckDuckGo Instant Answer API.\n * https://duckduckgo.com/api\n *\n * No API key required. Returns structured instant answers (Abstract,\n * Results, RelatedTopics) mapped to WebSearchResult[].\n *\n * Limitation: not a full web index — best for well-known entities and\n * unambiguous queries. Returns empty when DDG has no instant answer.\n */\nexport async function ddgSearch(query: string, opts: DdgSearchOptions = {}): Promise<WebSearchResult[]> {\n\tconst params = new URLSearchParams({\n\t\tq: query,\n\t\tformat: \"json\",\n\t\tno_redirect: \"1\",\n\t\tno_html: \"1\",\n\t\tskip_disambig: \"1\",\n\t});\n\n\tconst controller = new AbortController();\n\tconst timer = setTimeout(() => controller.abort(), 10_000);\n\tlet res: Response;\n\ttry {\n\t\tres = await fetch(`https://api.duckduckgo.com/?${params}`, {\n\t\t\tsignal: controller.signal,\n\t\t\theaders: {\n\t\t\t\tAccept: \"application/json\",\n\t\t\t\t// DDG silently returns an empty 200 body for browser-like or\n\t\t\t\t// missing User-Agents. A curl/bot-style UA gets a real 202.\n\t\t\t\t\"User-Agent\": \"web-spider/0.8\",\n\t\t\t},\n\t\t});\n\t} finally {\n\t\tclearTimeout(timer);\n\t}\n\n\tif (!res.ok) throw new Error(`DDG API error: ${res.status} ${res.statusText}`);\n\n\tconst data = (await res.json()) as {\n\t\tAbstract?: string;\n\t\tAbstractURL?: string;\n\t\tAbstractSource?: string;\n\t\tHeading?: string;\n\t\tResults?: Array<{ FirstURL: string; Text: string }>;\n\t\tRelatedTopics?: Array<{\n\t\t\tFirstURL?: string;\n\t\t\tText?: string;\n\t\t\tTopics?: Array<{ FirstURL: string; Text: string }>;\n\t\t}>;\n\t};\n\n\tconst results: WebSearchResult[] = [];\n\tconst limit = opts.numResults ?? 10;\n\n\t// 1. Instant answer abstract (Wikipedia-style knowledge panel)\n\tif (data.Abstract && data.AbstractURL) {\n\t\tresults.push({\n\t\t\turl: data.AbstractURL,\n\t\t\ttitle: data.Heading ?? data.AbstractSource ?? \"DuckDuckGo\",\n\t\t\tsnippet: data.Abstract,\n\t\t});\n\t}\n\n\t// 2. Official results (e.g. official site links)\n\tfor (const r of data.Results ?? []) {\n\t\tif (results.length >= limit) break;\n\t\tif (r.FirstURL) results.push({ url: r.FirstURL, title: r.Text, snippet: r.Text });\n\t}\n\n\t// 3. Related topics — flatten one level of nesting\n\tfor (const topic of data.RelatedTopics ?? []) {\n\t\tif (results.length >= limit) break;\n\t\tif (topic.FirstURL && topic.Text) {\n\t\t\tresults.push({ url: topic.FirstURL, title: topic.Text, snippet: topic.Text });\n\t\t}\n\t\tfor (const sub of topic.Topics ?? []) {\n\t\t\tif (results.length >= limit) break;\n\t\t\tresults.push({ url: sub.FirstURL, title: sub.Text, snippet: sub.Text });\n\t\t}\n\t}\n\n\treturn results;\n}\n\n/**\n * Search using whichever engine is explicitly requested or has an API key\n * available. Falls through to the DDG Instant Answer API as a zero-cost\n * last resort — no key required.\n *\n * Prefer {@link defaultSearchEngine} + {@link FallbackSearchEngine} when\n * you need composable retry / fallback behaviour.\n */\nexport async function webSearch(\n\tquery: string,\n\topts: {\n\t\tengine?: SearchEngine;\n\t\tnumResults?: number;\n\t\ttimeRange?: \"day\" | \"week\" | \"month\" | \"year\";\n\t\ttopic?: \"news\" | \"general\";\n\t} = {},\n): Promise<WebSearchResult[]> {\n\tconst engine = opts.engine\n\t\t? resolveSearchEngine(opts.engine, process.env[envKeyForEngine(opts.engine)])\n\t\t: defaultSearchEngine();\n\treturn engine.search({\n\t\tquery,\n\t\tnumResults: opts.numResults,\n\t\ttimeRange: opts.timeRange,\n\t\ttopic: opts.topic,\n\t});\n}\n\n// ---------------------------------------------------------------------------\n// Engine registry — OCP: adding a new engine = one registerSearchEngine() call\n// ---------------------------------------------------------------------------\n\n/**\n * A factory that creates an ISearchEngine from an optional API key.\n * key is undefined for keyless engines (e.g. DDG).\n */\ntype EngineFactory = (key: string | undefined) => ISearchEngine;\n\n/** The global engine registry. Seeded with built-in engines below. */\nconst ENGINE_REGISTRY = new Map<string, EngineFactory>();\n\n/**\n * Register a search engine under a name.\n *\n * Call this to add a new engine without touching any existing code:\n * @example\n * registerSearchEngine(\"my-engine\", (key) => new MyEngine(key!))\n */\nexport function registerSearchEngine(name: string, factory: EngineFactory): void {\n\tENGINE_REGISTRY.set(name, factory);\n}\n\n/**\n * Resolve a registered engine by name, passing the provided API key.\n * Throws a descriptive error for unknown names or missing required keys.\n */\nexport function resolveSearchEngine(name: string, key?: string | undefined): ISearchEngine {\n\tconst factory = ENGINE_REGISTRY.get(name);\n\tif (!factory) throw new Error(`Unknown search engine: \"${name}\". Register it with registerSearchEngine().`);\n\treturn factory(key);\n}\n\n/** @internal Map engine name to its env var key name (for webSearch auto-detect). */\nfunction envKeyForEngine(name: string): string {\n\tconst envKeys: Record<string, string> = {\n\t\tbrave: \"BRAVE_SEARCH_API_KEY\",\n\t\ttavily: \"TAVILY_API_KEY\",\n\t\texa: \"EXA_API_KEY\",\n\t};\n\treturn envKeys[name] ?? \"\";\n}\n\n// Seed the registry with built-in engines.\n// Adding a new engine: call registerSearchEngine() — do NOT edit this block.\nregisterSearchEngine(\"brave\", (key) => {\n\tif (!key) throw new Error(\"BRAVE_SEARCH_API_KEY not set\");\n\treturn new BraveSearchEngine(key);\n});\nregisterSearchEngine(\"tavily\", (key) => {\n\tif (!key) throw new Error(\"TAVILY_API_KEY not set\");\n\treturn new TavilySearchEngine(key);\n});\nregisterSearchEngine(\"exa\", (key) => {\n\tif (!key) throw new Error(\"EXA_API_KEY not set\");\n\treturn new ExaSearchEngine(key);\n});\nregisterSearchEngine(\"ddg\", () => new DdgSearchEngine());\n\n// ---------------------------------------------------------------------------\n// ISearchEngine adapters — concrete implementations of the port\n// ---------------------------------------------------------------------------\n\n/** Maps the canonical timeRange string to Brave's freshness parameter. */\nconst BRAVE_FRESHNESS: Record<string, \"pd\" | \"pw\" | \"pm\" | \"py\"> = {\n\tday: \"pd\",\n\tweek: \"pw\",\n\tmonth: \"pm\",\n\tyear: \"py\",\n};\n\n/** Brave Search adapter implementing ISearchEngine. */\nexport class BraveSearchEngine implements ISearchEngine {\n\tconstructor(private readonly apiKey: string, private readonly country?: string) {}\n\n\tsearch(req: SearchQuery): Promise<WebSearchResult[]> {\n\t\tconst freshness = req.timeRange ? BRAVE_FRESHNESS[req.timeRange] : undefined;\n\t\treturn braveSearch(req.query, {\n\t\t\tapiKey: this.apiKey,\n\t\t\tnumResults: req.numResults,\n\t\t\tcountry: this.country,\n\t\t\tfreshness,\n\t\t});\n\t}\n}\n\n/** Tavily adapter implementing ISearchEngine. */\nexport class TavilySearchEngine implements ISearchEngine {\n\tconstructor(private readonly apiKey: string) {}\n\n\tsearch(req: SearchQuery): Promise<WebSearchResult[]> {\n\t\treturn tavilySearch(req.query, {\n\t\t\tapiKey: this.apiKey,\n\t\t\tnumResults: req.numResults,\n\t\t\ttimeRange: req.timeRange,\n\t\t\ttopic: req.topic,\n\t\t});\n\t}\n}\n\n/** Exa adapter implementing ISearchEngine. */\nexport class ExaSearchEngine implements ISearchEngine {\n\tconstructor(private readonly apiKey: string) {}\n\n\tsearch(req: SearchQuery): Promise<WebSearchResult[]> {\n\t\treturn exaSearch(req.query, { apiKey: this.apiKey, numResults: req.numResults });\n\t}\n}\n\n/** DuckDuckGo Instant Answer adapter — no API key required. */\nexport class DdgSearchEngine implements ISearchEngine {\n\tsearch(req: SearchQuery): Promise<WebSearchResult[]> {\n\t\treturn ddgSearch(req.query, { numResults: req.numResults });\n\t}\n}\n\n// ---------------------------------------------------------------------------\n// FallbackSearchEngine — strategy composite\n// ---------------------------------------------------------------------------\n\nexport interface FallbackSearchEngineOptions {\n\t/**\n\t * Treat an empty result set as a failure and try the next engine.\n\t * Default: true.\n\t */\n\tfallbackOnEmpty?: boolean;\n\t/**\n\t * Swallow a thrown error and try the next engine instead of propagating.\n\t * Default: true.\n\t */\n\tfallbackOnError?: boolean;\n}\n\n/**\n * A composite ISearchEngine that tries each engine in order, falling back\n * to the next when the current one returns empty results or throws.\n *\n * Because it implements ISearchEngine itself it is fully composable —\n * nest FallbackSearchEngines, wrap them in caches, inject stubs in tests.\n *\n * @example\n * // Tavily with DDG as zero-cost fallback\n * const engine = new FallbackSearchEngine([\n * new TavilySearchEngine(process.env.TAVILY_API_KEY),\n * new DdgSearchEngine(),\n * ]);\n */\nexport class FallbackSearchEngine implements ISearchEngine {\n\tprivate readonly fallbackOnEmpty: boolean;\n\tprivate readonly fallbackOnError: boolean;\n\n\tconstructor(\n\t\tprivate readonly engines: ISearchEngine[],\n\t\topts: FallbackSearchEngineOptions = {},\n\t) {\n\t\tif (engines.length === 0) throw new Error(\"FallbackSearchEngine requires at least one engine\");\n\t\tthis.fallbackOnEmpty = opts.fallbackOnEmpty ?? true;\n\t\tthis.fallbackOnError = opts.fallbackOnError ?? true;\n\t}\n\n\tasync search(req: SearchQuery): Promise<WebSearchResult[]> {\n\t\tlet lastError: unknown;\n\n\t\tfor (const engine of this.engines) {\n\t\t\ttry {\n\t\t\t\tconst results = await engine.search(req);\n\t\t\t\tif (results.length > 0 || !this.fallbackOnEmpty) return results;\n\t\t\t\t// Empty + fallbackOnEmpty → try next engine\n\t\t\t} catch (err) {\n\t\t\t\tif (!this.fallbackOnError) throw err;\n\t\t\t\tlastError = err;\n\t\t\t\t// Error + fallbackOnError → try next engine\n\t\t\t}\n\t\t}\n\n\t\t// All engines exhausted — surface the last error or return empty\n\t\tif (lastError) throw lastError;\n\t\treturn [];\n\t}\n}\n\n// ---------------------------------------------------------------------------\n// Wiring — compose engines from environment variables\n// ---------------------------------------------------------------------------\n\n/**\n * Build a FallbackSearchEngine chain from environment variables.\n *\n * Priority order for keyed engines: Brave → Tavily → Exa.\n * DuckDuckGo is always appended as the zero-cost last resort.\n *\n * The returned engine implements ISearchEngine — swap it for any stub\n * in tests without touching call sites.\n */\nexport function defaultSearchEngine(): ISearchEngine {\n\tconst engines: ISearchEngine[] = [];\n\n\tconst brave = process.env[\"BRAVE_SEARCH_API_KEY\"];\n\tif (brave) engines.push(new BraveSearchEngine(brave));\n\n\tconst tavily = process.env[\"TAVILY_API_KEY\"];\n\tif (tavily) engines.push(new TavilySearchEngine(tavily));\n\n\tconst exa = process.env[\"EXA_API_KEY\"];\n\tif (exa) engines.push(new ExaSearchEngine(exa));\n\n\t// DDG always last — no key needed, never throws the \"no key\" error\n\tengines.push(new DdgSearchEngine());\n\n\treturn new FallbackSearchEngine(engines);\n}\n"]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dpopsuev/web-spider",
|
|
3
|
-
"version": "0.10.
|
|
3
|
+
"version": "0.10.5",
|
|
4
|
+
"files": ["dist", "package.json"],
|
|
4
5
|
"description": "AI-agent-friendly web spider: structured output, RAG-ready chunks, graph-traversable links",
|
|
5
6
|
"type": "module",
|
|
6
7
|
"main": "./dist/index.js",
|