@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/dist/batch.js.map +1 -0
  2. package/dist/cache.js.map +1 -0
  3. package/dist/convert.js.map +1 -0
  4. package/dist/crawl.js.map +1 -0
  5. package/dist/disk-cache.js.map +1 -0
  6. package/dist/graph.js.map +1 -0
  7. package/dist/index.js.map +1 -0
  8. package/dist/parse.js.map +1 -0
  9. package/dist/playwright.js.map +1 -0
  10. package/dist/ports.js.map +1 -0
  11. package/dist/robots.js.map +1 -0
  12. package/dist/search.js.map +1 -0
  13. package/dist/sitemap.js.map +1 -0
  14. package/dist/spider.js.map +1 -0
  15. package/dist/throttle.js.map +1 -0
  16. package/dist/tree.js.map +1 -0
  17. package/dist/types.js.map +1 -0
  18. package/dist/views.js.map +1 -0
  19. package/dist/web-search.js.map +1 -0
  20. package/package.json +2 -1
  21. package/fixtures/article-with-images.html +0 -94
  22. package/fixtures/gh-shell.html +0 -32
  23. package/fixtures/guide-ai-agents-web-scraping.json +0 -552
  24. package/fixtures/images/large.jpg +0 -0
  25. package/fixtures/images/small.jpg +0 -0
  26. package/fixtures/images/tiny.png +0 -0
  27. package/fixtures/quotes-index.json +0 -40
  28. package/scripts/fetch-guide.mjs +0 -25
  29. package/src/cache.ts +0 -99
  30. package/src/convert.ts +0 -161
  31. package/src/crawl.ts +0 -186
  32. package/src/disk-cache.ts +0 -228
  33. package/src/graph.ts +0 -189
  34. package/src/index.ts +0 -74
  35. package/src/parse.ts +0 -154
  36. package/src/playwright.ts +0 -193
  37. package/src/ports.ts +0 -131
  38. package/src/robots.ts +0 -121
  39. package/src/search.ts +0 -173
  40. package/src/sitemap.ts +0 -67
  41. package/src/spider.ts +0 -475
  42. package/src/throttle.ts +0 -118
  43. package/src/tree.ts +0 -379
  44. package/src/types.ts +0 -225
  45. package/src/views.ts +0 -42
  46. package/src/web-search.ts +0 -548
  47. package/test/convert-images.test.ts +0 -69
  48. package/test/disk-cache-images.test.ts +0 -193
  49. package/test/engine-registry.test.ts +0 -114
  50. package/test/exports.test.ts +0 -124
  51. package/test/get-chunk.test.ts +0 -115
  52. package/test/images-integration.test.ts +0 -359
  53. package/test/improvements.test.ts +0 -279
  54. package/test/inbound-count.test.ts +0 -111
  55. package/test/lean.test.ts +0 -105
  56. package/test/playwright.test.ts +0 -128
  57. package/test/ports.test.ts +0 -161
  58. package/test/search.test.ts +0 -219
  59. package/test/spider-images.test.ts +0 -180
  60. package/test/spider-unit.test.ts +0 -610
  61. package/test/tree.test.ts +0 -272
  62. package/test/types.test.ts +0 -169
  63. package/test/web-search-integration.test.ts +0 -180
  64. package/test/web-search.test.ts +0 -305
  65. package/tsconfig.json +0 -9
  66. package/tsconfig.test.json +0 -7
  67. package/vitest.config.ts +0 -8
@@ -1,219 +0,0 @@
1
- import { readFileSync } from "fs";
2
- import { dirname, join } from "path";
3
- import { fileURLToPath } from "url";
4
- import { describe, expect, it } from "vitest";
5
- import { fuzzySearch } from "../src/search.js";
6
- import type { SpideredPage } from "../src/types.js";
7
-
8
- const __dirname = dirname(fileURLToPath(import.meta.url));
9
-
10
- function loadFixture(name: string): SpideredPage {
11
- const raw = readFileSync(join(__dirname, "../fixtures", name), "utf8");
12
- return JSON.parse(raw) as SpideredPage;
13
- }
14
-
15
- const guide = loadFixture("guide-ai-agents-web-scraping.json");
16
-
17
- // ---------------------------------------------------------------------------
18
- // Basic contract
19
- // ---------------------------------------------------------------------------
20
-
21
- describe("fuzzySearch — contract", () => {
22
- it("returns an empty array for a blank query", () => {
23
- expect(fuzzySearch([guide], "")).toEqual([]);
24
- expect(fuzzySearch([guide], " ")).toEqual([]);
25
- });
26
-
27
- it("returns an empty array when no pages are given", () => {
28
- expect(fuzzySearch([], "openai")).toEqual([]);
29
- });
30
-
31
- it("returns at most topN results", () => {
32
- const hits = fuzzySearch([guide], "the", { topN: 3 });
33
- expect(hits.length).toBeLessThanOrEqual(3);
34
- });
35
-
36
- it("every hit has required fields", () => {
37
- const hits = fuzzySearch([guide], "openai");
38
- expect(hits.length).toBeGreaterThan(0);
39
- for (const h of hits) {
40
- expect(typeof h.url).toBe("string");
41
- expect(typeof h.chunkId).toBe("string");
42
- expect(typeof h.heading).toBe("string");
43
- expect(typeof h.score).toBe("number");
44
- expect(typeof h.snippet).toBe("string");
45
- }
46
- });
47
-
48
- it("scores are in 0–1 range", () => {
49
- const hits = fuzzySearch([guide], "agent scraping pipeline");
50
- for (const h of hits) {
51
- expect(h.score).toBeGreaterThan(0);
52
- expect(h.score).toBeLessThanOrEqual(1);
53
- }
54
- });
55
-
56
- it("results are sorted by score descending", () => {
57
- const hits = fuzzySearch([guide], "LLM extraction cost");
58
- for (let i = 1; i < hits.length; i++) {
59
- expect(hits[i].score).toBeLessThanOrEqual(hits[i - 1].score);
60
- }
61
- });
62
- });
63
-
64
- // ---------------------------------------------------------------------------
65
- // Exact match quality
66
- // ---------------------------------------------------------------------------
67
-
68
- describe("fuzzySearch — exact match", () => {
69
- it("finds an exact phrase from the fixture title", () => {
70
- const hits = fuzzySearch([guide], "AI Agents & Web Scraping");
71
- expect(hits.length).toBeGreaterThan(0);
72
- const titleHit = hits.find((h) => h.heading === "title");
73
- // Title hit must be found and have a positive score.
74
- // The absolute threshold is not asserted — it depends on the scorer's
75
- // normalisation strategy and corpus size.
76
- expect(titleHit).toBeDefined();
77
- expect(titleHit!.score).toBeGreaterThan(0);
78
- });
79
-
80
- it("exact match scores higher than partial match for the same chunk", () => {
81
- // "cost optimization" appears verbatim in a heading
82
- const exact = fuzzySearch([guide], "Cost Optimization");
83
- const partial = fuzzySearch([guide], "cost");
84
- // The heading hit for exact phrase should outrank a generic token hit
85
- const exactTop = exact[0];
86
- expect(exactTop.score).toBeGreaterThanOrEqual(partial[0].score);
87
- });
88
-
89
- it("includes a non-empty snippet for every hit", () => {
90
- const hits = fuzzySearch([guide], "OpenAI API");
91
- for (const h of hits) {
92
- expect(h.snippet.trim().length).toBeGreaterThan(0);
93
- }
94
- });
95
-
96
- it("snippet contains the matched term", () => {
97
- const hits = fuzzySearch([guide], "OpenAI API", { topN: 5 });
98
- // At least one snippet should contain the matched term (case-insensitive)
99
- const found = hits.some((h) => h.snippet.toLowerCase().includes("openai"));
100
- expect(found).toBe(true);
101
- });
102
- });
103
-
104
- // ---------------------------------------------------------------------------
105
- // Fuzzy / partial matching
106
- // ---------------------------------------------------------------------------
107
-
108
- describe("fuzzySearch — fuzzy matching", () => {
109
- it("matches partial tokens (prefix)", () => {
110
- // "automat" should match "automation", "automated", "automatically"
111
- const hits = fuzzySearch([guide], "automat");
112
- expect(hits.length).toBeGreaterThan(0);
113
- });
114
-
115
- it("is case-insensitive", () => {
116
- const lower = fuzzySearch([guide], "amazon");
117
- const upper = fuzzySearch([guide], "AMAZON");
118
- expect(lower.length).toBe(upper.length);
119
- expect(lower.map((h) => h.chunkId)).toEqual(upper.map((h) => h.chunkId));
120
- });
121
-
122
- it("handles multi-word queries across chunk boundaries", () => {
123
- // Words that appear spread across the document, not necessarily adjacent
124
- const hits = fuzzySearch([guide], "proxy captcha reliable");
125
- expect(hits.length).toBeGreaterThan(0);
126
- });
127
-
128
- it("returns no hits for a query that is clearly absent", () => {
129
- const hits = fuzzySearch([guide], "xyzzy quux frumious bandersnatch");
130
- expect(hits.length).toBe(0);
131
- });
132
- });
133
-
134
- // ---------------------------------------------------------------------------
135
- // Metadata vs chunk hits
136
- // ---------------------------------------------------------------------------
137
-
138
- describe("fuzzySearch — metadata hits", () => {
139
- it("matches headings and returns chunkId as empty string", () => {
140
- const hits = fuzzySearch([guide], "Frequently Asked Questions");
141
- const metaHit = hits.find((h) => h.chunkId === "");
142
- expect(metaHit).toBeDefined();
143
- });
144
-
145
- it("chunk hits carry a valid chunk ID", () => {
146
- const hits = fuzzySearch([guide], "intelligent data pipeline");
147
- const chunkHit = hits.find((h) => h.chunkId !== "");
148
- expect(chunkHit).toBeDefined();
149
- expect(chunkHit!.chunkId).toMatch(/^https?:\/\/.+#chunk-\d+$/);
150
- });
151
-
152
- it("matches the page description field", () => {
153
- // Guide description: "Combine AI agents with web scraping APIs..."
154
- const hits = fuzzySearch([guide], "automated reports");
155
- expect(hits.some((h) => h.heading === "description")).toBe(true);
156
- });
157
- });
158
-
159
- // ---------------------------------------------------------------------------
160
- // Multi-page corpus
161
- // ---------------------------------------------------------------------------
162
-
163
- describe("fuzzySearch — multi-page corpus", () => {
164
- // Build a second synthetic page from a subset of the guide's chunks
165
- const page2: SpideredPage = {
166
- ...guide,
167
- url: "https://example.com/other",
168
- domain: "example.com",
169
- title: "A Different Article About Proxies",
170
- description: "Proxy rotation and CAPTCHA handling for scrapers.",
171
- chunks: guide.chunks.slice(0, 2).map((c, i) => ({
172
- ...c,
173
- id: `https://example.com/other#chunk-${i}`,
174
- })),
175
- };
176
-
177
- it("returns hits from multiple pages when both match", () => {
178
- // topN must exceed the number of matching guide chunks to let page2 surface
179
- const hits = fuzzySearch([guide, page2], "scraping", { topN: 100 });
180
- const urls = new Set(hits.map((h) => h.url));
181
- expect(urls.size).toBeGreaterThan(1);
182
- });
183
-
184
- it("respects topN across the whole corpus", () => {
185
- const hits = fuzzySearch([guide, page2], "agent", { topN: 4 });
186
- expect(hits.length).toBeLessThanOrEqual(4);
187
- });
188
-
189
- it("higher-scoring page ranks first regardless of input order", () => {
190
- // page2 title is explicitly about proxies; guide is not
191
- const hitsProxies = fuzzySearch([guide, page2], "proxy rotation CAPTCHA", { topN: 1 });
192
- expect(hitsProxies[0].url).toBe(page2.url);
193
- });
194
- });
195
-
196
- // ---------------------------------------------------------------------------
197
- // Snippet shape
198
- // ---------------------------------------------------------------------------
199
-
200
- describe("fuzzySearch — snippet", () => {
201
- it("snippet is bounded by snippetRadius", () => {
202
- const radius = 30;
203
- const hits = fuzzySearch([guide], "OpenAI", { snippetRadius: radius });
204
- for (const h of hits) {
205
- // Strip leading/trailing ellipsis markers before measuring
206
- const bare = h.snippet.replace(/^…|…$/g, "");
207
- // The bare snippet should be at most 2×radius + matched term length
208
- // Give a generous upper bound to account for word boundaries
209
- expect(bare.length).toBeLessThan(radius * 2 + 60);
210
- }
211
- });
212
-
213
- it("snippet adds leading ellipsis when match is not at start", () => {
214
- // Search for something known to appear mid-text
215
- const hits = fuzzySearch([guide], "cost optimization");
216
- const mid = hits.find((h) => h.snippet.startsWith("…"));
217
- expect(mid).toBeDefined();
218
- });
219
- });
@@ -1,180 +0,0 @@
1
- /**
2
- * TDD tests for spider() captureImages option.
3
- * No real network — uses stub IHttpClient.
4
- */
5
-
6
- import { readFileSync } from "node:fs";
7
- import { join } from "node:path";
8
- import { describe, expect, it } from "vitest";
9
- import type { IHttpClient } from "../src/ports.js";
10
- import { spider } from "../src/spider.js";
11
-
12
- // ---------------------------------------------------------------------------
13
- // Fixture HTML (loaded from disk)
14
- // ---------------------------------------------------------------------------
15
-
16
- const FIXTURE_HTML = readFileSync(
17
- join(import.meta.dirname, "../fixtures/article-with-images.html"),
18
- "utf8",
19
- );
20
-
21
- const TINY_PNG = readFileSync(join(import.meta.dirname, "../fixtures/images/tiny.png"));
22
- const SMALL_JPG = readFileSync(join(import.meta.dirname, "../fixtures/images/small.jpg"));
23
-
24
- // ---------------------------------------------------------------------------
25
- // Stub HTTP client factory
26
- // ---------------------------------------------------------------------------
27
-
28
- /**
29
- * Returns a stub IHttpClient that serves the fixture HTML for page requests
30
- * and fixture image bytes for image requests.
31
- * `failOnSecond`: if true, throws on the second image fetch.
32
- */
33
- function makeStubClient(opts: { failOnSecond?: boolean } = {}): IHttpClient {
34
- let imageFetchCount = 0;
35
- return {
36
- async fetch(req) {
37
- // Page request
38
- if (req.url.startsWith("https://example.com") && !req.url.match(/\.(jpg|jpeg|png|webp|gif)(\?|$)/i)) {
39
- return {
40
- ok: true,
41
- status: 200,
42
- statusText: "OK",
43
- headers: { get: (name) => (name === "content-type" ? "text/html" : null) },
44
- text: async () => FIXTURE_HTML,
45
- arrayBuffer: async () => new ArrayBuffer(0),
46
- };
47
- }
48
-
49
- // Image requests
50
- imageFetchCount++;
51
- if (opts.failOnSecond && imageFetchCount === 2) {
52
- throw new Error("Simulated network failure on second image");
53
- }
54
-
55
- // Serve fixture bytes based on extension
56
- const isJpeg = req.url.match(/\.(jpg|jpeg|webp)(\?|$)/i);
57
- const bytes = isJpeg ? SMALL_JPG : TINY_PNG;
58
- const mimeType = isJpeg ? "image/jpeg" : "image/png";
59
- const buf = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength) as ArrayBuffer;
60
-
61
- return {
62
- ok: true,
63
- status: 200,
64
- statusText: "OK",
65
- headers: { get: (name) => (name === "content-type" ? mimeType : null) },
66
- text: async () => "",
67
- arrayBuffer: async () => buf,
68
- };
69
- },
70
- };
71
- }
72
-
73
- // ---------------------------------------------------------------------------
74
- // Tests
75
- // ---------------------------------------------------------------------------
76
-
77
- describe("spider() captureImages option", () => {
78
- it("1. captureImages: false (default) — images field is undefined", async () => {
79
- const page = await spider("https://example.com", {
80
- httpClient: makeStubClient(),
81
- // captureImages not set → defaults to false
82
- });
83
- expect(page.images).toBeUndefined();
84
- });
85
-
86
- it("2. captureImages: true — images array is populated", async () => {
87
- const page = await spider("https://example.com", {
88
- httpClient: makeStubClient(),
89
- captureImages: true,
90
- });
91
- expect(page.images).toBeDefined();
92
- expect(page.images!.length).toBeGreaterThan(0);
93
- });
94
-
95
- it("3. ImageRef fields are populated correctly", async () => {
96
- const page = await spider("https://example.com", {
97
- httpClient: makeStubClient(),
98
- captureImages: true,
99
- });
100
- for (const img of page.images!) {
101
- expect(img.src).toBeTruthy();
102
- expect(img.mimeType).toMatch(/^image\//);
103
- expect(typeof img.alt).toBe("string");
104
- // Either base64 or filePath must be set
105
- expect(img.base64 || img.filePath).toBeTruthy();
106
- }
107
- });
108
-
109
- it("4. maxImages cap is respected", async () => {
110
- const page = await spider("https://example.com", {
111
- httpClient: makeStubClient(),
112
- captureImages: true,
113
- maxImages: 2,
114
- });
115
- expect(page.images!.length).toBeLessThanOrEqual(2);
116
- });
117
-
118
- it("5. relative src URLs are resolved to absolute", async () => {
119
- const page = await spider("https://example.com", {
120
- httpClient: makeStubClient(),
121
- captureImages: true,
122
- });
123
- for (const img of page.images!) {
124
- // data: URLs are allowed as-is; all others must be absolute http(s)
125
- if (!img.src.startsWith("data:")) {
126
- expect(img.src).toMatch(/^https?:\/\//);
127
- }
128
- }
129
- // Specifically, the relative /images/chart.png should resolve to https://example.com/images/chart.png
130
- const resolved = page.images!.find((i) => i.src === "https://example.com/images/chart.png");
131
- expect(resolved).toBeDefined();
132
- });
133
-
134
- it("6. failed image fetch is skipped gracefully — no exception propagates", async () => {
135
- const page = await spider("https://example.com", {
136
- httpClient: makeStubClient({ failOnSecond: true }),
137
- captureImages: true,
138
- });
139
- // Should still return a page — just with fewer images
140
- expect(page.images).toBeDefined();
141
- expect(page.url).toBe("https://example.com");
142
- });
143
-
144
- it("7. data: URL images are included without fetching", async () => {
145
- const page = await spider("https://example.com", {
146
- httpClient: makeStubClient(),
147
- captureImages: true,
148
- });
149
- const dataImg = page.images!.find((i) => i.src.startsWith("data:"));
150
- expect(dataImg).toBeDefined();
151
- expect(dataImg!.mimeType).toBe("image/png");
152
- expect(dataImg!.base64).toBeTruthy();
153
- });
154
-
155
- it("base64 strings are valid (decodable)", async () => {
156
- const page = await spider("https://example.com", {
157
- httpClient: makeStubClient(),
158
- captureImages: true,
159
- });
160
- for (const img of page.images!) {
161
- if (img.base64) {
162
- expect(() => Buffer.from(img.base64!, "base64")).not.toThrow();
163
- expect(Buffer.from(img.base64!, "base64").byteLength).toBeGreaterThan(0);
164
- }
165
- }
166
- });
167
-
168
- it("produces valid LLM data URLs from captured images", async () => {
169
- const page = await spider("https://example.com", {
170
- httpClient: makeStubClient(),
171
- captureImages: true,
172
- });
173
- for (const img of page.images!) {
174
- if (img.base64) {
175
- const dataUrl = `data:${img.mimeType};base64,${img.base64}`;
176
- expect(dataUrl).toMatch(/^data:image\//);
177
- }
178
- }
179
- });
180
- });