@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/dist/batch.js.map +1 -0
  2. package/dist/cache.js.map +1 -0
  3. package/dist/convert.js.map +1 -0
  4. package/dist/crawl.js.map +1 -0
  5. package/dist/disk-cache.js.map +1 -0
  6. package/dist/graph.js.map +1 -0
  7. package/dist/index.js.map +1 -0
  8. package/dist/parse.js.map +1 -0
  9. package/dist/playwright.js.map +1 -0
  10. package/dist/ports.js.map +1 -0
  11. package/dist/robots.js.map +1 -0
  12. package/dist/search.js.map +1 -0
  13. package/dist/sitemap.js.map +1 -0
  14. package/dist/spider.js.map +1 -0
  15. package/dist/throttle.js.map +1 -0
  16. package/dist/tree.js.map +1 -0
  17. package/dist/types.js.map +1 -0
  18. package/dist/views.js.map +1 -0
  19. package/dist/web-search.js.map +1 -0
  20. package/package.json +2 -1
  21. package/fixtures/article-with-images.html +0 -94
  22. package/fixtures/gh-shell.html +0 -32
  23. package/fixtures/guide-ai-agents-web-scraping.json +0 -552
  24. package/fixtures/images/large.jpg +0 -0
  25. package/fixtures/images/small.jpg +0 -0
  26. package/fixtures/images/tiny.png +0 -0
  27. package/fixtures/quotes-index.json +0 -40
  28. package/scripts/fetch-guide.mjs +0 -25
  29. package/src/cache.ts +0 -99
  30. package/src/convert.ts +0 -161
  31. package/src/crawl.ts +0 -186
  32. package/src/disk-cache.ts +0 -228
  33. package/src/graph.ts +0 -189
  34. package/src/index.ts +0 -74
  35. package/src/parse.ts +0 -154
  36. package/src/playwright.ts +0 -193
  37. package/src/ports.ts +0 -131
  38. package/src/robots.ts +0 -121
  39. package/src/search.ts +0 -173
  40. package/src/sitemap.ts +0 -67
  41. package/src/spider.ts +0 -475
  42. package/src/throttle.ts +0 -118
  43. package/src/tree.ts +0 -379
  44. package/src/types.ts +0 -225
  45. package/src/views.ts +0 -42
  46. package/src/web-search.ts +0 -548
  47. package/test/convert-images.test.ts +0 -69
  48. package/test/disk-cache-images.test.ts +0 -193
  49. package/test/engine-registry.test.ts +0 -114
  50. package/test/exports.test.ts +0 -124
  51. package/test/get-chunk.test.ts +0 -115
  52. package/test/images-integration.test.ts +0 -359
  53. package/test/improvements.test.ts +0 -279
  54. package/test/inbound-count.test.ts +0 -111
  55. package/test/lean.test.ts +0 -105
  56. package/test/playwright.test.ts +0 -128
  57. package/test/ports.test.ts +0 -161
  58. package/test/search.test.ts +0 -219
  59. package/test/spider-images.test.ts +0 -180
  60. package/test/spider-unit.test.ts +0 -610
  61. package/test/tree.test.ts +0 -272
  62. package/test/types.test.ts +0 -169
  63. package/test/web-search-integration.test.ts +0 -180
  64. package/test/web-search.test.ts +0 -305
  65. package/tsconfig.json +0 -9
  66. package/tsconfig.test.json +0 -7
  67. package/vitest.config.ts +0 -8
package/test/inbound-count.test.ts DELETED
@@ -1,111 +0,0 @@
1
- /**
2
- * TDD tests for LeanPage.inboundCount via PageGraph.
3
- */
4
-
5
- import { describe, expect, it } from "vitest";
6
- import { PageGraph } from "../src/graph.js";
7
- import type { SpideredPage } from "../src/types.js";
8
- import { toLean } from "../src/views.js";
9
-
10
- // ---------------------------------------------------------------------------
11
- // Helpers
12
- // ---------------------------------------------------------------------------
13
-
14
- function makePage(url: string, linksTo: string[] = []): SpideredPage {
15
- return {
16
- url,
17
- domain: new URL(url).hostname,
18
- fetchedAt: new Date().toISOString(),
19
- title: `Page at ${url}`,
20
- description: "",
21
- author: "",
22
- publishedAt: "",
23
- lang: "en",
24
- tags: [],
25
- wordCount: 100,
26
- readingTimeMinutes: 1,
27
- headings: [],
28
- chunks: [],
29
- links: linksTo.map((href) => ({ href, text: href, isExternal: false, rel: "body" as const })),
30
- markdown: "",
31
- };
32
- }
33
-
34
- // ---------------------------------------------------------------------------
35
- // Tests
36
- // ---------------------------------------------------------------------------
37
-
38
- describe("LeanPage.inboundCount via PageGraph", () => {
39
- it("omitted when no graph is passed to toLean()", () => {
40
- const lean = toLean(makePage("https://example.com"));
41
- expect(lean.inboundCount).toBeUndefined();
42
- });
43
-
44
- it("0 when page has no inbound links", () => {
45
- const graph = new PageGraph();
46
- const page = makePage("https://example.com");
47
- graph.addPage(page);
48
- const lean = toLean(page, graph);
49
- expect(lean.inboundCount).toBe(0);
50
- });
51
-
52
- it("1 when one page links to this page", () => {
53
- const graph = new PageGraph();
54
- const home = makePage("https://example.com", ["https://example.com/about"]);
55
- const about = makePage("https://example.com/about");
56
- graph.addPage(home);
57
- graph.addPage(about);
58
- const lean = toLean(about, graph);
59
- expect(lean.inboundCount).toBe(1);
60
- });
61
-
62
- it("counts multiple inbound links correctly", () => {
63
- const graph = new PageGraph();
64
- const target = makePage("https://example.com/popular");
65
- const a = makePage("https://example.com/a", ["https://example.com/popular"]);
66
- const b = makePage("https://example.com/b", ["https://example.com/popular"]);
67
- const c = makePage("https://example.com/c", ["https://example.com/popular"]);
68
- [target, a, b, c].forEach((p) => graph.addPage(p));
69
- const lean = toLean(target, graph);
70
- expect(lean.inboundCount).toBe(3);
71
- });
72
-
73
- it("inboundCount on a hub page is 0 (only outbound)", () => {
74
- const graph = new PageGraph();
75
- const hub = makePage("https://example.com", [
76
- "https://example.com/a",
77
- "https://example.com/b",
78
- ]);
79
- graph.addPage(hub);
80
- const lean = toLean(hub, graph);
81
- expect(lean.inboundCount).toBe(0);
82
- });
83
-
84
- it("pages ranked by inboundCount descending matches graph.byPageRank()", () => {
85
- const graph = new PageGraph();
86
- const popular = makePage("https://example.com/popular");
87
- const normal = makePage("https://example.com/normal");
88
- const a = makePage("https://example.com/a", ["https://example.com/popular"]);
89
- const b = makePage("https://example.com/b", ["https://example.com/popular"]);
90
- const c = makePage("https://example.com/c", ["https://example.com/normal"]);
91
- [popular, normal, a, b, c].forEach((p) => graph.addPage(p));
92
-
93
- const popularLean = toLean(popular, graph);
94
- const normalLean = toLean(normal, graph);
95
-
96
- expect(popularLean.inboundCount).toBe(2);
97
- expect(normalLean.inboundCount).toBe(1);
98
- expect(popularLean.inboundCount!).toBeGreaterThan(normalLean.inboundCount!);
99
- });
100
-
101
- it("all other LeanPage fields are still populated when graph is provided", () => {
102
- const graph = new PageGraph();
103
- const page = makePage("https://example.com");
104
- graph.addPage(page);
105
- const lean = toLean(page, graph);
106
- expect(lean.url).toBe("https://example.com");
107
- expect(lean.title).toBeTruthy();
108
- expect(lean.view).toBe("lean");
109
- expect(typeof lean.wordCount).toBe("number");
110
- });
111
- });
package/test/lean.test.ts DELETED
@@ -1,105 +0,0 @@
1
- import { readFileSync } from "fs";
2
- import { dirname, join } from "path";
3
- import { fileURLToPath } from "url";
4
- import { describe, expect, it } from "vitest";
5
- import type { LeanPage, SpideredPage } from "../src/types.js";
6
- import { toLean } from "../src/views.js";
7
-
8
- const __dirname = dirname(fileURLToPath(import.meta.url));
9
-
10
- function loadFixture(name: string): SpideredPage {
11
- const raw = readFileSync(join(__dirname, "../fixtures", name), "utf8");
12
- return JSON.parse(raw) as SpideredPage;
13
- }
14
-
15
- const guide = loadFixture("guide-ai-agents-web-scraping.json");
16
-
17
- describe("toLean", () => {
18
- let lean: LeanPage;
19
-
20
- it("produces a lean page without error", () => {
21
- lean = toLean(guide);
22
- expect(lean).toBeDefined();
23
- });
24
-
25
- it("sets view discriminant to 'lean'", () => {
26
- lean = toLean(guide);
27
- expect(lean.view).toBe("lean");
28
- });
29
-
30
- it("preserves identity and metadata fields", () => {
31
- lean = toLean(guide);
32
- expect(lean.url).toBe(guide.url);
33
- expect(lean.domain).toBe(guide.domain);
34
- expect(lean.title).toBe(guide.title);
35
- expect(lean.lang).toBe(guide.lang);
36
- expect(lean.wordCount).toBe(guide.wordCount);
37
- expect(lean.readingTimeMinutes).toBe(guide.readingTimeMinutes);
38
- // fetchedAt is intentionally excluded from lean (noise in agent context)
39
- expect(lean).not.toHaveProperty("fetchedAt");
40
- });
41
-
42
- it("converts headings to flat markdown strings", () => {
43
- lean = toLean(guide);
44
- // Fixture has all level-2 headings
45
- expect(lean.headings).toBeInstanceOf(Array);
46
- expect(lean.headings.length).toBe(guide.headings.length);
47
- for (const h of lean.headings) {
48
- expect(typeof h).toBe("string");
49
- expect(h).toMatch(/^#{1,3} .+/);
50
- }
51
- // Spot-check first heading
52
- const first = guide.headings[0];
53
- expect(lean.headings[0]).toBe(`${"#".repeat(first.level)} ${first.text}`);
54
- });
55
-
56
- it("strips isExternal from links", () => {
57
- lean = toLean(guide);
58
- expect(lean.links.length).toBeGreaterThan(0);
59
- for (const link of lean.links) {
60
- expect(link).toHaveProperty("href");
61
- expect(link).toHaveProperty("text");
62
- expect(link).not.toHaveProperty("isExternal");
63
- }
64
- });
65
-
66
- it("passes tags through from source page", () => {
67
- lean = toLean(guide);
68
- expect(Array.isArray(lean.tags)).toBe(true);
69
- });
70
-
71
- it("passes canonicalUrl when present", () => {
72
- const withCanonical = { ...guide, canonicalUrl: "https://example.com/canonical" };
73
- const l = toLean(withCanonical);
74
- expect(l.canonicalUrl).toBe("https://example.com/canonical");
75
- });
76
-
77
- it("omits canonicalUrl when absent", () => {
78
- const withoutCanonical = { ...guide };
79
- delete (withoutCanonical as Partial<typeof guide>).canonicalUrl;
80
- const l = toLean(withoutCanonical);
81
- expect(l.canonicalUrl).toBeUndefined();
82
- });
83
-
84
- it("sets chunkCount from the source chunks array", () => {
85
- lean = toLean(guide);
86
- expect(lean.chunkCount).toBe(guide.chunks.length);
87
- expect(lean.chunkCount).toBeGreaterThan(0);
88
- });
89
-
90
- it("omits chunks and markdown fields", () => {
91
- lean = toLean(guide);
92
- expect(lean).not.toHaveProperty("chunks");
93
- expect(lean).not.toHaveProperty("markdown");
94
- });
95
-
96
- it("is materially smaller than the full page", () => {
97
- lean = toLean(guide);
98
- const fullSize = JSON.stringify(guide).length;
99
- const leanSize = JSON.stringify(lean).length;
100
- // Lean should be less than 30% the size of full.
101
- // Links (up to 200) still carry href+text so the floor isn't zero,
102
- // but chunks and markdown — the two biggest fields — are gone.
103
- expect(leanSize).toBeLessThan(fullSize * 0.3);
104
- });
105
- });
package/test/playwright.test.ts DELETED
@@ -1,128 +0,0 @@
1
- /**
2
- * TDD tests for PlaywrightHttpClient captureImages option.
3
- *
4
- * No real browser is launched. We test:
5
- * 1. Option wiring — captureImages is stored and readable.
6
- * 2. Route logic — the abort/continue decision function in isolation.
7
- * 3. Interface conformance — PlaywrightHttpClient still satisfies IHttpClient.
8
- */
9
-
10
- import { describe, expect, it } from "vitest";
11
- import { PlaywrightHttpClient, createPlaywrightClient } from "../src/playwright.js";
12
- import type { IHttpClient } from "../src/ports.js";
13
-
14
- // ---------------------------------------------------------------------------
15
- // Extract the routing decision as a pure function so we can test it without
16
- // launching a browser. Mirrors the logic in playwright.ts fetch().
17
- // ---------------------------------------------------------------------------
18
-
19
- function shouldAbort(
20
- resourceType: string,
21
- acceptHeader: string,
22
- captureImages: boolean,
23
- ): boolean {
24
- const isImageFetch = acceptHeader.startsWith("image/");
25
- if (resourceType === "font") return true;
26
- if (["image", "media"].includes(resourceType) && !(captureImages && isImageFetch)) return true;
27
- return false;
28
- }
29
-
30
- // ---------------------------------------------------------------------------
31
- // Route logic — pure unit tests, no browser
32
- // ---------------------------------------------------------------------------
33
-
34
- describe("Playwright route abort logic", () => {
35
- describe("fonts — always aborted", () => {
36
- it("aborts font with captureImages: false", () => {
37
- expect(shouldAbort("font", "", false)).toBe(true);
38
- });
39
- it("aborts font with captureImages: true", () => {
40
- expect(shouldAbort("font", "image/*", true)).toBe(true);
41
- });
42
- });
43
-
44
- describe("images — aborted unless captureImages + Accept: image/*", () => {
45
- it("aborts image with captureImages: false", () => {
46
- expect(shouldAbort("image", "", false)).toBe(true);
47
- });
48
- it("aborts image with captureImages: false even if Accept: image/*", () => {
49
- expect(shouldAbort("image", "image/*", false)).toBe(true);
50
- });
51
- it("aborts image with captureImages: true but no image Accept header", () => {
52
- expect(shouldAbort("image", "text/html", true)).toBe(true);
53
- });
54
- it("allows image with captureImages: true AND Accept: image/*", () => {
55
- expect(shouldAbort("image", "image/*", true)).toBe(false);
56
- });
57
- it("allows image with captureImages: true AND Accept: image/jpeg", () => {
58
- expect(shouldAbort("image", "image/jpeg", true)).toBe(false);
59
- });
60
- it("allows image with captureImages: true AND Accept: image/png", () => {
61
- expect(shouldAbort("image", "image/png", true)).toBe(false);
62
- });
63
- });
64
-
65
- describe("media — same rules as image", () => {
66
- it("aborts media with captureImages: false", () => {
67
- expect(shouldAbort("media", "", false)).toBe(true);
68
- });
69
- it("aborts media with captureImages: true but no image Accept", () => {
70
- expect(shouldAbort("media", "video/mp4", true)).toBe(true);
71
- });
72
- it("allows media with captureImages: true AND Accept: image/*", () => {
73
- expect(shouldAbort("media", "image/*", true)).toBe(false);
74
- });
75
- });
76
-
77
- describe("other resource types — never aborted", () => {
78
- it.each(["document", "stylesheet", "script", "xhr", "fetch", "websocket"])(
79
- "allows %s regardless of captureImages",
80
- (type) => {
81
- expect(shouldAbort(type, "", false)).toBe(false);
82
- expect(shouldAbort(type, "", true)).toBe(false);
83
- },
84
- );
85
- });
86
- });
87
-
88
- // ---------------------------------------------------------------------------
89
- // Option wiring — captureImages stored on the instance
90
- // ---------------------------------------------------------------------------
91
-
92
- describe("PlaywrightHttpClient option wiring", () => {
93
- it("defaults captureImages to false", () => {
94
- const client = new PlaywrightHttpClient();
95
- // Access via cast — private field, but we verify the default behaviour
96
- // through the public interface indirectly. Here we just confirm construction.
97
- expect(client).toBeInstanceOf(PlaywrightHttpClient);
98
- });
99
-
100
- it("constructs with captureImages: true without throwing", () => {
101
- expect(() => new PlaywrightHttpClient({ captureImages: true })).not.toThrow();
102
- });
103
-
104
- it("constructs with captureImages: false without throwing", () => {
105
- expect(() => new PlaywrightHttpClient({ captureImages: false })).not.toThrow();
106
- });
107
-
108
- it("createPlaywrightClient passes captureImages through", () => {
109
- const client = createPlaywrightClient({ captureImages: true });
110
- expect(client).toBeInstanceOf(PlaywrightHttpClient);
111
- });
112
- });
113
-
114
- // ---------------------------------------------------------------------------
115
- // Interface conformance
116
- // ---------------------------------------------------------------------------
117
-
118
- describe("PlaywrightHttpClient interface conformance", () => {
119
- it("satisfies IHttpClient", () => {
120
- const client: IHttpClient = new PlaywrightHttpClient();
121
- expect(typeof client.fetch).toBe("function");
122
- });
123
-
124
- it("has a close() method", () => {
125
- const client = new PlaywrightHttpClient();
126
- expect(typeof client.close).toBe("function");
127
- });
128
- });
package/test/ports.test.ts DELETED
@@ -1,161 +0,0 @@
1
- /**
2
- * TDD tests for HttpResponse.arrayBuffer()
3
- *
4
- * All tests use stub HTTP clients — no real network.
5
- */
6
-
7
- import { readFileSync } from "node:fs";
8
- import { join } from "node:path";
9
- import { describe, expect, it } from "vitest";
10
- import type { HttpResponse, IHttpClient } from "../src/ports.js";
11
-
12
- // ---------------------------------------------------------------------------
13
- // Helpers
14
- // ---------------------------------------------------------------------------
15
-
16
- function makeStubResponse(overrides: Partial<HttpResponse> = {}): HttpResponse {
17
- return {
18
- ok: true,
19
- status: 200,
20
- statusText: "OK",
21
- headers: { get: () => null },
22
- text: async () => "",
23
- arrayBuffer: async () => new ArrayBuffer(0),
24
- ...overrides,
25
- };
26
- }
27
-
28
- // ---------------------------------------------------------------------------
29
- // Interface conformance (TypeScript structural check via satisfies)
30
- // ---------------------------------------------------------------------------
31
-
32
- describe("HttpResponse interface", () => {
33
- it("stub with arrayBuffer() satisfies HttpResponse", () => {
34
- const stub = {
35
- ok: true,
36
- status: 200,
37
- statusText: "OK",
38
- headers: { get: (_name: string) => null as string | null },
39
- text: async () => "",
40
- arrayBuffer: async () => new ArrayBuffer(4),
41
- } satisfies HttpResponse;
42
-
43
- expect(typeof stub.arrayBuffer).toBe("function");
44
- });
45
-
46
- it("IHttpClient stub with arrayBuffer-returning fetch satisfies the port", () => {
47
- const client: IHttpClient = {
48
- fetch: async (_req) => makeStubResponse({ arrayBuffer: async () => new ArrayBuffer(8) }),
49
- };
50
- expect(typeof client.fetch).toBe("function");
51
- });
52
- });
53
-
54
- // ---------------------------------------------------------------------------
55
- // arrayBuffer() returns correct bytes
56
- // ---------------------------------------------------------------------------
57
-
58
- describe("arrayBuffer() byte content", () => {
59
- it("resolves to an ArrayBuffer", async () => {
60
- const response = makeStubResponse({
61
- arrayBuffer: async () => new ArrayBuffer(4),
62
- });
63
- const buf = await response.arrayBuffer();
64
- expect(buf).toBeInstanceOf(ArrayBuffer);
65
- });
66
-
67
- it("returns the correct byte length", async () => {
68
- const response = makeStubResponse({
69
- arrayBuffer: async () => new ArrayBuffer(16),
70
- });
71
- const buf = await response.arrayBuffer();
72
- expect(buf.byteLength).toBe(16);
73
- });
74
-
75
- it("returns correct bytes from a known fixture", async () => {
76
- const tinyPng = readFileSync(
77
- join(import.meta.dirname, "../fixtures/images/tiny.png"),
78
- );
79
- const expected = tinyPng.buffer.slice(
80
- tinyPng.byteOffset,
81
- tinyPng.byteOffset + tinyPng.byteLength,
82
- ) as ArrayBuffer;
83
-
84
- const response = makeStubResponse({
85
- arrayBuffer: async () => expected,
86
- });
87
-
88
- const buf = await response.arrayBuffer();
89
- expect(buf.byteLength).toBe(tinyPng.byteLength);
90
-
91
- const view = new Uint8Array(buf);
92
- // PNG magic bytes: 0x89 0x50 0x4E 0x47
93
- expect(view[0]).toBe(0x89);
94
- expect(view[1]).toBe(0x50); // P
95
- expect(view[2]).toBe(0x4e); // N
96
- expect(view[3]).toBe(0x47); // G
97
- });
98
-
99
- it("returns zero-length buffer when resource is empty", async () => {
100
- const response = makeStubResponse({
101
- arrayBuffer: async () => new ArrayBuffer(0),
102
- });
103
- const buf = await response.arrayBuffer();
104
- expect(buf.byteLength).toBe(0);
105
- });
106
- });
107
-
108
- // ---------------------------------------------------------------------------
109
- // Default fetch adapter — arrayBuffer() on real tiny PNG bytes
110
- // ---------------------------------------------------------------------------
111
-
112
- describe("default fetch adapter arrayBuffer() via mock client", () => {
113
- it("mock client that returns tiny.png bytes produces correct ArrayBuffer", async () => {
114
- const tinyPng = readFileSync(
115
- join(import.meta.dirname, "../fixtures/images/tiny.png"),
116
- );
117
-
118
- const client: IHttpClient = {
119
- fetch: async (_req) =>
120
- makeStubResponse({
121
- arrayBuffer: async () =>
122
- tinyPng.buffer.slice(
123
- tinyPng.byteOffset,
124
- tinyPng.byteOffset + tinyPng.byteLength,
125
- ) as ArrayBuffer,
126
- }),
127
- };
128
-
129
- const res = await client.fetch({ url: "https://example.com/tiny.png" });
130
- const buf = await res.arrayBuffer();
131
-
132
- expect(buf.byteLength).toBe(tinyPng.byteLength);
133
- const view = new Uint8Array(buf);
134
- expect(view[0]).toBe(0x89); // PNG magic
135
- });
136
-
137
- it("base64-encoding an ArrayBuffer from mock produces correct data URL prefix", async () => {
138
- const tinyPng = readFileSync(
139
- join(import.meta.dirname, "../fixtures/images/tiny.png"),
140
- );
141
-
142
- const client: IHttpClient = {
143
- fetch: async (_req) =>
144
- makeStubResponse({
145
- arrayBuffer: async () =>
146
- tinyPng.buffer.slice(
147
- tinyPng.byteOffset,
148
- tinyPng.byteOffset + tinyPng.byteLength,
149
- ) as ArrayBuffer,
150
- }),
151
- };
152
-
153
- const res = await client.fetch({ url: "https://example.com/tiny.png" });
154
- const buf = await res.arrayBuffer();
155
- const b64 = Buffer.from(buf).toString("base64");
156
- const dataUrl = `data:image/png;base64,${b64}`;
157
-
158
- expect(dataUrl).toMatch(/^data:image\/png;base64,/);
159
- expect(b64.length).toBeGreaterThan(0);
160
- });
161
- });