@dpopsuev/web-spider 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batch.d.ts +24 -0
- package/dist/batch.d.ts.map +1 -0
- package/dist/batch.js +68 -0
- package/dist/cache.d.ts +40 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +78 -0
- package/dist/convert.d.ts +29 -0
- package/dist/convert.d.ts.map +1 -0
- package/dist/convert.js +131 -0
- package/dist/crawl.d.ts +56 -0
- package/dist/crawl.d.ts.map +1 -0
- package/dist/crawl.js +126 -0
- package/dist/disk-cache.d.ts +75 -0
- package/dist/disk-cache.d.ts.map +1 -0
- package/dist/disk-cache.js +185 -0
- package/dist/graph.d.ts +76 -0
- package/dist/graph.d.ts.map +1 -0
- package/dist/graph.js +156 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +44 -0
- package/dist/parse.d.ts +27 -0
- package/dist/parse.d.ts.map +1 -0
- package/dist/parse.js +131 -0
- package/dist/playwright.d.ts +75 -0
- package/dist/playwright.d.ts.map +1 -0
- package/dist/playwright.js +141 -0
- package/dist/ports.d.ts +104 -0
- package/dist/ports.d.ts.map +1 -0
- package/dist/ports.js +10 -0
- package/dist/robots.d.ts +24 -0
- package/dist/robots.d.ts.map +1 -0
- package/dist/robots.js +104 -0
- package/dist/search.d.ts +47 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +112 -0
- package/dist/sitemap.d.ts +15 -0
- package/dist/sitemap.d.ts.map +1 -0
- package/dist/sitemap.js +65 -0
- package/dist/spider.d.ts +74 -0
- package/dist/spider.d.ts.map +1 -0
- package/dist/spider.js +349 -0
- package/dist/throttle.d.ts +49 -0
- package/dist/throttle.d.ts.map +1 -0
- package/dist/throttle.js +85 -0
- package/dist/tree.d.ts +34 -0
- package/dist/tree.d.ts.map +1 -0
- package/dist/tree.js +354 -0
- package/dist/types.d.ts +189 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/views.d.ts +17 -0
- package/dist/views.d.ts.map +1 -0
- package/dist/views.js +39 -0
- package/dist/web-search.d.ts +184 -0
- package/dist/web-search.d.ts.map +1 -0
- package/dist/web-search.js +399 -0
- package/fixtures/article-with-images.html +94 -0
- package/fixtures/gh-shell.html +32 -0
- package/fixtures/guide-ai-agents-web-scraping.json +552 -0
- package/fixtures/images/large.jpg +0 -0
- package/fixtures/images/small.jpg +0 -0
- package/fixtures/images/tiny.png +0 -0
- package/fixtures/quotes-index.json +40 -0
- package/package.json +47 -0
- package/scripts/fetch-guide.mjs +25 -0
- package/src/cache.ts +99 -0
- package/src/convert.ts +161 -0
- package/src/crawl.ts +186 -0
- package/src/disk-cache.ts +228 -0
- package/src/graph.ts +189 -0
- package/src/index.ts +74 -0
- package/src/parse.ts +154 -0
- package/src/playwright.ts +193 -0
- package/src/ports.ts +131 -0
- package/src/robots.ts +121 -0
- package/src/search.ts +173 -0
- package/src/sitemap.ts +67 -0
- package/src/spider.ts +475 -0
- package/src/throttle.ts +118 -0
- package/src/tree.ts +379 -0
- package/src/types.ts +225 -0
- package/src/views.ts +42 -0
- package/src/web-search.ts +548 -0
- package/test/convert-images.test.ts +69 -0
- package/test/disk-cache-images.test.ts +193 -0
- package/test/engine-registry.test.ts +114 -0
- package/test/exports.test.ts +124 -0
- package/test/get-chunk.test.ts +115 -0
- package/test/images-integration.test.ts +359 -0
- package/test/improvements.test.ts +279 -0
- package/test/inbound-count.test.ts +111 -0
- package/test/lean.test.ts +105 -0
- package/test/playwright.test.ts +128 -0
- package/test/ports.test.ts +161 -0
- package/test/search.test.ts +219 -0
- package/test/spider-images.test.ts +180 -0
- package/test/spider-unit.test.ts +610 -0
- package/test/tree.test.ts +272 -0
- package/test/types.test.ts +169 -0
- package/test/web-search-integration.test.ts +180 -0
- package/test/web-search.test.ts +305 -0
- package/tsconfig.json +9 -0
- package/tsconfig.test.json +7 -0
- package/vitest.config.ts +8 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<title>Article With Images — Fixture</title>
|
|
6
|
+
<meta name="description" content="A fixture page for testing image scraping.">
|
|
7
|
+
<meta name="keywords" content="images, scraping, fixtures">
|
|
8
|
+
<link rel="canonical" href="https://example.com/article-with-images">
|
|
9
|
+
</head>
|
|
10
|
+
<body>
|
|
11
|
+
<header>
|
|
12
|
+
<nav><a href="/">Home</a></nav>
|
|
13
|
+
</header>
|
|
14
|
+
|
|
15
|
+
<article>
|
|
16
|
+
<h1>Understanding Web Images for AI Agents</h1>
|
|
17
|
+
|
|
18
|
+
<p>
|
|
19
|
+
Images are a fundamental part of the modern web. When AI agents scrape pages, they
|
|
20
|
+
often need to capture visual content — charts, product photos, diagrams — alongside
|
|
21
|
+
the text. This article covers the key considerations for image-aware scraping.
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
<!-- 1. Absolute JPEG src, descriptive alt -->
|
|
25
|
+
<img
|
|
26
|
+
src="https://example.com/images/hero.jpg"
|
|
27
|
+
alt="A hero image showing a web spider diagram"
|
|
28
|
+
>
|
|
29
|
+
|
|
30
|
+
<p>
|
|
31
|
+
The first consideration is format. JPEG is the dominant format for photographs on
|
|
32
|
+
the web, while PNG is preferred for screenshots and diagrams that require lossless
|
|
33
|
+
compression. WebP is increasingly common as a modern alternative offering superior
|
|
34
|
+
compression ratios.
|
|
35
|
+
</p>
|
|
36
|
+
|
|
37
|
+
<!-- 2. Absolute PNG src, descriptive alt -->
|
|
38
|
+
<img
|
|
39
|
+
src="https://example.com/images/diagram.png"
|
|
40
|
+
alt="Architecture diagram of the web spider pipeline"
|
|
41
|
+
>
|
|
42
|
+
|
|
43
|
+
<p>
|
|
44
|
+
For AI agents that feed images to vision-language models, the wire format that all
|
|
45
|
+
major LLM APIs accept is a base64-encoded data URL:
|
|
46
|
+
<code>data:image/jpeg;base64,…</code>. This means an agent can scrape an image,
|
|
47
|
+
encode it in-memory, and pass it directly to GPT-4o or Claude without any
|
|
48
|
+
intermediate file I/O.
|
|
49
|
+
</p>
|
|
50
|
+
|
|
51
|
+
<!-- 3. Absolute WebP src, descriptive alt -->
|
|
52
|
+
<img
|
|
53
|
+
src="https://example.com/images/chart.webp"
|
|
54
|
+
alt="Performance chart comparing image formats by file size"
|
|
55
|
+
>
|
|
56
|
+
|
|
57
|
+
<p>
|
|
58
|
+
Caching is the other key concern. Fetching the same image repeatedly wastes
|
|
59
|
+
bandwidth and risks rate limiting. A sensible strategy is to cache small images
|
|
60
|
+
(under 32 KB) inline as base64 in the page cache, and write larger images as binary
|
|
61
|
+
files to a sibling <code>images/</code> directory, storing only the file path in
|
|
62
|
+
the JSON index.
|
|
63
|
+
</p>
|
|
64
|
+
|
|
65
|
+
<!-- 4. Relative src — must be resolved to absolute against the page URL -->
|
|
66
|
+
<img
|
|
67
|
+
src="/images/chart.png"
|
|
68
|
+
alt="Relative URL image — should be resolved to absolute"
|
|
69
|
+
>
|
|
70
|
+
|
|
71
|
+
<!-- 5. data: URL — should be included without making a network request -->
|
|
72
|
+
<img
|
|
73
|
+
src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwADhQGAWjR9awAAAABJRU5ErkJggg=="
|
|
74
|
+
alt="Inline data URL image — 1x1 pixel PNG"
|
|
75
|
+
>
|
|
76
|
+
|
|
77
|
+
<!-- 6. No alt attribute — alt should default to empty string -->
|
|
78
|
+
<img src="https://example.com/images/no-alt.jpg">
|
|
79
|
+
|
|
80
|
+
<h2>Conclusion</h2>
|
|
81
|
+
|
|
82
|
+
<p>
|
|
83
|
+
Web image scraping for AI agents is straightforward when broken into three layers:
|
|
84
|
+
fetch (extend the HTTP client to return binary data), normalise (base64 + MIME type),
|
|
85
|
+
and persist (inline for small, file-backed for large). The resulting
|
|
86
|
+
<code>ImageRef</code> objects are immediately usable by any vision-capable LLM.
|
|
87
|
+
</p>
|
|
88
|
+
</article>
|
|
89
|
+
|
|
90
|
+
<footer>
|
|
91
|
+
<a href="/privacy">Privacy</a>
|
|
92
|
+
</footer>
|
|
93
|
+
</body>
|
|
94
|
+
</html>
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8">
|
|
5
|
+
<title>Issues · hyprwm/aquamarine · GitHub</title>
|
|
6
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
7
|
+
<meta name="description" content="">
|
|
8
|
+
<link rel="stylesheet" href="/assets/github-abc123.css" crossorigin="anonymous">
|
|
9
|
+
<script type="module" src="/assets/wp-runtime-xyz.js"></script>
|
|
10
|
+
<script type="module" src="/assets/vendors-node-xyz.js"></script>
|
|
11
|
+
<script type="module" src="/assets/app_assets-xyz.js"></script>
|
|
12
|
+
<meta name="turbo-body-classes" content="logged-out env-production">
|
|
13
|
+
</head>
|
|
14
|
+
<body class="logged-out env-production">
|
|
15
|
+
<div data-target="read-only-cursor-text-area.placeholder" id="placeholder" style="display:none"></div>
|
|
16
|
+
<div class="position-relative js-header-wrapper">
|
|
17
|
+
<div class="Header" role="banner"></div>
|
|
18
|
+
</div>
|
|
19
|
+
<div id="start-of-content" class="show-on-focus"></div>
|
|
20
|
+
<div data-pjax-timeout="1000" id="js-pjax-loader-bar" class="pjax-loader-bar"></div>
|
|
21
|
+
<div hidden id="ajax-error-message" class="ajax-error-message flash flash-error"></div>
|
|
22
|
+
<div role="main" data-turbo-body class="logged-out">
|
|
23
|
+
<react-app data-ssr="false">
|
|
24
|
+
<div id="root"></div>
|
|
25
|
+
</react-app>
|
|
26
|
+
</div>
|
|
27
|
+
<footer class="footer" role="contentinfo">
|
|
28
|
+
<div class="footer-links"></div>
|
|
29
|
+
</footer>
|
|
30
|
+
<script crossorigin="anonymous" defer type="application/json" id="__PRIMER_DATA__">{"version":"1.0"}</script>
|
|
31
|
+
</body>
|
|
32
|
+
</html>
|