kaos-web 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. kaos_web-0.1.0/.gitignore +36 -0
  2. kaos_web-0.1.0/CHANGELOG.md +506 -0
  3. kaos_web-0.1.0/LICENSE +201 -0
  4. kaos_web-0.1.0/NOTICE +8 -0
  5. kaos_web-0.1.0/PKG-INFO +284 -0
  6. kaos_web-0.1.0/README.md +242 -0
  7. kaos_web-0.1.0/kaos_web/__init__.py +42 -0
  8. kaos_web-0.1.0/kaos_web/__main__.py +5 -0
  9. kaos_web-0.1.0/kaos_web/_version.py +1 -0
  10. kaos_web-0.1.0/kaos_web/browser_tools.py +1916 -0
  11. kaos_web-0.1.0/kaos_web/cli.py +288 -0
  12. kaos_web-0.1.0/kaos_web/clients/__init__.py +15 -0
  13. kaos_web-0.1.0/kaos_web/clients/browser.py +1122 -0
  14. kaos_web-0.1.0/kaos_web/clients/config.py +108 -0
  15. kaos_web-0.1.0/kaos_web/clients/http.py +412 -0
  16. kaos_web-0.1.0/kaos_web/clients/page_prep.py +269 -0
  17. kaos_web-0.1.0/kaos_web/clients/protocol.py +19 -0
  18. kaos_web-0.1.0/kaos_web/clients/user_agents.py +74 -0
  19. kaos_web-0.1.0/kaos_web/crawl_tools.py +576 -0
  20. kaos_web-0.1.0/kaos_web/data/user_agents.json +107 -0
  21. kaos_web-0.1.0/kaos_web/discover/__init__.py +54 -0
  22. kaos_web-0.1.0/kaos_web/discover/batch.py +95 -0
  23. kaos_web-0.1.0/kaos_web/discover/crawl.py +256 -0
  24. kaos_web-0.1.0/kaos_web/discover/discovery.py +296 -0
  25. kaos_web-0.1.0/kaos_web/discover/sitemap.py +361 -0
  26. kaos_web-0.1.0/kaos_web/domain/__init__.py +122 -0
  27. kaos_web-0.1.0/kaos_web/domain/dns.py +653 -0
  28. kaos_web-0.1.0/kaos_web/domain/fingerprint.py +331 -0
  29. kaos_web-0.1.0/kaos_web/domain/http.py +225 -0
  30. kaos_web-0.1.0/kaos_web/domain/models.py +380 -0
  31. kaos_web-0.1.0/kaos_web/domain/org.py +367 -0
  32. kaos_web-0.1.0/kaos_web/domain/profile.py +93 -0
  33. kaos_web-0.1.0/kaos_web/domain/security.py +265 -0
  34. kaos_web-0.1.0/kaos_web/domain/service.py +95 -0
  35. kaos_web-0.1.0/kaos_web/domain/tcp.py +378 -0
  36. kaos_web-0.1.0/kaos_web/domain/tls.py +170 -0
  37. kaos_web-0.1.0/kaos_web/domain/udp.py +724 -0
  38. kaos_web-0.1.0/kaos_web/domain/whois.py +373 -0
  39. kaos_web-0.1.0/kaos_web/domain_tools.py +1194 -0
  40. kaos_web-0.1.0/kaos_web/errors.py +145 -0
  41. kaos_web-0.1.0/kaos_web/extract/__init__.py +28 -0
  42. kaos_web-0.1.0/kaos_web/extract/html_to_ast.py +227 -0
  43. kaos_web-0.1.0/kaos_web/extract/images.py +255 -0
  44. kaos_web-0.1.0/kaos_web/extract/links.py +266 -0
  45. kaos_web-0.1.0/kaos_web/extract/metadata.py +155 -0
  46. kaos_web-0.1.0/kaos_web/extract/readability.py +349 -0
  47. kaos_web-0.1.0/kaos_web/extract/readability_l3.py +667 -0
  48. kaos_web-0.1.0/kaos_web/middleware/__init__.py +21 -0
  49. kaos_web-0.1.0/kaos_web/middleware/base.py +57 -0
  50. kaos_web-0.1.0/kaos_web/middleware/cache.py +377 -0
  51. kaos_web-0.1.0/kaos_web/middleware/rate_limit.py +103 -0
  52. kaos_web-0.1.0/kaos_web/middleware/retry.py +109 -0
  53. kaos_web-0.1.0/kaos_web/middleware/robots.py +132 -0
  54. kaos_web-0.1.0/kaos_web/models/__init__.py +11 -0
  55. kaos_web-0.1.0/kaos_web/models/metadata.py +44 -0
  56. kaos_web-0.1.0/kaos_web/models/request.py +37 -0
  57. kaos_web-0.1.0/kaos_web/models/response.py +47 -0
  58. kaos_web-0.1.0/kaos_web/py.typed +0 -0
  59. kaos_web-0.1.0/kaos_web/search/__init__.py +23 -0
  60. kaos_web-0.1.0/kaos_web/search/backends.py +413 -0
  61. kaos_web-0.1.0/kaos_web/security.py +138 -0
  62. kaos_web-0.1.0/kaos_web/serve.py +125 -0
  63. kaos_web-0.1.0/kaos_web/settings.py +303 -0
  64. kaos_web-0.1.0/kaos_web/tools.py +1128 -0
  65. kaos_web-0.1.0/pyproject.toml +173 -0
  66. kaos_web-0.1.0/tests/__init__.py +0 -0
  67. kaos_web-0.1.0/tests/benchmarks/test_benchmarks.py +170 -0
  68. kaos_web-0.1.0/tests/fixtures/README.md +46 -0
  69. kaos_web-0.1.0/tests/fixtures/article.html +112 -0
  70. kaos_web-0.1.0/tests/fixtures/books_toscrape.html +361 -0
  71. kaos_web-0.1.0/tests/fixtures/cornell_law.html +879 -0
  72. kaos_web-0.1.0/tests/fixtures/httpbin.html +14 -0
  73. kaos_web-0.1.0/tests/fixtures/readability/README.md +49 -0
  74. kaos_web-0.1.0/tests/fixtures/readability/category_listing.html +66 -0
  75. kaos_web-0.1.0/tests/fixtures/readability/corpus.json +132 -0
  76. kaos_web-0.1.0/tests/fixtures/readability/directory_listing.html +75 -0
  77. kaos_web-0.1.0/tests/fixtures/readability/docket_report.html +95 -0
  78. kaos_web-0.1.0/tests/fixtures/readability/multi_section_landing.html +63 -0
  79. kaos_web-0.1.0/tests/fixtures/readability/search_results_page.html +80 -0
  80. kaos_web-0.1.0/tests/fixtures/readability/team_directory_cards.html +71 -0
  81. kaos_web-0.1.0/tests/integration/__init__.py +0 -0
  82. kaos_web-0.1.0/tests/integration/test_273v_e2e.py +402 -0
  83. kaos_web-0.1.0/tests/integration/test_browser_interaction.py +604 -0
  84. kaos_web-0.1.0/tests/integration/test_crawl.py +303 -0
  85. kaos_web-0.1.0/tests/integration/test_domain_live.py +409 -0
  86. kaos_web-0.1.0/tests/integration/test_entity_live.py +74 -0
  87. kaos_web-0.1.0/tests/integration/test_mcp_web_pipeline.py +193 -0
  88. kaos_web-0.1.0/tests/integration/test_mcp_web_session.py +197 -0
  89. kaos_web-0.1.0/tests/integration/test_middleware_e2e.py +228 -0
  90. kaos_web-0.1.0/tests/integration/test_real_http.py +244 -0
  91. kaos_web-0.1.0/tests/integration/test_real_sites.py +242 -0
  92. kaos_web-0.1.0/tests/integration/test_search_live.py +164 -0
  93. kaos_web-0.1.0/tests/unit/__init__.py +0 -0
  94. kaos_web-0.1.0/tests/unit/conftest.py +43 -0
  95. kaos_web-0.1.0/tests/unit/domain/__init__.py +0 -0
  96. kaos_web-0.1.0/tests/unit/domain/test_dns.py +555 -0
  97. kaos_web-0.1.0/tests/unit/domain/test_fingerprint.py +387 -0
  98. kaos_web-0.1.0/tests/unit/domain/test_http.py +233 -0
  99. kaos_web-0.1.0/tests/unit/domain/test_models.py +357 -0
  100. kaos_web-0.1.0/tests/unit/domain/test_org.py +576 -0
  101. kaos_web-0.1.0/tests/unit/domain/test_profile.py +86 -0
  102. kaos_web-0.1.0/tests/unit/domain/test_security.py +281 -0
  103. kaos_web-0.1.0/tests/unit/domain/test_service.py +125 -0
  104. kaos_web-0.1.0/tests/unit/domain/test_tcp.py +168 -0
  105. kaos_web-0.1.0/tests/unit/domain/test_tcp_banner.py +255 -0
  106. kaos_web-0.1.0/tests/unit/domain/test_tls.py +255 -0
  107. kaos_web-0.1.0/tests/unit/domain/test_udp.py +568 -0
  108. kaos_web-0.1.0/tests/unit/domain/test_whois.py +438 -0
  109. kaos_web-0.1.0/tests/unit/test_api_naming.py +57 -0
  110. kaos_web-0.1.0/tests/unit/test_batch.py +124 -0
  111. kaos_web-0.1.0/tests/unit/test_browser_client.py +186 -0
  112. kaos_web-0.1.0/tests/unit/test_browser_interaction.py +963 -0
  113. kaos_web-0.1.0/tests/unit/test_browser_session_isolation.py +530 -0
  114. kaos_web-0.1.0/tests/unit/test_browser_tools_wrappers.py +1033 -0
  115. kaos_web-0.1.0/tests/unit/test_cache.py +374 -0
  116. kaos_web-0.1.0/tests/unit/test_cli_commands.py +306 -0
  117. kaos_web-0.1.0/tests/unit/test_clients_page_prep.py +413 -0
  118. kaos_web-0.1.0/tests/unit/test_crawl.py +204 -0
  119. kaos_web-0.1.0/tests/unit/test_crawl_tools.py +651 -0
  120. kaos_web-0.1.0/tests/unit/test_discovery.py +293 -0
  121. kaos_web-0.1.0/tests/unit/test_domain_tools.py +837 -0
  122. kaos_web-0.1.0/tests/unit/test_edge_cases.py +426 -0
  123. kaos_web-0.1.0/tests/unit/test_fuzz.py +207 -0
  124. kaos_web-0.1.0/tests/unit/test_html_to_ast.py +495 -0
  125. kaos_web-0.1.0/tests/unit/test_http_client.py +335 -0
  126. kaos_web-0.1.0/tests/unit/test_metadata.py +108 -0
  127. kaos_web-0.1.0/tests/unit/test_middleware.py +359 -0
  128. kaos_web-0.1.0/tests/unit/test_readability.py +67 -0
  129. kaos_web-0.1.0/tests/unit/test_readability_experiments.py +66 -0
  130. kaos_web-0.1.0/tests/unit/test_readability_l3.py +265 -0
  131. kaos_web-0.1.0/tests/unit/test_response_capture.py +1311 -0
  132. kaos_web-0.1.0/tests/unit/test_search_backends.py +269 -0
  133. kaos_web-0.1.0/tests/unit/test_security.py +213 -0
  134. kaos_web-0.1.0/tests/unit/test_serve.py +175 -0
  135. kaos_web-0.1.0/tests/unit/test_settings.py +306 -0
  136. kaos_web-0.1.0/tests/unit/test_sitemap.py +410 -0
  137. kaos_web-0.1.0/tests/unit/test_tools.py +1017 -0
@@ -0,0 +1,36 @@
1
+ # Build / dist
2
+ build/
3
+ dist/
4
+ *.egg-info/
5
+
6
+ # Caches
7
+ .benchmarks/
8
+ .coverage
9
+ .coverage.*
10
+ .kaos-vfs/
11
+ .kaos-vfs-*/
12
+ .mypy_cache/
13
+ .pytest_cache/
14
+ .ruff_cache/
15
+ .ty_cache/
16
+ .venv/
17
+ __pycache__/
18
+ coverage.xml
19
+ htmlcov/
20
+ *.pyc
21
+
22
+ # OS / editor scratch
23
+ .DS_Store
24
+ Thumbs.db
25
+ .idea/
26
+ .vscode/
27
+
28
+ # Secrets — never commit
29
+ .env
30
+ .env.*
31
+ !.env.example
32
+ *.pem
33
+ *.key
34
+ *.p12
35
+ *.pfx
36
+ .kaos-credentials.json
@@ -0,0 +1,506 @@
1
+ # Changelog
2
+
3
+ All notable changes to `kaos-web` are documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ 
9
+ ## [Unreleased]
10
+
11
+
12
+ ## [0.1.0] — 2026-05-20
13
+
14
+ ### Released
15
+
16
+ - 0.1.0 GA — WU-L of GA plan. First stable release. Public API frozen.
17
+ - Pin floor raised to `>=0.1.0,<0.2` across all kaos-* runtime and
18
+ optional dependencies. Refreshed `uv.lock` to pick up the 0.1.0
19
+ line of every upstream.
20
+
21
+ ### Internal
22
+
23
+ - WU-L of the 0.1.0 GA plan
24
+ (`kaos-modules/docs/plans/2026-05-20-0.1.0-ga-plan.md`).
25
+
26
+
27
+ ## [0.1.0rc1] — 2026-05-20
28
+
29
+ ### Changed
30
+
31
+ - Pin floor raised to `>=0.1.0rc1,<0.2` across kaos-* runtime and
32
+ optional dependencies (`kaos-core`, `kaos-content`, `kaos-mcp`,
33
+ `kaos-nlp-core`). Refreshed `uv.lock` to pick up the rc1 line of
34
+ every upstream.
35
+
36
+ ### Internal
37
+
38
+ - WU-J of the 0.1.0 GA plan
39
+ (`kaos-modules/docs/plans/2026-05-20-0.1.0-ga-plan.md`).
40
+ Release candidate; freezes the public API for `kaos-web`
41
+ ahead of 0.1.0 GA.
42
+
43
+
44
+ ## [0.1.0a6] — 2026-05-20
45
+
46
+ ### Changed
47
+
48
+ - Bumped minimum `kaos-core` to `0.1.0a12` (post-URI-redesign +
49
+ Capability type). kaos-web does not use the URI redesign directly —
50
+ the bump aligns the supported floor with the rest of the kaos-*
51
+ DAG ahead of 0.1.0 GA.
52
+ - Refreshed `uv.lock` to pick up `kaos-core 0.1.0a12`,
53
+ `kaos-content 0.1.0a12`, `kaos-mcp 0.1.0a4`, and
54
+ `kaos-nlp-core 0.1.0a8`.
55
+
56
+ ### Internal
57
+
58
+ - WU-F.6 of the 0.1.0 GA plan
59
+ (`kaos-modules/docs/plans/2026-05-20-0.1.0-ga-plan.md`):
60
+ catch-up to kaos-core 0.1.0a12.
61
+
62
+ ## [0.1.0a5] — 2026-05-17
63
+
64
+ ### Changed (intentional break — alpha train)
65
+
66
+ - **`kaos-web-crawl-site`** and **`kaos-web-batch-fetch`** no longer
67
+ silently truncate page content at 5000 characters in the
68
+ no-runtime-context fallback path. The four `[:5000]` truncations
69
+ and the `truncated: bool` flag are **deleted** from
70
+ `crawl_tools.py`:
71
+ - `CrawlSiteTool.execute` (text + markdown fallback branches)
72
+ - `_extract_response` helper used by `BatchFetchTool` (text +
73
+ markdown branches)
74
+
75
+ The artifact-tier happy path (`_store_response_artifact` /
76
+ `_store_crawl_page_artifact`) — which already activates when a
77
+ `KaosContext` with a `KaosRuntime` is supplied — is the canonical
78
+ flow for large pages. The fallback path now returns full content
79
+ unbounded; downstream callers should supply a runtime context to
80
+ get the tiered (inline / summary+link / link-only) experience
81
+ driven by the `KaosCoreArtifactSettings` thresholds shipped in
82
+ kaos-core 0.1.0a8.
83
+
84
+ **Output-shape break:** the `truncated` key is gone from both
85
+ fallback responses. Callers that read it must remove the read.
86
+
87
+ ### Why
88
+
89
+ Stage B3 of the cross-package
90
+ `no-hardcoded-caps-and-artifact-first-tool-results` plan in the
91
+ kaos-modules monorepo. The 5000-char silent truncation hid information
92
+ from downstream agents — long pages came back claiming
93
+ `"truncated": true` but the full text was discarded. With the artifact
94
+ path already wired for the runtime-context case, the surgical fix is
95
+ to delete the fallback truncation and trust the artifact tier system
96
+ to handle size.
97
+
98
+ ### Constants audit
99
+
100
+ ```bash
101
+ $ git grep '\[:5000\]\|max_chars\s*=\|content_max_chars' kaos_web/
102
+ # (no hits in production code)
103
+ ```
104
+
105
+ ### Dependencies
106
+
107
+ No version pin changes. `kaos-web` continues to declare
108
+ `kaos-core>=0.1.0a4,<0.2`; the artifact helpers already used by
109
+ `_store_response_artifact` / `_store_crawl_page_artifact` predate
110
+ 0.1.0a8 and don't require the new API surface.
111
+
112
+ ## [0.1.0a4] — 2026-05-15
113
+
114
+ ### Added — `tags=["browser"]` / `tags=["netinfra"]` on Playwright + DNS/WHOIS tools (PRD PR 2 Stage A.5)
115
+
116
+ kaos-agents 0.1.0a3's `derive_group()` reads recognized tags as
117
+ narrowing signals: `["browser"]` routes a tool to the SessionToolSet
118
+ `browser` group; `["netinfra"]` routes to `netinfra`. Without these
119
+ tags, both surfaces would land in the broader `web` group and
120
+ accidentally surface on the default research preset, even for
121
+ sessions that haven't opted into Playwright or netinfra
122
+ introspection.
123
+
124
+ Affected tools:
125
+
126
+ - **Browser** (19 tools — every tool in `browser_tools.py`):
127
+ `browser-navigate`, `-click`, `-fill`, `-type`, `-press`,
128
+ `-select`, `-screenshot`, `-evaluate`, `-snapshot`, `-content`,
129
+ `-cookies`, `-set-cookie`, `-save-auth`, `-log-requests`,
130
+ `-requests`, `-get-request`, `-captured-responses`,
131
+ `-list-contexts`, `-close-context`.
132
+ - **Netinfra** (14 tools — every tool in `domain_tools.py`):
133
+ `tcp-probe`, `tls-inspect`, `http-headers`, `service-detect`,
134
+ `dns-lookup`, `dns-enumerate`, `dns-zone-transfer`, `dns-security`,
135
+ `whois-lookup`, `domain-profile`, `extract-org`, `tcp-banner`,
136
+ `fingerprint-service`, `udp-probe`.
137
+
138
+ HTTP fetch + search tools in `tools.py` (9) and crawl tools in
139
+ `crawl_tools.py` (3) deliberately stay untagged — they're pure
140
+ `web` group and the derivation reaches them via
141
+ `openWorldHint=True` + `readOnlyHint=True` without needing a tag.
142
+
143
+ Tests:
144
+ - 3 new tests pin the tag coverage: every browser tool carries
145
+ `tags=["browser"]`; every netinfra tool carries `tags=["netinfra"]`;
146
+ web + crawl tools carry NEITHER.
147
+
148
+ Motivated by `kaos-modules/docs/internal/dynamic-tool-planning-completion-plan.md`
149
+ §4 Stage A.5. Purely additive: the `tags` field was empty before.
150
+
151
+ ## [0.1.0a3] — 2026-05-15
152
+
153
+ ### Added — `register_web_all_tools` convenience union (PRD PR 1)
154
+
155
+ - **`register_web_all_tools(runtime)`** — registers every kaos-web
156
+ MCP tool with one call. Composes the existing 4 group entry
157
+ points:
158
+ - `register_web_tools(runtime)` → 9 HTTP fetch + search tools
159
+ (SessionToolSet `web` group)
160
+ - `register_browser_tools(runtime)` → 19 Playwright tools
161
+ (`browser` group; `[browser]` extra needed at *runtime*, not
162
+ registration)
163
+ - `register_domain_tools(runtime)` → 14 DNS / WHOIS / TLS / TCP
164
+ banner / UDP probe / HTTP header / org-extract tools
165
+ (`netinfra` group; `[dns]` extra at runtime)
166
+ - `register_crawl_tools(runtime)` → 3 URL discovery / batch
167
+ fetch / full-site crawl tools (`web` group)
168
+
169
+ Total: **45 tools** registered.
170
+
171
+ The four group-specific registration functions retain their
172
+ existing names and behavior — no breaking changes. The new union
173
+ is purely additive for callers (single-user-chat backend,
174
+ power-user sessions) that want the full 45-tool surface in one
175
+ call instead of four.
176
+
177
+ Pins the SessionToolSet `web` / `browser` / `netinfra` group entry
178
+ points so kaos-agents (PR 2) can wire ceiling membership without
179
+ a new public surface bump.
180
+
181
+ Motivated by `kaos-modules/docs/internal/dynamic-tool-planning-prd.md`
182
+ §4 ("PR 1 — catalog expansion"; round-1 decision #3 — bump from 9
183
+ to 45 registered tools).
184
+
185
+ ## [0.1.0a2] — 2026-05-11
186
+
187
+ ### Documentation
188
+
189
+ - **CI: 3.15 lane blocked on upstream rpds-py / PyO3.** Expanded the
190
+ inline comment on the Python 3.15 matrix entry in ``ci.yml`` to
191
+ explain the upstream block in detail: ``rpds-py==0.30.0`` (pulled
192
+ transitively via ``jsonschema`` → ``referencing``) source-builds
193
+ with ``pyo3-ffi 0.27.2`` which caps at CPython 3.14, so
194
+ ``maturin pep517 build-wheel`` fails with ``the configured Python
195
+ version (3.15) is newer than PyO3's``. No local fix is possible:
196
+ we cannot drop ``jsonschema`` and ``rpds-py`` cannot be pinned to
197
+ an older release on 3.15 because no older release has 3.15
198
+ wheels either. Resolution is gated on rpds-py cutting a release
199
+ with PyO3 0.28+. Tracking pointer added so the comment is
200
+ actionable on each ecosystem refresh. The leg remains
201
+ ``experimental: true`` / ``continue-on-error: true`` so the
202
+ workflow signal stays green on PRs. No code change.
203
+
204
+ ### Security
205
+
206
+ - **Sitemap parser no longer falls back to vulnerable
207
+ `xml.etree.ElementTree`.** ``kaos_web/discover/sitemap.py``
208
+ previously tried lxml first (safe — ``resolve_entities=False`` on a
209
+ recovering parser) but fell back to stdlib
210
+ ``xml.etree.ElementTree.fromstring`` if lxml raised. The fallback
211
+ was unreachable in practice (lxml's recovering parser doesn't raise
212
+ on syntactic chaos — it returns a partial tree) and stdlib
213
+ ``etree.fromstring`` is itself vulnerable to XML attacks (XXE,
214
+ entity expansion, billion-laughs). Bandit B314 flagged it; dropped
215
+ the fallback entirely. If lxml's recovering parser raises
216
+ ``ValueError`` / ``XMLSyntaxError``, the sitemap is now treated as
217
+ unparseable and returns ``([], [])`` — same shape as the existing
218
+ ``except ParseError`` path. The runtime ``xml.etree`` import is now
219
+ TYPE_CHECKING-only (kept for the ``_find_text`` type annotation).
220
+ Files: ``kaos_web/discover/sitemap.py``.
221
+
222
+ ### Security
223
+
224
+ - **bandit + vulture now run in both pre-commit and CI.** Two new
225
+ hooks in ``.pre-commit-config.yaml`` (bandit + vulture), mirrored
226
+ by two new jobs in ``security.yml`` (``bandit (static security)``
227
+ + ``vulture (dead-code scan)``). Pre-commit gives contributors fast
228
+ feedback before push; CI makes the scan publicly visible on every
229
+ PR. Bandit skip list (``B101,B404,B603,B607``) justified inline
230
+ per audit; vulture runs at ``--min-confidence 100`` with the shared
231
+ ``--ignore-names`` family list. Both pass clean — see PR for the
232
+ prerequisite B314 sitemap fix that this PR depends on. Mirrors
233
+ the rollout from kaos-core.
234
+ ### Changed
235
+
236
+ - **uv.lock bumped to the current PyPI-latest of two kaos-* siblings:**
237
+ ``kaos-content`` 0.1.0a2 → 0.1.0a4 and ``kaos-core`` 0.1.0a4 →
238
+ 0.1.0a5. Both bumps are no-op for the kaos-web public API but pull
239
+ in upstream bug fixes / performance work. All 1337 unit tests
240
+ continue to pass.
241
+
242
+ ### Security
243
+
244
+ - **SSRF gate at every outbound URL/host site** (WEB5-001). Wires
245
+ ``kaos_core.security.validate_outbound_url`` (and the host-only
246
+ ``is_loopback`` / ``is_private_ip`` / ``is_metadata_service``
247
+ primitives) into every kaos-web fetch site so a misconfigured
248
+ caller — especially the HTTP-mode MCP server fronting multiple
249
+ agents — cannot reach link-local cloud-metadata services
250
+ (``169.254.169.254``), loopback, RFC1918 private networks, or
251
+ block-listed schemes (``file://``, ``javascript:``, ``data:``,
252
+ ``vbscript:``). New ``kaos_web.security`` module exposes
253
+ ``validate_url(url)`` and ``validate_host(host)`` thin wrappers
254
+ that translate ``UnsafeURLError`` into a new
255
+ ``UrlPolicyError(WebError)`` whose message includes the specific
256
+ policy field that fired plus the env var the operator can flip to
257
+ relax it. **Strict by default**: blocks private/loopback/metadata
258
+ and limits schemes to ``http``/``https``. Operators relax via
259
+ ``KAOS_SECURITY_BLOCK_PRIVATE_NETWORKS=0`` /
260
+ ``KAOS_SECURITY_BLOCK_LOOPBACK=0`` /
261
+ ``KAOS_SECURITY_BLOCK_METADATA_SERVICES=0`` /
262
+ ``KAOS_SECURITY_ALLOWED_HOSTS=["host","10.0.0.0/24",".example.com"]``.
263
+ Sites wired (6 URL gates + 15 host gates):
264
+ - URL: ``HttpClient._raw_fetch``, ``BrowserClient.fetch`` /
265
+ ``screenshot`` / ``evaluate``, ``analyze_headers``,
266
+ ``ExtractOrgTool.execute``.
267
+ - Host: ``probe_port`` / ``probe_ports`` / ``probe_banner`` /
268
+ ``probe_banners``, ``inspect_tls``, ``probe_dns`` / ``probe_ntp``
269
+ / ``probe_snmp`` / ``probe_syslog``, ``whois_lookup``, ``lookup``
270
+ / ``lookup_many`` / ``enumerate_dns`` / ``attempt_zone_transfer``
271
+ / ``reverse_ptr``.
272
+ Known gap: ``follow_redirects=True`` on httpx only validates the
273
+ original URL — the redirect target is not re-validated. Closing
274
+ this requires a connect-time hook on the HTTP client (kaos-core
275
+ follow-up). Hostname-only inputs (most DNS / WHOIS use cases)
276
+ pass through; the gate fires on IP literals where the policy
277
+ classification is unambiguous.
278
+
279
+ - **Browser contexts are now session-scoped** (WEB5-002). Every entry
280
+ in ``BrowserClient._contexts`` / ``_pages`` / ``_request_logs`` /
281
+ ``_response_bodies`` / ``_logging_config`` is keyed by the tuple
282
+ ``(KaosContext.session_id, context_id)``. Previously, the shared
283
+ process-global client was keyed by raw ``context_id`` strings — any
284
+ caller who knew or guessed a context_id could click/fill/screenshot
285
+ another caller's pages, read their cookies, or download captured
286
+ fetch/XHR bodies. With the MCP HTTP server fronting multiple
287
+ agents, that is a cross-tenant browser session takeover. Cross-
288
+ session lookups now miss uniformly with the same "No active page" /
289
+ "No context '<id>'" error a missing context returns — never
290
+ disclosing existence in another session. ``close_context`` from a
291
+ different session is a silent no-op. ``BrowserClient.active_contexts``
292
+ changed from a property to a ``method(session_id) -> list[str]``
293
+ that returns only the calling session's context IDs. Library
294
+ callers that omit ``session_id`` fall back to a module-level
295
+ ``ANONYMOUS_SESSION_ID`` sentinel so the original single-user stdio
296
+ surface keeps working without churn.
297
+
298
+ - **`SaveAuthStateTool` no longer accepts a caller-supplied filesystem
299
+ path** (WEB5-004). The previous implementation passed an MCP-input
300
+ ``path`` straight to Playwright's ``context.storage_state(path=...)``
301
+ — path-traversal / arbitrary-write to anywhere the server process
302
+ could write, plus a credentials-leak persistence path. Rewritten to:
303
+ capture the storage state in-memory via new
304
+ ``BrowserClient.get_storage_state(context_id)``, write to a
305
+ session-scoped VFS path, and persist as a kaos-core artifact via
306
+ ``KaosContext.runtime.artifacts.create_from_path`` (auto-bound to
307
+ the caller's ``session_id``). Returns an ``ArtifactManifest`` the
308
+ agent retrieves via standard artifact MCP tools. **Breaking change
309
+ for the MCP tool input schema**: the ``path`` parameter is removed
310
+ and replaced with an optional ``name`` parameter (artifact name).
311
+ Library users with their own filesystem authority can still call
312
+ ``BrowserClient.save_storage_state(path)`` directly.
313
+
314
+ - **Observed third-party traffic redacts sensitive headers by default**
315
+ (WEB5-003). When ``kaos-web-browser-log-requests`` captures network
316
+ traffic, the recorded request and response headers now mask values
317
+ for ``Authorization``, ``Proxy-Authorization``, ``Cookie``,
318
+ ``Set-Cookie``, ``X-API-Key``, ``X-Auth-Token``, ``X-CSRF-Token``,
319
+ plus any header whose name matches the catch-all
320
+ ``(?i).*(?:secret|token|api[_-]?key|password|auth).*``. Mask format
321
+ ``<redacted: N bytes>`` preserves length information without leaking
322
+ the value. New ``KAOS_WEB_REDACT_OBSERVED_TRAFFIC`` env var
323
+ (default ``true``); set ``false`` for explicit security-research
324
+ workflows. The agent's OWN session cookies (returned by
325
+ ``kaos-web-browser-cookies`` / ``GetCookiesTool``) are NOT
326
+ redacted — they're the agent's own state.
327
+
328
+ - **Response-body size cap enforced at every fetch site** (WEB5-007).
329
+ New `KaosWebSettings.max_body_bytes` (env: `KAOS_WEB_MAX_BODY_BYTES`,
330
+ default 50 MB) bounds memory usage on hostile or misconfigured
331
+ endpoints. Enforced at three sites:
332
+ - `HttpClient._raw_fetch` switched to `client.stream() +
333
+ aiter_bytes()` with a pre-check on the declared `Content-Length`
334
+ header and a running tally over the streamed bytes. Aborts with
335
+ `BodyTooLargeError` before materialization.
336
+ - `BrowserClient.fetch` post-checks `len(page.content())` (Playwright
337
+ has no streaming variant) — protects downstream parsers and
338
+ artifact storage from operating on absurd strings.
339
+ - `kaos_web.discover.sitemap._decompress_gzip` switched to
340
+ `gzip.GzipFile.read(max_bytes + 1)` (gzip-bomb protection — a
341
+ small gzipped payload can decompress to gigabytes; bounded read
342
+ is the only memory-safe pattern).
343
+ New `BodyTooLargeError(WebError)` carries `size_bytes`,
344
+ `max_bytes`, and an agent-friendly recovery hint pointing at the
345
+ env var.
346
+
347
+ - **URL filter regexes now use the Rust regex engine when available**
348
+ (WEB5-008). `kaos_web.discover.discovery._compile_patterns` previously
349
+ built `re.compile(...)` patterns from caller-supplied include /
350
+ exclude regex strings and applied `pattern.search(...)` to every
351
+ discovered URL path. Stdlib `re` is a backtracking engine —
352
+ pathological patterns like `(a+)+b` against `"a" * N` run in
353
+ exponential time and block the asyncio event loop (ReDoS).
354
+ Compiled patterns now route through a `_SafePattern` shim that
355
+ prefers `kaos_nlp_core.matching.RegexMatcher` (Rust regex, linear
356
+ time, no backtracking) when the `[nlp]` optional extra is installed,
357
+ with stdlib `re` as a fallback (one-shot warning logged so operators
358
+ see the path). Install `kaos-web[nlp]` to get the protection by
359
+ default.
360
+
361
+ - **CacheMiddleware bypasses any request bearing auth-shaped headers**
362
+ (WEB5-009). Cache key is `method:url` only — without this gate, an
363
+ authenticated request would either return another caller's cached
364
+ anonymous response (read-leak) or poison the cache for subsequent
365
+ anonymous callers (write-leak). The bypass is conservative: if the
366
+ request includes any of `Authorization`, `Proxy-Authorization`,
367
+ `Cookie`, `X-API-Key`, `X-Auth-Token`, `X-CSRF-Token`
368
+ (case-insensitive), the cache is skipped entirely (no LOOKUP, no
369
+ STORE) and the request always hits upstream. Anonymous requests
370
+ still benefit from the cache normally.
371
+
372
+ - **TLS verification on domain-intelligence probes now defaults to ON**
373
+ (WEB5-006). The two probes that explicitly disable verification
374
+ (`kaos-web-http-headers`, `kaos-web-extract-org`) previously
375
+ defaulted to `KAOS_WEB_DOMAIN_VERIFY_TLS=false` (audit-02 WEB2-001
376
+ shipped the setting with that default; WEB5-006 flips it to `true`).
377
+ Secure-by-default: the typical use case is observing healthy public
378
+ sites where CA validation is correct. Set
379
+ `KAOS_WEB_DOMAIN_VERIFY_TLS=false` to inspect hosts whose cert is
380
+ itself the subject of inspection (self-signed, expired, mismatched
381
+ SAN, staging environments). **Migration**: anyone scraping such
382
+ hosts will see new TLS errors; explicitly opt out via the env var.
383
+ Content-extraction tools (`HttpClient` / `BrowserClient`) keep TLS
384
+ verification on independently of this flag.
385
+
386
+ - **Browser interaction tools now declare `destructiveHint=True`**
387
+ (WEB5-005). Click / fill / type / press / select / evaluate run
388
+ inside an authenticated browser session and CAN trigger real actions
389
+ (form submit, settings change, JS-driven side effects). The prior
390
+ shared `_BROWSER_WRITE_ANNOTATIONS` annotation said
391
+ `destructiveHint=False` for all of them, which weakened any MCP
392
+ client that gates auto-approval on the annotation. Split into
393
+ `_BROWSER_INTERACT_ANNOTATIONS` (destructive=True for the 6
394
+ interaction tools) and the existing `_BROWSER_WRITE_ANNOTATIONS`
395
+ (destructive=False for local-state tools that do not trigger remote
396
+ actions: set-cookie, save-auth-state, enable-request-logging,
397
+ close-context, navigate). No behavior change — annotation
398
+ correctness only.
399
+
400
+ ### Changed
401
+
402
+ - **Refactored package layout** for better cohesion (per
403
+ `docs/python/design/modules.md`):
404
+ - New `kaos_web.discover` subpackage groups the BFS-discovery
405
+ subsystem: `batch`, `crawl`, `discovery`, `sitemap` (formerly four
406
+ top-level modules with mutual imports). Re-exports the canonical
407
+ public API at the package level (e.g. `from kaos_web.discover
408
+ import batch_fetch, crawl_site, discover_urls, parse_sitemap`).
409
+ - `kaos_web.browser_page_prep` → `kaos_web.clients.page_prep`. Only
410
+ consumer was `clients/browser.py`; the helper is logically a
411
+ browser-client primitive.
412
+ - The four `*_tools.py` files (`tools.py`, `browser_tools.py`,
413
+ `crawl_tools.py`, `domain_tools.py`) **stay top-level** per the
414
+ explicit KAOS convention documented in
415
+ `docs/python/design/modules.md` ("split tool files by domain when
416
+ they would otherwise exceed ~1500 lines"). No tools/ subpackage.
417
+
418
+ Pre-0.1.0a1 — no published version pins these import paths, so no
419
+ back-compat shims are shipped. If you imported from these paths in
420
+ pre-release builds, update to the new locations.
421
+
422
+ ## [0.1.0a1] — 2026-05-08
423
+
424
+ First public alpha release.
425
+
426
+ ### Added
427
+
428
+ - Web content extraction for the KAOS (Kelvin Agentic Operating System)
429
+ platform. Fetches HTML from URLs over HTTP or a headless browser and
430
+ produces a `kaos-content` `ContentDocument` AST with provenance on every
431
+ block.
432
+ - **Dual-client architecture**: `HttpClient` (httpx, async, HTTP/2,
433
+ connection pooling, auth, SSL, proxy, structured error mapping) and
434
+ `BrowserClient` (Playwright, lazy launch, named-context page tracking,
435
+ cookie-banner dismissal for 8 known consent-management platforms).
436
+ - **HTML-to-AST extraction** (`html_to_document`): lxml element tree to
437
+ Block/Inline `ContentDocument` with `SourceRef` + `Provenance` on
438
+ every node. Supports headings, paragraphs, lists, blockquotes, code
439
+ blocks, tables, figures, definition lists, thematic breaks, and the
440
+ full inline grammar (text, strong, emphasis, code, link, image,
441
+ strikethrough, sub/superscript, line break).
442
+ - **Level-3 learned readability** (`extract.readability_l3`): pre-trained
443
+ logistic regression over 35 DOM-node features. Default extractor with
444
+ a `content_scope` parameter (0.0–1.0) controlling
445
+ precision/recall tradeoff. Heuristic readability and semantic
446
+ container detection (`<main>` → `<article>` → `[role=main]` → `<body>`)
447
+ remain as fallbacks when the L3 extraction returns `< 50` words.
448
+ - **Composable middleware chain** (`middleware/`): `RetryMiddleware`
449
+ (exponential backoff with jitter, honors `Retry-After`),
450
+ `RateLimitMiddleware` (per-domain token bucket), `RobotsMiddleware`
451
+ (stdlib `robotparser`, cached per domain), `CacheMiddleware`
452
+ (in-memory LRU, RFC 7231 compliant). Wired into `HttpClient.fetch()`
453
+ via `MiddlewareChain`; configurable per-client.
454
+ - **Search backends** (`search/`): SerpAPI, DuckDuckGo, Exa, Brave —
455
+ unified async interface with auto-detection from configured API keys.
456
+ - **Domain intelligence** (`domain/`): TCP probing + banner grab,
457
+ TLS cert inspection, HTTP header analysis with CDN detection and
458
+ security scoring, DNS lookup/enumeration/zone-transfer/security
459
+ posture, stdlib WHOIS client (55-TLD server map with referral
460
+ following), UDP protocol-aware probes (DNS / NTP / SNMPv1 / syslog),
461
+ pure banner→ServiceIdentity fingerprinting, and Schema.org
462
+ organization-entity extraction.
463
+ - **Multi-page workflows**: `discovery` (sitemaps + page links with
464
+ pattern filtering), `batch` (concurrent URL fetching with
465
+ `asyncio.Semaphore`), `crawl` (BFS site crawl with depth/page limits
466
+ and sitemap-first discovery).
467
+ - **45 MCP tools across 4 servers**:
468
+ - `register_web_tools()` — 9 HTTP fetch + search tools (including fetch-page, get-text,
469
+ get-markdown, get-metadata, search-page, get-links, get-images).
470
+ - `register_browser_tools()` — 19 browser interaction tools
471
+ (navigate, click, fill, type, press, select, screenshot, evaluate,
472
+ snapshot, content, cookies, set-cookie, save-auth, log-requests,
473
+ requests, get-request, captured-responses, list-contexts,
474
+ close-context).
475
+ - `register_crawl_tools()` — 3 multi-page tools (discover-urls,
476
+ batch-fetch, crawl-site).
477
+ - `register_domain_tools()` — 14 domain-intelligence tools
478
+ (tcp-probe, tcp-banner, tls-inspect, http-headers, service-detect,
479
+ fingerprint-service, dns-lookup, dns-enumerate, dns-zone-transfer,
480
+ dns-security, whois-lookup, domain-profile, extract-org,
481
+ udp-probe). Enabled with `kaos-web-serve --domain`.
482
+ - **Typed module settings** (`KaosWebSettings`): `KAOS_WEB_*` env prefix
483
+ with legacy fallbacks (`SERPAPI_API_KEY`, `EXA_API_KEY`,
484
+ `BRAVE_API_KEY`, `KAOS_BROWSER_*`, `KAOS_SEARCH_*`). API keys use
485
+ `pydantic.SecretStr`. Knobs cover browser (type/headless/channel),
486
+ search (backend selection + per-backend timeouts + DDG user-agent),
487
+ discovery, sitemap, crawl, and middleware behavior.
488
+ - **CLI** (`kaos-web`): `extract`, `metadata`, `serve` subcommands with
489
+ `--json` envelope output for piping/agents.
490
+ - **Standalone MCP server** (`kaos-web-serve`): stdio (default) or
491
+ streamable HTTP (`--http --port`); `--browser`, `--crawl`, `--domain`
492
+ flags compose the registered tool surface; `--debug` enables verbose
493
+ logging.
494
+ - **Optional extras**: `[browser]` adds Playwright for JS-heavy pages;
495
+ `[dns]` adds dnspython for the domain-intelligence DNS tools; `[mcp]`
496
+ adds `kaos-mcp` for serving tools as a FastMCP bridge; `[nlp]` adds
497
+ `kaos-nlp-core` for BM25 search inside extracted documents.
498
+
499
+ ### License
500
+
501
+ This release is the first to ship under the Apache License 2.0. Earlier
502
+ internal versions were proprietary.
503
+
504
+ [Unreleased]: https://github.com/273v/kaos-web/compare/v0.1.0...HEAD
505
+ [0.1.0a2]: https://github.com/273v/kaos-web/compare/v0.1.0a1...v0.1.0a2
506
+ [0.1.0a1]: https://github.com/273v/kaos-web/releases/tag/v0.1.0a1