@gajae-code/coding-agent 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/dist/types/cli/notify-cli.d.ts +2 -0
  3. package/dist/types/config/settings-schema.d.ts +39 -2
  4. package/dist/types/extensibility/shared-events.d.ts +1 -0
  5. package/dist/types/gjc-runtime/launch-tmux.d.ts +1 -0
  6. package/dist/types/gjc-runtime/ralplan-runtime.d.ts +1 -1
  7. package/dist/types/gjc-runtime/tmux-common.d.ts +3 -0
  8. package/dist/types/gjc-runtime/tmux-sessions.d.ts +2 -0
  9. package/dist/types/lsp/types.d.ts +2 -0
  10. package/dist/types/notifications/attachment-registry.d.ts +17 -0
  11. package/dist/types/notifications/chat-adapters.d.ts +9 -0
  12. package/dist/types/notifications/config.d.ts +9 -1
  13. package/dist/types/notifications/engine.d.ts +59 -0
  14. package/dist/types/notifications/managed-daemon.d.ts +48 -0
  15. package/dist/types/notifications/telegram-daemon.d.ts +19 -0
  16. package/dist/types/notifications/threaded-inbound.d.ts +19 -0
  17. package/dist/types/notifications/threaded-render.d.ts +6 -1
  18. package/dist/types/session/agent-session.d.ts +2 -0
  19. package/dist/types/tools/fetch.d.ts +23 -0
  20. package/dist/types/tools/index.d.ts +1 -0
  21. package/dist/types/tools/telegram-send.d.ts +32 -0
  22. package/dist/types/web/insane/bridge.d.ts +103 -0
  23. package/dist/types/web/insane/url-guard.d.ts +22 -0
  24. package/dist/types/web/search/provider.d.ts +18 -1
  25. package/dist/types/web/search/providers/insane.d.ts +53 -0
  26. package/dist/types/web/search/providers/text-citations.d.ts +23 -0
  27. package/dist/types/web/search/types.d.ts +12 -4
  28. package/package.json +10 -8
  29. package/scripts/verify-insane-vendor.ts +132 -0
  30. package/src/cli/args.ts +1 -1
  31. package/src/cli/fast-help.ts +1 -1
  32. package/src/cli/notify-cli.ts +152 -5
  33. package/src/cli.ts +1 -3
  34. package/src/commands/team.ts +1 -1
  35. package/src/config/settings-schema.ts +30 -1
  36. package/src/defaults/gjc/skills/ralplan/SKILL.md +11 -4
  37. package/src/edit/modes/replace.ts +1 -1
  38. package/src/extensibility/shared-events.ts +1 -0
  39. package/src/gjc-runtime/launch-tmux.ts +27 -5
  40. package/src/gjc-runtime/ledger-event-renderer.ts +1 -0
  41. package/src/gjc-runtime/ralplan-runtime.ts +2 -2
  42. package/src/gjc-runtime/tmux-common.ts +8 -0
  43. package/src/gjc-runtime/tmux-sessions.ts +8 -1
  44. package/src/gjc-runtime/workflow-manifest.generated.json +29 -0
  45. package/src/gjc-runtime/workflow-manifest.ts +7 -2
  46. package/src/hashline/hash.ts +1 -1
  47. package/src/internal-urls/docs-index.generated.ts +9 -8
  48. package/src/lsp/config.ts +16 -3
  49. package/src/lsp/defaults.json +7 -0
  50. package/src/lsp/types.ts +2 -0
  51. package/src/modes/controllers/event-controller.ts +15 -0
  52. package/src/modes/interactive-mode.ts +46 -2
  53. package/src/modes/utils/context-usage.ts +2 -2
  54. package/src/notifications/attachment-registry.ts +23 -0
  55. package/src/notifications/chat-adapters.ts +147 -0
  56. package/src/notifications/config.ts +23 -2
  57. package/src/notifications/engine.ts +100 -0
  58. package/src/notifications/index.ts +224 -45
  59. package/src/notifications/managed-daemon.ts +163 -0
  60. package/src/notifications/telegram-daemon.ts +235 -14
  61. package/src/notifications/threaded-inbound.ts +60 -4
  62. package/src/notifications/threaded-render.ts +20 -2
  63. package/src/session/agent-session.ts +82 -51
  64. package/src/tools/ask.ts +3 -2
  65. package/src/tools/fetch.ts +78 -1
  66. package/src/tools/index.ts +3 -0
  67. package/src/tools/telegram-send.ts +137 -0
  68. package/src/web/insane/bridge.ts +350 -0
  69. package/src/web/insane/url-guard.ts +155 -0
  70. package/src/web/search/provider.ts +77 -18
  71. package/src/web/search/providers/anthropic.ts +70 -3
  72. package/src/web/search/providers/codex.ts +1 -119
  73. package/src/web/search/providers/gemini.ts +99 -0
  74. package/src/web/search/providers/insane.ts +551 -0
  75. package/src/web/search/providers/openai-compatible.ts +66 -32
  76. package/src/web/search/providers/text-citations.ts +111 -0
  77. package/src/web/search/types.ts +13 -2
  78. package/vendor/insane-search/LICENSE +21 -0
  79. package/vendor/insane-search/MANIFEST.json +24 -0
  80. package/vendor/insane-search/engine/__init__.py +23 -0
  81. package/vendor/insane-search/engine/__main__.py +128 -0
  82. package/vendor/insane-search/engine/bias_check.py +183 -0
  83. package/vendor/insane-search/engine/executor.py +254 -0
  84. package/vendor/insane-search/engine/fetch_chain.py +725 -0
  85. package/vendor/insane-search/engine/learning.py +175 -0
  86. package/vendor/insane-search/engine/phase0.py +214 -0
  87. package/vendor/insane-search/engine/safety.py +91 -0
  88. package/vendor/insane-search/engine/templates/package.json +11 -0
  89. package/vendor/insane-search/engine/templates/playwright_mobile_chrome.js +188 -0
  90. package/vendor/insane-search/engine/templates/playwright_real_chrome.js +243 -0
  91. package/vendor/insane-search/engine/tests/test_hardening.py +57 -0
  92. package/vendor/insane-search/engine/tests/test_smoke.py +152 -0
  93. package/vendor/insane-search/engine/tests/test_u1.py +200 -0
  94. package/vendor/insane-search/engine/tests/test_u4.py +131 -0
  95. package/vendor/insane-search/engine/tests/test_u5.py +163 -0
  96. package/vendor/insane-search/engine/tests/test_u7.py +124 -0
  97. package/vendor/insane-search/engine/transport.py +211 -0
  98. package/vendor/insane-search/engine/url_transforms.py +98 -0
  99. package/vendor/insane-search/engine/validators.py +331 -0
  100. package/vendor/insane-search/engine/waf_detector.py +214 -0
  101. package/vendor/insane-search/engine/waf_profiles.yaml +162 -0
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Inline citation extraction shared by native web-search providers.
3
+ *
4
+ * Web-search-capable models sometimes return a genuinely grounded answer whose
5
+ * sources are written inline (markdown links or bare URLs) instead of as
6
+ * structured citation annotations. When a provider has independent proof that a
7
+ * web search actually ran, these helpers recover sources from the answer text so
8
+ * the result is not discarded.
9
+ */
10
+ import type { SearchSource } from "../types";
11
+
12
/**
 * Push `source` onto `sources` unless an entry with the same `url` is already
 * present. The array is mutated in place; nothing is returned.
 */
export function addSource(sources: SearchSource[], source: SearchSource): void {
	for (const known of sources) {
		// First occurrence wins: a later duplicate URL is ignored.
		if (known.url === source.url) return;
	}
	sources.push(source);
}
18
+
19
/**
 * Count how many code points of `text` are strictly equal to `target`.
 * The string is walked by code point (string iteration), not by UTF-16 unit.
 */
function countCharacter(text: string, target: string): number {
	return Array.from(text).reduce((total, symbol) => (symbol === target ? total + 1 : total), 0);
}
26
+
27
/**
 * Cleans up a URL candidate lifted out of prose or markdown.
 *
 * After trimming surrounding whitespace, trailing characters are removed one at
 * a time while the last character is sentence punctuation (`. , ! ? ; : ' "`)
 * or a closing `)`, `]` or `}` that outnumbers its opening counterpart inside
 * the candidate. What remains must start with `http://` or `https://` and be
 * parseable by `URL`.
 *
 * @returns The serialized URL, or `null` when the candidate is not an http(s)
 *   URL or fails to parse.
 */
export function normalizeExtractedUrl(candidate: string): string | null {
	const openerByCloser = new Map<string, string>([
		[")", "("],
		["]", "["],
		["}", "{"],
	]);

	// True when the final character is prose residue that should be dropped.
	const hasDisposableTail = (value: string): boolean => {
		const tail = value.at(-1);
		if (!tail) return false;
		if (/[.,!?;:'"]/u.test(tail)) return true;
		const opener = openerByCloser.get(tail);
		if (opener === undefined) return false;
		// A closer is only stripped when it has no matching opener in the URL,
		// so balanced pairs such as "Foo_(bar)" survive.
		return countCharacter(value, tail) > countCharacter(value, opener);
	};

	let cleaned = candidate.trim();
	while (hasDisposableTail(cleaned)) {
		cleaned = cleaned.slice(0, -1);
	}

	if (!/^https?:\/\//.test(cleaned)) return null;

	let parsed: URL;
	try {
		parsed = new URL(cleaned);
	} catch {
		return null;
	}
	return parsed.toString();
}
64
+
65
/**
 * Scans forward from `openParenIndex`, tracking parenthesis nesting, and
 * returns the index of the `)` that brings the nesting depth back to zero.
 *
 * @returns That index, or `null` when a newline is reached first, a `)` appears
 *   before any `(`, or the text ends while parentheses are still open.
 */
function findMarkdownLinkUrlEnd(text: string, openParenIndex: number): number | null {
	let openCount = 0;
	let cursor = openParenIndex;

	while (cursor < text.length) {
		const symbol = text[cursor];
		// A link destination never spans lines; bail out on a newline.
		if (!symbol || symbol === "\n") return null;

		if (symbol === "(") {
			openCount += 1;
		} else if (symbol === ")") {
			openCount -= 1;
			if (openCount === 0) return cursor;
			if (openCount < 0) return null;
		}
		cursor += 1;
	}

	return null;
}
83
+
84
/**
 * Recovers citation sources from answer text.
 *
 * Two passes feed the same de-duplicated list (via `addSource`):
 *  1. markdown links `[title](url)`, titled with the link text, or with the URL
 *     when the link text is blank;
 *  2. bare `http(s)://` URLs, titled with the URL itself.
 *
 * Because markdown links are collected first, a URL that appears both as a link
 * and as a bare URL keeps the link's title.
 */
export function extractTextSources(text: string): SearchSource[] {
	const collected: SearchSource[] = [];

	// Pass 1: markdown links. Hop from one "[" to the next.
	let cursor = text.indexOf("[");
	while (cursor !== -1) {
		let resumeAt = cursor + 1;
		const closeBracket = text.indexOf("]", cursor + 1);

		if (closeBracket !== -1 && text[closeBracket + 1] === "(") {
			const closeParen = findMarkdownLinkUrlEnd(text, closeBracket + 1);
			if (closeParen !== null) {
				const label = text.slice(cursor + 1, closeBracket).trim();
				const href = normalizeExtractedUrl(text.slice(closeBracket + 2, closeParen));
				if (href) addSource(collected, { title: label || href, url: href });
				// Resume after the whole link, even if its URL was rejected.
				resumeAt = closeParen + 1;
			}
		}

		cursor = text.indexOf("[", resumeAt);
	}

	// Pass 2: bare URLs anywhere in the text.
	for (const [rawUrl] of text.matchAll(/https?:\/\/\S+/g)) {
		const href = normalizeExtractedUrl(rawUrl ?? "");
		if (href) addSource(collected, { title: href, url: href });
	}

	return collected;
}
@@ -7,6 +7,7 @@
7
7
  /** Supported web search providers */
8
8
  export type SearchProviderId =
9
9
  | "duckduckgo"
10
+ | "insane"
10
11
  | "exa"
11
12
  | "brave"
12
13
  | "jina"
@@ -38,6 +39,7 @@ export interface ActiveSearchModelContext {
38
39
 
39
40
  export const CONFIGURABLE_SEARCH_PROVIDER_IDS = [
40
41
  "duckduckgo",
42
+ "insane",
41
43
  "exa",
42
44
  "brave",
43
45
  "jina",
@@ -162,6 +164,15 @@ export interface AnthropicCitation {
162
164
  encrypted_index: string;
163
165
  }
164
166
 
167
+ /**
168
+ * Error payload returned in `web_search_tool_result.content` when a server-side
169
+ * web search fails. Unlike the success case, this is an object, not an array.
170
+ */
171
+ export interface AnthropicWebSearchToolResultError {
172
+ type: "web_search_tool_result_error";
173
+ error_code?: string;
174
+ }
175
+
165
176
  export interface AnthropicContentBlock {
166
177
  type: string;
167
178
  /** Text content (for type="text") */
@@ -172,8 +183,8 @@ export interface AnthropicContentBlock {
172
183
  name?: string;
173
184
  /** Tool input (for type="server_tool_use") */
174
185
  input?: { query: string };
175
- /** Search results (for type="web_search_tool_result") */
176
- content?: AnthropicSearchResult[];
186
+ /** Search results array on success, or an error object on failure (type="web_search_tool_result") */
187
+ content?: AnthropicSearchResult[] | AnthropicWebSearchToolResultError;
177
188
  }
178
189
 
179
190
  export interface AnthropicApiResponse {
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 fivetaku
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,24 @@
1
+ {
2
+ "upstream": {
3
+ "repo": "https://github.com/fivetaku/insane-search",
4
+ "commit": "49306346b59aa89b5e96d98e1104da0890deed72",
5
+ "license": "MIT"
6
+ },
7
+ "vendoredAt": "2026-06-23",
8
+ "includedPaths": [
9
+ "skills/insane-search/engine/**",
10
+ "skills/insane-search/engine/templates/**",
11
+ "LICENSE"
12
+ ],
13
+ "excludedPaths": [
14
+ "setup/setup.sh",
15
+ "setup/gptaku-update-check.cjs",
16
+ ".claude-plugin/**",
17
+ "skills/insane-search/SKILL.md",
18
+ "skills/insane-search/references/**",
19
+ "skills/insane-search/tests/**"
20
+ ],
21
+ "exclusionRationale": "Excludes upstream install hooks (SessionStart settings.json mutation), GitHub star-baiting (gh api user/starred), the update-notifier, and the past-session transcript-language scanner. Only the runtime Phase 0-3 engine and its Playwright/stealth templates are vendored.",
22
+ "localPatches": [],
23
+ "notes": "Runtime engine is invoked via `python3 -m engine \"<url>\" --json` with cwd=this directory and PYTHONPATH pointed at this directory. Phase 0-2 require python3 + curl_cffi; Phase 3 requires node + playwright/playwright-extra/puppeteer-extra-plugin-stealth installed under engine/templates. GJC never auto-installs these dependencies."
24
+ }
@@ -0,0 +1,23 @@
1
+ """insane-search engine — generic WAF-profile-based fetch chain.
2
+
3
+ No site-specific logic lives here. Site specifics belong to runtime hints or
4
+ observations, never to code. See `../SKILL.md` for the No-Site-Name Rule.
5
+ """
6
+
7
+ from .validators import Verdict, ValidationResult, validate, CHALLENGE_MARKERS
8
+ from .waf_detector import detect
9
+ from .url_transforms import TRANSFORMS, apply_transform
10
+ from .fetch_chain import fetch, FetchResult, Attempt
11
+
12
+ __all__ = [
13
+ "Verdict",
14
+ "ValidationResult",
15
+ "validate",
16
+ "CHALLENGE_MARKERS",
17
+ "detect",
18
+ "TRANSFORMS",
19
+ "apply_transform",
20
+ "fetch",
21
+ "FetchResult",
22
+ "Attempt",
23
+ ]
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env python3
2
+ """CLI entrypoint for the insane-search engine.
3
+
4
+ Usage:
5
+ python3 -m engine URL [--selector CSS] [--device auto|desktop|mobile]
6
+ [--timeout N] [--max-attempts N] [--json] [--trace]
7
+
8
+ Examples:
9
+ python3 -m engine "https://example.com/" --selector "h1"
10
+ python3 -m engine "https://example.com/" --json
11
+ python3 -m engine "https://example.com/" --device mobile --trace
12
+
13
+ Exit codes:
14
+ 0 strong_ok or weak_ok
15
+ 1 ok=False (all attempts failed)
16
+ 2 CLI arg error, or an unhandled exception raised by fetch()
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+
24
+ from . import fetch
25
+
26
+
27
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``python3 -m engine`` command line.

    Returns:
        A parser with one positional ``url`` plus the optional flags
        ``--selector/-s`` (repeatable, stored as ``selectors``), ``--device``,
        ``--timeout``, ``--max-attempts``, ``--no-playwright``, ``--no-phase0``,
        ``--json``, ``--json-content-limit`` and ``--trace``.
    """
    parser = argparse.ArgumentParser(
        prog="python3 -m engine",
        description="Generic WAF-profile fetch chain.",
    )

    # (flags, add_argument options), registered in declaration order so the
    # generated --help listing keeps its order.
    specs = [
        (("url",), {"help": "URL to fetch."}),
        (
            ("--selector", "-s"),
            {
                "action": "append",
                "default": None,
                "dest": "selectors",
                "metavar": "CSS",
                "help": "Positive-proof CSS selector. Repeatable.",
            },
        ),
        (
            ("--device",),
            {
                "choices": ("auto", "desktop", "mobile"),
                "default": "auto",
                "help": "Device class pin.",
            },
        ),
        (
            ("--timeout",),
            {
                "type": int,
                "default": 25,
                "help": "Per-attempt timeout seconds (default 25).",
            },
        ),
        (
            ("--max-attempts",),
            {
                "type": int,
                "default": None,
                "help": "TOTAL curl-attempt budget. Default: None = exhaustive (honours R6).",
            },
        ),
        (
            ("--no-playwright",),
            {
                "action": "store_true",
                "help": "Skip Playwright fallback (curl-only).",
            },
        ),
        (
            ("--no-phase0",),
            {
                "action": "store_true",
                "help": "Skip the Phase 0 official-API router (generic grid only).",
            },
        ),
        (
            ("--json",),
            {
                "action": "store_true",
                "help": "Emit FetchResult as JSON to stdout with bounded content included.",
            },
        ),
        (
            ("--json-content-limit",),
            {
                "type": int,
                "default": 4_000_000,
                "help": "Maximum content characters to include in --json output (default 4000000).",
            },
        ),
        (
            ("--trace",),
            {
                "action": "store_true",
                "help": "Print per-attempt trace to stderr.",
            },
        ),
    ]
    for flags, options in specs:
        parser.add_argument(*flags, **options)
    return parser
51
+
52
+
53
def main(argv: list[str] | None = None) -> int:
    """Run one fetch from the command line and report the outcome.

    Parses ``argv`` with ``build_parser()``, calls ``fetch`` with the parsed
    options, writes diagnostics to stderr and the payload to stdout (JSON when
    ``--json`` is given, otherwise the raw content).

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``2`` if ``fetch`` raised, otherwise ``0`` when ``result.ok`` is truthy
        and ``1`` when it is not.
    """
    opts = build_parser().parse_args(argv)

    def emit(message: object) -> None:
        # Diagnostics go to stderr so stdout carries only content / JSON.
        print(message, file=sys.stderr)

    try:
        outcome = fetch(
            opts.url,
            success_selectors=opts.selectors,
            device_class=opts.device,
            timeout=opts.timeout,
            max_attempts=opts.max_attempts,
            enable_playwright=not opts.no_playwright,
            enable_phase0=not opts.no_phase0,
        )
    except Exception as exc:
        # Top-level boundary: report any engine failure and exit non-zero.
        emit(f"engine fatal: {type(exc).__name__}: {exc}")
        return 2

    if opts.trace:
        emit("=== trace ===")
        for attempt in outcome.trace:
            row = attempt.to_dict()
            imp = row.get("impersonate") or "-"
            ref = row.get("referer") or "-"
            emit(
                f"[{row['phase']:<8}] {row['executor']:<18} "
                f"xform={row['url_transform']:<16} imp={imp:<14} ref={ref:<14} "
                f"status={row['status']:>4} size={row['body_size']:>8} "
                f"verdict={row['verdict']} {('err=' + row['error'][:60]) if row.get('error') else ''}"
            )
        emit(f"=== summary: {outcome.summary} ===")

    # R7 hint (API-first route): shown whenever the summary mentions it,
    # independent of --trace, because it is actionable guidance.
    if "R7 API-first" in (outcome.summary or ""):
        emit(
            "\n════════════════════════════════════════════════════════════════\n"
            "⚠️ R7 triggered — consider API-first route instead of HTML grid.\n"
            " See summary below (or re-run with --trace for full attempt log).\n"
            "════════════════════════════════════════════════════════════════"
        )
        # The full summary carries the hint text itself.
        emit(outcome.summary)

    # Failure gate (R6): on failure, list the routes the engine could not run
    # by itself so the caller keeps escalating instead of reporting "blocked".
    needs_escalation = outcome.untried_routes or outcome.must_invoke_playwright_mcp
    if not outcome.ok and needs_escalation:
        emit(
            "\n════════════════════════════════════════════════════════════════\n"
            "⛔ NOT EXHAUSTED — do not declare failure yet (R6).\n"
            f" grid_exhausted={outcome.grid_exhausted} stop_reason={outcome.stop_reason}\n"
            " Routes the engine cannot run itself — try these before giving up:"
        )
        for route in outcome.untried_routes:
            emit(f" • {route}")
        if outcome.must_invoke_playwright_mcp:
            emit(" ➜ must_invoke_playwright_mcp = TRUE — drive MCP Playwright from the agent session.")
        emit("════════════════════════════════════════════════════════════════")

    if opts.json:
        payload = outcome.to_dict(include_content=True, content_limit=opts.json_content_limit)
        print(json.dumps(payload, ensure_ascii=False, indent=2))
    else:
        # Default mode: HTML to stdout, one status line to stderr.
        print(outcome.content, end="")
        emit(
            f"\n[engine] ok={outcome.ok} verdict={outcome.verdict} "
            f"profile={outcome.profile_used} attempts={len(outcome.trace)}"
        )

    if outcome.ok:
        return 0
    return 1
125
+
126
+
127
+ if __name__ == "__main__":
128
+ sys.exit(main())
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env python3
2
+ """No-Site-Name Rule checker.
3
+
4
+ Run in CI / pre-commit. Scans engine/** for hard-coded site names or
5
+ domains that would bias the generic fetch chain toward one site.
6
+
7
+ Exit code 0 if clean, 1 if violations found.
8
+
9
+ python3 engine/bias_check.py
10
+ python3 engine/bias_check.py --strict # also check references/*.md (usually off)
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import os
16
+ import re
17
+ import sys
18
+ from pathlib import Path
19
+
20
+
21
+ # Known brand / domain substrings that should NOT appear in engine code.
22
+ # This is a non-exhaustive deny list. CI should treat hits as warnings that
23
+ # require human review; false positives (e.g. "github" in comments) can be
24
+ # allowed via EXPLICIT_ALLOW_FILES or a per-line marker from COMMENT_OK_MARKERS.
25
+ BRAND_SUBSTRINGS = [
26
+ "coupang", "11st", "11번가", "musinsa", "무신사",
27
+ "fmkorea", "에펨코리아", "dcinside", "디시인사이드",
28
+ "ohou", "오늘의집", "kurly", "마켓컬리",
29
+ "daangn", "당근",
30
+ # Naver is allowed in Phase 0 references (official APIs) but not in engine code.
31
+ "naver.com", "blog.naver", "shopping.naver",
32
+ # Korean portal brand names
33
+ "daum.net", "kakao.com",
34
+ ]
35
+
36
+ # Regex for bare URLs / domains. Used as a secondary pass to flag hardcoded
37
+ # site hosts that slipped past the brand denylist.
38
+ URL_PATTERN = re.compile(
39
+ r"https?://[\w\.-]+|[\w-]+\.(?:com|net|org|co\.kr|kr|io)\b",
40
+ re.IGNORECASE,
41
+ )
42
+
43
+ # Generic / neutral hosts that are allowed anywhere (examples, specs, stdlib,
44
+ # and domains that legitimately appear as non-site-specific referrers / test
45
+ # fixtures — Google search as a generic Referer strategy, httpbin for transport
46
+ # tests, etc.). Anything in this set must be provably unrelated to a specific
47
+ # target-site preference.
48
+ URL_ALLOWLIST = {
49
+ "example.com", "example.org", "example.net",
50
+ "localhost", "127.0.0.1",
51
+ # Official API / documentation sources cited in code comments.
52
+ "curl.se", "playwright.dev", "nodejs.org", "npmjs.com",
53
+ # Generic Referer strategy target (used as a neutral off-site referer).
54
+ "www.google.com", "google.com",
55
+ # Generic HTTP test endpoint for infrastructure / transport tests.
56
+ "httpbin.org",
57
+ }
58
+
59
+ # Files / dirs that must be clean.
60
+ SCAN_ROOTS_STRICT_OFF = ["engine"]
61
+ SCAN_ROOTS_STRICT_ON = ["engine", "references"]
62
+
63
+ # Directory names skipped during scan (third-party code, build artefacts).
64
+ # `tests` is excluded because test fixtures legitimately use concrete hosts and
65
+ # IP literals (e.g. SSRF/redirect cases, per-host session keys) — same exemption
66
+ # rationale as SKILL.md examples; tests are not the generic fetch path.
67
+ EXCLUDED_DIR_NAMES = {
68
+ "node_modules", "__pycache__", ".git", ".venv", "dist", "build", "tests",
69
+ }
70
+
71
# Comment markers that exempt a line from the scan (for explanatory brand
# mentions). Keyed per file extension; any line containing one of the markers
# registered for its extension is skipped.
#
# Every extension that main() scans needs an entry here, otherwise a line in
# such a file can never be exempted. ".ts" and ".mjs" use the same "//"
# line-comment syntax as ".js", so they share its markers.
COMMENT_OK_MARKERS = {
    ".py": ("# NOTE-BIAS-OK", "# EXAMPLE-ONLY"),
    ".js": ("// NOTE-BIAS-OK", "// EXAMPLE-ONLY"),
    ".mjs": ("// NOTE-BIAS-OK", "// EXAMPLE-ONLY"),
    ".ts": ("// NOTE-BIAS-OK", "// EXAMPLE-ONLY"),
    ".yaml": ("# NOTE-BIAS-OK", "# EXAMPLE-ONLY"),
    ".yml": ("# NOTE-BIAS-OK", "# EXAMPLE-ONLY"),
    ".md": ("<!-- NOTE-BIAS-OK -->", "<!-- EXAMPLE-ONLY -->"),
}
80
+
81
+ # File paths explicitly exempted (full match against relative path from scan root).
82
+ EXPLICIT_ALLOW_FILES = {
83
+ # Phase 0 official-API router. Per SKILL.md R5, naming platform hosts here is
84
+ # the SANCTIONED exception — these are official no-auth public endpoints, not
85
+ # a bias toward one target. This is the ONLY engine/ file allowed to do so;
86
+ # keeping it isolated is precisely why the rest of engine/ stays site-agnostic.
87
+ # NOTE: rel paths are computed against skill_root.parent, so they include the
88
+ # skill dir name (e.g. "insane-search/engine/phase0.py").
89
+ "insane-search/engine/phase0.py",
90
+ }
91
+
92
+
93
def _line_is_exempt(line: str, ext: str) -> bool:
    """Report whether *line* contains an exemption marker registered for *ext*.

    Extensions without an entry in ``COMMENT_OK_MARKERS`` have no markers, so
    their lines are never exempt.
    """
    for marker in COMMENT_OK_MARKERS.get(ext, ()):
        if marker in line:
            return True
    return False
96
+
97
+
98
def _scan_file(path: Path, root: Path) -> list[str]:
    """Return the list of violation strings for one file.

    Args:
        path: File to scan.
        root: Skill root; reported paths are relative to ``root.parent``.

    Returns:
        One ``"<rel>:<lineno> — ..."`` entry per offending line (at most one
        per line), a single ``"<rel>:0 — read error: ..."`` entry when the
        file cannot be read, or an empty list when the file is clean or listed
        in ``EXPLICIT_ALLOW_FILES``.
    """
    rel = path.relative_to(root.parent)
    # Compare in POSIX form: EXPLICIT_ALLOW_FILES entries use "/" separators,
    # whereas str(rel) uses "\\" on Windows and would never match.
    if rel.as_posix() in EXPLICIT_ALLOW_FILES:
        return []

    ext = path.suffix.lower()
    try:
        text = path.read_text(encoding="utf-8", errors="ignore")
    except Exception as e:
        # Surface unreadable files as a finding instead of aborting the scan.
        return [f"{rel}:0 — read error: {e}"]

    violations: list[str] = []
    for lineno, line in enumerate(text.splitlines(), start=1):
        if _line_is_exempt(line, ext):
            continue
        lowered = line.lower()

        # 1) Brand / domain denylist: first matching entry wins.
        hit_brand = next(
            (brand for brand in BRAND_SUBSTRINGS if brand.lower() in lowered),
            None,
        )
        if hit_brand:
            violations.append(f"{rel}:{lineno} — brand `{hit_brand}` in: {line.strip()[:120]}")
            continue  # one violation per line

        # 2) URL/domain regex scan: catches hosts that aren't in the denylist.
        for match in URL_PATTERN.finditer(line):
            host = match.group(0).lower()
            host = host.split("//", 1)[-1].split("/", 1)[0]
            # The URL pattern accepts dots greedily, so a URL that ends a
            # sentence ("see https://example.com.") keeps the full stop;
            # drop it before the allowlist lookup.
            host = host.rstrip(".")
            if host in URL_ALLOWLIST:
                continue
            # Subdomains of the reserved example domains are as neutral as the
            # bare domains listed in URL_ALLOWLIST.
            if host.endswith((".example.com", ".example.org", ".example.net")):
                continue
            violations.append(f"{rel}:{lineno} — hardcoded host `{host}` in: {line.strip()[:120]}")
            break  # one violation per line
    return violations
135
+
136
+
137
def main(argv: list[str] | None = None) -> int:
    """Scan the configured roots for site-name bias and print a report.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.
            ``--strict`` switches from ``SCAN_ROOTS_STRICT_OFF`` to
            ``SCAN_ROOTS_STRICT_ON``; ``--root`` overrides the skill root, which
            otherwise is ``Path(__file__).parent.parent``.

    Returns:
        ``1`` when at least one violation was found, ``0`` otherwise.
    """
    cli = argparse.ArgumentParser(description="Scan engine for site-name bias")
    cli.add_argument(
        "--strict",
        action="store_true",
        help="Also scan references/*.md (usually noisy — off by default)",
    )
    cli.add_argument(
        "--root",
        default=None,
        help="Skill root directory. Defaults to parent of this file.",
    )
    opts = cli.parse_args(argv)

    if opts.root:
        skill_root = Path(opts.root)
    else:
        skill_root = Path(__file__).parent.parent

    if opts.strict:
        scan_roots = SCAN_ROOTS_STRICT_ON
    else:
        scan_roots = SCAN_ROOTS_STRICT_OFF

    findings: list[str] = []
    scanned = 0
    for name in scan_roots:
        base = skill_root / name
        if base.exists():
            for dirpath, dirnames, filenames in os.walk(base):
                # Slice-assign so os.walk itself skips the excluded subtrees.
                dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIR_NAMES]
                for fname in filenames:
                    candidate = Path(dirpath) / fname
                    has_scanned_suffix = candidate.suffix.lower() in (
                        ".py", ".js", ".yaml", ".yml", ".md", ".ts", ".mjs",
                    )
                    # This checker lists the brands itself, so it is exempt.
                    if has_scanned_suffix and candidate.name != "bias_check.py":
                        scanned += 1
                        findings.extend(_scan_file(candidate, skill_root))

    print(f"[bias-check] scanned {scanned} files under {skill_root}")
    if not findings:
        print("[bias-check] ✅ clean")
        return 0

    print(f"[bias-check] ❌ {len(findings)} violation(s):")
    for finding in findings:
        print(f" - {finding}")
    print()
    print("Fix options:")
    print(" 1) Remove the brand name (preferred)")
    print(" 2) If genuinely explanatory, add '# NOTE-BIAS-OK' on the same line")
    print(" 3) If this is a Phase 0 official API reference, move it to references/*.md and rerun without --strict")
    return 1
180
+
181
+
182
+ if __name__ == "__main__":
183
+ sys.exit(main())