@gajae-code/coding-agent 0.7.1 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +57 -0
- package/dist/types/cli/mcp-cli.d.ts +25 -0
- package/dist/types/cli/notify-cli.d.ts +2 -0
- package/dist/types/cli.d.ts +6 -0
- package/dist/types/commands/mcp.d.ts +70 -0
- package/dist/types/config/keybindings.d.ts +2 -2
- package/dist/types/config/settings-schema.d.ts +39 -2
- package/dist/types/deep-interview/plaintext-gate-guard.d.ts +11 -0
- package/dist/types/extensibility/shared-events.d.ts +1 -0
- package/dist/types/gjc-runtime/ralplan-runtime.d.ts +1 -1
- package/dist/types/lsp/types.d.ts +2 -0
- package/dist/types/modes/components/custom-editor.d.ts +1 -1
- package/dist/types/modes/components/model-selector.d.ts +2 -0
- package/dist/types/modes/components/status-line/git-utils.d.ts +6 -0
- package/dist/types/modes/theme/defaults/index.d.ts +99 -0
- package/dist/types/notifications/attachment-registry.d.ts +17 -0
- package/dist/types/notifications/chat-adapters.d.ts +9 -0
- package/dist/types/notifications/config.d.ts +9 -1
- package/dist/types/notifications/engine.d.ts +59 -0
- package/dist/types/notifications/managed-daemon.d.ts +48 -0
- package/dist/types/notifications/operator-runtime.d.ts +52 -0
- package/dist/types/notifications/telegram-daemon.d.ts +73 -16
- package/dist/types/notifications/threaded-inbound.d.ts +19 -0
- package/dist/types/notifications/threaded-render.d.ts +6 -1
- package/dist/types/notifications/topic-registry.d.ts +2 -0
- package/dist/types/session/agent-session.d.ts +2 -0
- package/dist/types/tools/composer-bash-policy.d.ts +14 -0
- package/dist/types/tools/fetch.d.ts +23 -0
- package/dist/types/tools/index.d.ts +1 -0
- package/dist/types/tools/telegram-send.d.ts +32 -0
- package/dist/types/web/insane/bridge.d.ts +103 -0
- package/dist/types/web/insane/url-guard.d.ts +25 -0
- package/dist/types/web/scrapers/types.d.ts +5 -0
- package/dist/types/web/scrapers/utils.d.ts +7 -1
- package/dist/types/web/search/provider.d.ts +18 -1
- package/dist/types/web/search/providers/insane.d.ts +53 -0
- package/dist/types/web/search/providers/text-citations.d.ts +23 -0
- package/dist/types/web/search/types.d.ts +12 -4
- package/package.json +10 -8
- package/scripts/verify-insane-vendor.ts +132 -0
- package/src/cli/args.ts +1 -1
- package/src/cli/fast-help.ts +1 -1
- package/src/cli/mcp-cli.ts +272 -0
- package/src/cli/notify-cli.ts +152 -5
- package/src/cli.ts +6 -2
- package/src/commands/mcp.ts +117 -0
- package/src/commands/team.ts +1 -1
- package/src/config/keybindings.ts +2 -2
- package/src/config/settings-schema.ts +30 -1
- package/src/deep-interview/plaintext-gate-guard.ts +94 -0
- package/src/defaults/gjc/skills/deep-interview/SKILL.md +4 -3
- package/src/defaults/gjc/skills/ralplan/SKILL.md +11 -4
- package/src/defaults/gjc/skills/team/SKILL.md +3 -2
- package/src/extensibility/extensions/runner.ts +1 -0
- package/src/extensibility/shared-events.ts +1 -0
- package/src/gjc-runtime/launch-tmux.ts +17 -3
- package/src/gjc-runtime/ledger-event-renderer.ts +1 -0
- package/src/gjc-runtime/ralplan-runtime.ts +2 -2
- package/src/gjc-runtime/tmux-common.ts +3 -1
- package/src/gjc-runtime/ultragoal-guard.ts +25 -8
- package/src/gjc-runtime/workflow-manifest.generated.json +29 -0
- package/src/gjc-runtime/workflow-manifest.ts +7 -2
- package/src/hooks/skill-state.ts +57 -0
- package/src/internal-urls/docs-index.generated.ts +14 -11
- package/src/lsp/config.ts +16 -3
- package/src/lsp/defaults.json +7 -0
- package/src/lsp/types.ts +2 -0
- package/src/modes/bridge/bridge-mode.ts +11 -0
- package/src/modes/components/custom-editor.ts +2 -0
- package/src/modes/components/footer.ts +2 -3
- package/src/modes/components/model-selector.ts +12 -0
- package/src/modes/components/status-line/git-utils.ts +25 -0
- package/src/modes/components/status-line.ts +10 -11
- package/src/modes/components/welcome.ts +2 -3
- package/src/modes/controllers/event-controller.ts +15 -0
- package/src/modes/controllers/selector-controller.ts +3 -0
- package/src/modes/interactive-mode.ts +48 -3
- package/src/modes/shared/agent-wire/scopes.ts +1 -1
- package/src/modes/theme/defaults/gruvbox-dark.json +99 -0
- package/src/modes/theme/defaults/index.ts +2 -0
- package/src/modes/utils/context-usage.ts +2 -2
- package/src/notifications/attachment-registry.ts +23 -0
- package/src/notifications/chat-adapters.ts +147 -0
- package/src/notifications/config.ts +23 -2
- package/src/notifications/engine.ts +100 -0
- package/src/notifications/index.ts +180 -38
- package/src/notifications/managed-daemon.ts +163 -0
- package/src/notifications/operator-runtime.ts +171 -0
- package/src/notifications/telegram-daemon.ts +553 -236
- package/src/notifications/threaded-inbound.ts +60 -4
- package/src/notifications/threaded-render.ts +20 -2
- package/src/notifications/topic-registry.ts +5 -0
- package/src/session/agent-session.ts +82 -51
- package/src/slash-commands/helpers/parse.ts +2 -1
- package/src/tools/bash.ts +9 -0
- package/src/tools/composer-bash-policy.ts +96 -0
- package/src/tools/fetch.ts +94 -1
- package/src/tools/index.ts +3 -0
- package/src/tools/telegram-send.ts +137 -0
- package/src/web/insane/bridge.ts +350 -0
- package/src/web/insane/url-guard.ts +159 -0
- package/src/web/scrapers/types.ts +143 -45
- package/src/web/scrapers/utils.ts +70 -19
- package/src/web/search/provider.ts +77 -18
- package/src/web/search/providers/anthropic.ts +70 -3
- package/src/web/search/providers/codex.ts +1 -119
- package/src/web/search/providers/gemini.ts +99 -0
- package/src/web/search/providers/insane.ts +551 -0
- package/src/web/search/providers/openai-compatible.ts +66 -32
- package/src/web/search/providers/text-citations.ts +111 -0
- package/src/web/search/types.ts +13 -2
- package/vendor/insane-search/LICENSE +21 -0
- package/vendor/insane-search/MANIFEST.json +24 -0
- package/vendor/insane-search/engine/__init__.py +23 -0
- package/vendor/insane-search/engine/__main__.py +128 -0
- package/vendor/insane-search/engine/bias_check.py +183 -0
- package/vendor/insane-search/engine/executor.py +254 -0
- package/vendor/insane-search/engine/fetch_chain.py +725 -0
- package/vendor/insane-search/engine/learning.py +175 -0
- package/vendor/insane-search/engine/phase0.py +214 -0
- package/vendor/insane-search/engine/safety.py +91 -0
- package/vendor/insane-search/engine/templates/package.json +11 -0
- package/vendor/insane-search/engine/templates/playwright_mobile_chrome.js +188 -0
- package/vendor/insane-search/engine/templates/playwright_real_chrome.js +243 -0
- package/vendor/insane-search/engine/tests/test_hardening.py +57 -0
- package/vendor/insane-search/engine/tests/test_smoke.py +152 -0
- package/vendor/insane-search/engine/tests/test_u1.py +200 -0
- package/vendor/insane-search/engine/tests/test_u4.py +131 -0
- package/vendor/insane-search/engine/tests/test_u5.py +163 -0
- package/vendor/insane-search/engine/tests/test_u7.py +124 -0
- package/vendor/insane-search/engine/transport.py +211 -0
- package/vendor/insane-search/engine/url_transforms.py +98 -0
- package/vendor/insane-search/engine/validators.py +331 -0
- package/vendor/insane-search/engine/waf_detector.py +214 -0
- package/vendor/insane-search/engine/waf_profiles.yaml +162 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inline citation extraction shared by native web-search providers.
|
|
3
|
+
*
|
|
4
|
+
* Web-search-capable models sometimes return a genuinely grounded answer whose
|
|
5
|
+
* sources are written inline (markdown links or bare URLs) instead of as
|
|
6
|
+
* structured citation annotations. When a provider has independent proof that a
|
|
7
|
+
* web search actually ran, these helpers recover sources from the answer text so
|
|
8
|
+
* the result is not discarded.
|
|
9
|
+
*/
|
|
10
|
+
import type { SearchSource } from "../types";
|
|
11
|
+
|
|
12
|
+
/**
 * Push `source` onto `sources` unless an entry with the same `url` is already present.
 * The array is mutated in place; URLs are compared with strict string equality.
 */
export function addSource(sources: SearchSource[], source: SearchSource): void {
	for (const known of sources) {
		if (known.url === source.url) return;
	}
	sources.push(source);
}
|
|
18
|
+
|
|
19
|
+
/** Count how many times the single character `target` occurs in `text`. */
function countCharacter(text: string, target: string): number {
	let count = 0;
	for (const char of text) {
		if (char === target) count += 1;
	}
	return count;
}

/**
 * Closing delimiters that may trail an extracted URL, mapped to their opener.
 * A trailing closer is stripped only while the URL holds more closers than openers,
 * so balanced pairs that belong to the URL (e.g. `/wiki/Foo_(bar)`) are kept.
 */
const TRAILING_CLOSERS: ReadonlyMap<string, string> = new Map([
	[")", "("],
	["]", "["],
	["}", "{"],
	// Markdown autolinks (`<https://example.com>`) leave a dangling `>` on the match.
	[">", "<"],
]);

/**
 * Strips prose punctuation and unmatched closing delimiters from extracted URLs.
 * Models often return links embedded in markdown or sentence text.
 *
 * @param candidate Raw text believed to contain a URL, possibly with trailing
 *   sentence punctuation, quotes, backticks or closing brackets.
 * @returns The URL serialized by `new URL(...).toString()`, or `null` when the
 *   cleaned text does not start with `http://` / `https://` or fails to parse.
 */
export function normalizeExtractedUrl(candidate: string): string | null {
	let url = candidate.trim();

	while (url.length > 0) {
		const lastCharacter = url.at(-1);
		if (!lastCharacter) break;
		// Sentence punctuation, quotes and inline-code backticks are treated as
		// surrounding prose rather than as the end of the URL.
		if (/[.,!?;:'"`]/u.test(lastCharacter)) {
			url = url.slice(0, -1);
			continue;
		}
		const opener = TRAILING_CLOSERS.get(lastCharacter);
		if (opener !== undefined && countCharacter(url, lastCharacter) > countCharacter(url, opener)) {
			url = url.slice(0, -1);
			continue;
		}
		break;
	}

	// Only absolute http(s) URLs are accepted (lower-case scheme, as before).
	if (!/^https?:\/\//.test(url)) return null;

	try {
		return new URL(url).toString();
	} catch {
		return null;
	}
}
|
|
64
|
+
|
|
65
|
+
/**
 * Locate the `)` that balances the parenthesis run starting at `openParenIndex`.
 *
 * Scanning is confined to a single line: a newline (or a missing character)
 * aborts the search. Nested parentheses are tracked so that URLs containing
 * `(...)` are not cut short.
 *
 * @returns Index of the balancing `)`, or `null` when the line or text ends
 *   first, or when a `)` appears before any `(`.
 */
function findMarkdownLinkUrlEnd(text: string, openParenIndex: number): number | null {
	let open = 0;
	let cursor = openParenIndex;

	while (cursor < text.length) {
		const ch = text[cursor];
		if (!ch || ch === "\n") return null;

		if (ch === "(") {
			open += 1;
		} else if (ch === ")") {
			open -= 1;
			if (open === 0) return cursor;
			if (open < 0) return null;
		}
		cursor += 1;
	}

	return null;
}
|
|
83
|
+
|
|
84
|
+
/**
 * Recover citation sources from free-form answer text.
 *
 * Two passes feed one URL-de-duplicated list: markdown links `[title](url)`
 * first (the link text becomes the title, falling back to the URL when the
 * text is blank), then bare `http(s)://` URLs (titled with the URL itself).
 * Intended as a fallback for when a provider confirms a search ran but omits
 * structured citation annotations.
 */
export function extractTextSources(text: string): SearchSource[] {
	const collected: SearchSource[] = [];

	// Pass 1: markdown links.
	let cursor = 0;
	while (cursor < text.length) {
		const bracketStart = cursor;
		cursor += 1;
		if (text[bracketStart] !== "[") continue;

		const bracketEnd = text.indexOf("]", bracketStart + 1);
		const parenStart = bracketEnd + 1;
		if (bracketEnd === -1 || text[parenStart] !== "(") continue;

		const parenEnd = findMarkdownLinkUrlEnd(text, parenStart);
		if (parenEnd === null) continue;

		const label = text.slice(bracketStart + 1, bracketEnd).trim();
		const href = normalizeExtractedUrl(text.slice(parenStart + 1, parenEnd));
		if (href) addSource(collected, { title: label || href, url: href });

		// Resume right after the closing parenthesis of the link just consumed.
		cursor = parenEnd + 1;
	}

	// Pass 2: bare URLs anywhere in the text; repeats of pass-1 URLs are dropped by addSource.
	for (const match of text.matchAll(/https?:\/\/\S+/g)) {
		const href = normalizeExtractedUrl(match[0] ?? "");
		if (href) addSource(collected, { title: href, url: href });
	}

	return collected;
}
|
package/src/web/search/types.ts
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
/** Supported web search providers */
|
|
8
8
|
export type SearchProviderId =
|
|
9
9
|
| "duckduckgo"
|
|
10
|
+
| "insane"
|
|
10
11
|
| "exa"
|
|
11
12
|
| "brave"
|
|
12
13
|
| "jina"
|
|
@@ -38,6 +39,7 @@ export interface ActiveSearchModelContext {
|
|
|
38
39
|
|
|
39
40
|
export const CONFIGURABLE_SEARCH_PROVIDER_IDS = [
|
|
40
41
|
"duckduckgo",
|
|
42
|
+
"insane",
|
|
41
43
|
"exa",
|
|
42
44
|
"brave",
|
|
43
45
|
"jina",
|
|
@@ -162,6 +164,15 @@ export interface AnthropicCitation {
|
|
|
162
164
|
encrypted_index: string;
|
|
163
165
|
}
|
|
164
166
|
|
|
167
|
+
/**
 * Error payload returned in `web_search_tool_result.content` when a server-side
 * web search fails. Unlike the success case, this is an object, not an array.
 */
export interface AnthropicWebSearchToolResultError {
	/** Literal discriminator marking this payload as a web-search error object. */
	type: "web_search_tool_result_error";
	/** Optional error code string; the set of possible values is not defined here. */
	error_code?: string;
}
|
|
175
|
+
|
|
165
176
|
export interface AnthropicContentBlock {
|
|
166
177
|
type: string;
|
|
167
178
|
/** Text content (for type="text") */
|
|
@@ -172,8 +183,8 @@ export interface AnthropicContentBlock {
|
|
|
172
183
|
name?: string;
|
|
173
184
|
/** Tool input (for type="server_tool_use") */
|
|
174
185
|
input?: { query: string };
|
|
175
|
-
/** Search results (
|
|
176
|
-
content?: AnthropicSearchResult[];
|
|
186
|
+
/** Search results array on success, or an error object on failure (type="web_search_tool_result") */
|
|
187
|
+
content?: AnthropicSearchResult[] | AnthropicWebSearchToolResultError;
|
|
177
188
|
}
|
|
178
189
|
|
|
179
190
|
export interface AnthropicApiResponse {
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 fivetaku
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"upstream": {
|
|
3
|
+
"repo": "https://github.com/fivetaku/insane-search",
|
|
4
|
+
"commit": "49306346b59aa89b5e96d98e1104da0890deed72",
|
|
5
|
+
"license": "MIT"
|
|
6
|
+
},
|
|
7
|
+
"vendoredAt": "2026-06-23",
|
|
8
|
+
"includedPaths": [
|
|
9
|
+
"skills/insane-search/engine/**",
|
|
10
|
+
"skills/insane-search/engine/templates/**",
|
|
11
|
+
"LICENSE"
|
|
12
|
+
],
|
|
13
|
+
"excludedPaths": [
|
|
14
|
+
"setup/setup.sh",
|
|
15
|
+
"setup/gptaku-update-check.cjs",
|
|
16
|
+
".claude-plugin/**",
|
|
17
|
+
"skills/insane-search/SKILL.md",
|
|
18
|
+
"skills/insane-search/references/**",
|
|
19
|
+
"skills/insane-search/tests/**"
|
|
20
|
+
],
|
|
21
|
+
"exclusionRationale": "Excludes upstream install hooks (SessionStart settings.json mutation), GitHub star-baiting (gh api user/starred), the update-notifier, and the past-session transcript-language scanner. Only the runtime Phase 0-3 engine and its Playwright/stealth templates are vendored.",
|
|
22
|
+
"localPatches": [],
|
|
23
|
+
"notes": "Runtime engine is invoked via `python3 -m engine \"<url>\" --json` with cwd=this directory and PYTHONPATH pointed at this directory. Phase 0-2 require python3 + curl_cffi; Phase 3 requires node + playwright/playwright-extra/puppeteer-extra-plugin-stealth installed under engine/templates. GJC never auto-installs these dependencies."
|
|
24
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""insane-search engine — generic WAF-profile-based fetch chain.
|
|
2
|
+
|
|
3
|
+
No site-specific logic lives here. Site specifics belong to runtime hints or
|
|
4
|
+
observations, never to code. See `../SKILL.md` for the No-Site-Name Rule.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .validators import Verdict, ValidationResult, validate, CHALLENGE_MARKERS
|
|
8
|
+
from .waf_detector import detect
|
|
9
|
+
from .url_transforms import TRANSFORMS, apply_transform
|
|
10
|
+
from .fetch_chain import fetch, FetchResult, Attempt
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"Verdict",
|
|
14
|
+
"ValidationResult",
|
|
15
|
+
"validate",
|
|
16
|
+
"CHALLENGE_MARKERS",
|
|
17
|
+
"detect",
|
|
18
|
+
"TRANSFORMS",
|
|
19
|
+
"apply_transform",
|
|
20
|
+
"fetch",
|
|
21
|
+
"FetchResult",
|
|
22
|
+
"Attempt",
|
|
23
|
+
]
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""CLI entrypoint for the insane-search engine.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python3 -m engine URL [--selector CSS] [--device auto|desktop|mobile]
|
|
6
|
+
[--timeout N] [--max-attempts N] [--json] [--trace]
|
|
7
|
+
|
|
8
|
+
Examples:
|
|
9
|
+
python3 -m engine "https://example.com/" --selector "h1"
|
|
10
|
+
python3 -m engine "https://example.com/" --json
|
|
11
|
+
python3 -m engine "https://example.com/" --device mobile --trace
|
|
12
|
+
|
|
13
|
+
Exit codes:
|
|
14
|
+
0 strong_ok or weak_ok
|
|
15
|
+
1 ok=False (all attempts failed)
|
|
16
|
+
2 CLI arg error
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import sys
|
|
23
|
+
|
|
24
|
+
from . import fetch
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for ``python3 -m engine``.

    Defines one positional ``url`` plus options for proof selectors, device
    class, per-attempt timeout, attempt budget, the Playwright / Phase 0
    toggles, JSON output (with a content size cap) and tracing.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="python3 -m engine",
        description="Generic WAF-profile fetch chain.",
    )
    add = parser.add_argument

    add("url", help="URL to fetch.")
    add(
        "--selector",
        "-s",
        dest="selectors",
        metavar="CSS",
        action="append",
        default=None,
        help="Positive-proof CSS selector. Repeatable.",
    )
    add(
        "--device",
        default="auto",
        choices=("auto", "desktop", "mobile"),
        help="Device class pin.",
    )
    add(
        "--timeout",
        type=int,
        default=25,
        help="Per-attempt timeout seconds (default 25).",
    )
    add(
        "--max-attempts",
        type=int,
        default=None,
        help="TOTAL curl-attempt budget. Default: None = exhaustive (honours R6).",
    )
    add(
        "--no-playwright",
        action="store_true",
        help="Skip Playwright fallback (curl-only).",
    )
    add(
        "--no-phase0",
        action="store_true",
        help="Skip the Phase 0 official-API router (generic grid only).",
    )
    add(
        "--json",
        action="store_true",
        help="Emit FetchResult as JSON to stdout with bounded content included.",
    )
    add(
        "--json-content-limit",
        type=int,
        default=4_000_000,
        help="Maximum content characters to include in --json output (default 4000000).",
    )
    add(
        "--trace",
        action="store_true",
        help="Print per-attempt trace to stderr.",
    )
    return parser
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the engine CLI: fetch the requested URL and report the result.

    Diagnostics (trace, hints, status line) are written to stderr; the fetched
    content, or its JSON form when ``--json`` is given, is written to stdout.

    Args:
        argv: Argument list to parse; ``None`` is passed through to argparse.

    Returns:
        0 when ``result.ok`` is truthy, 1 when it is not, and 2 when ``fetch``
        raised an exception.
    """
    args = build_parser().parse_args(argv)
    try:
        result = fetch(
            args.url,
            success_selectors=args.selectors,
            device_class=args.device,
            timeout=args.timeout,
            max_attempts=args.max_attempts,
            enable_playwright=not args.no_playwright,
            enable_phase0=not args.no_phase0,
        )
    except Exception as e:
        # Any exception from the engine is reported on stderr and mapped to exit code 2.
        print(f"engine fatal: {type(e).__name__}: {e}", file=sys.stderr)
        return 2

    if args.trace:
        # One fixed-width line per attempt, followed by the overall summary.
        print("=== trace ===", file=sys.stderr)
        for att in result.trace:
            d = att.to_dict()
            # Missing/empty impersonate and referer values are shown as "-".
            imp = d.get("impersonate") or "-"
            ref = d.get("referer") or "-"
            print(
                f"[{d['phase']:<8}] {d['executor']:<18} "
                f"xform={d['url_transform']:<16} imp={imp:<14} ref={ref:<14} "
                f"status={d['status']:>4} size={d['body_size']:>8} "
                f"verdict={d['verdict']} {('err=' + d['error'][:60]) if d.get('error') else ''}",
                file=sys.stderr,
            )
        print(f"=== summary: {result.summary} ===", file=sys.stderr)

    # Surface R7 hint (API-first route) prominently when summary contains it,
    # regardless of --trace flag — this is actionable guidance, not noise.
    if "R7 API-first" in (result.summary or ""):
        print(
            "\n════════════════════════════════════════════════════════════════\n"
            "⚠️ R7 triggered — consider API-first route instead of HTML grid.\n"
            " See summary below (or re-run with --trace for full attempt log).\n"
            "════════════════════════════════════════════════════════════════",
            file=sys.stderr,
        )
        # Also print the full summary (which includes the hint) so caller sees it.
        print(result.summary, file=sys.stderr)

    # Failure gate (R6): giving up is NOT permission to stop. Surface the routes
    # the engine could not run by itself so the caller keeps escalating instead
    # of reporting "blocked" prematurely (the exact bug this hardening fixes).
    if not result.ok and (result.untried_routes or result.must_invoke_playwright_mcp):
        print(
            "\n════════════════════════════════════════════════════════════════\n"
            "⛔ NOT EXHAUSTED — do not declare failure yet (R6).\n"
            f" grid_exhausted={result.grid_exhausted} stop_reason={result.stop_reason}\n"
            " Routes the engine cannot run itself — try these before giving up:",
            file=sys.stderr,
        )
        for r in result.untried_routes:
            print(f" • {r}", file=sys.stderr)
        if result.must_invoke_playwright_mcp:
            print(" ➜ must_invoke_playwright_mcp = TRUE — drive MCP Playwright from the agent session.", file=sys.stderr)
        print("════════════════════════════════════════════════════════════════", file=sys.stderr)

    if args.json:
        # JSON mode: serialize the result, with content capped at --json-content-limit.
        payload = result.to_dict(include_content=True, content_limit=args.json_content_limit)
        print(json.dumps(payload, ensure_ascii=False, indent=2))
    else:
        # Default: HTML to stdout, status to stderr.
        print(result.content, end="")
        print(f"\n[engine] ok={result.ok} verdict={result.verdict} "
              f"profile={result.profile_used} attempts={len(result.trace)}",
              file=sys.stderr)

    return 0 if result.ok else 1
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Script entry point: run the CLI and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""No-Site-Name Rule checker.
|
|
3
|
+
|
|
4
|
+
Run in CI / pre-commit. Scans engine/** for hard-coded site names or
|
|
5
|
+
domains that would bias the generic fetch chain toward one site.
|
|
6
|
+
|
|
7
|
+
Exit code 0 if clean, 1 if violations found.
|
|
8
|
+
|
|
9
|
+
python3 engine/bias_check.py
|
|
10
|
+
python3 engine/bias_check.py --strict # also check references/*.md (usually off)
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Known brand / domain substrings that should NOT appear in engine code.
# This is a non-exhaustive deny list. CI should treat hits as warnings that
# require human review; false positives (e.g. "github" in comments) can be
# whitelisted via EXPLICIT_ALLOW.
BRAND_SUBSTRINGS = [
    "coupang", "11st", "11번가", "musinsa", "무신사",
    "fmkorea", "에펨코리아", "dcinside", "디시인사이드",
    "ohou", "오늘의집", "kurly", "마켓컬리",
    "daangn", "당근",
    # Naver is allowed in Phase 0 references (official APIs) but not in engine code.
    "naver.com", "blog.naver", "shopping.naver",
    # Korean portal brand names
    "daum.net", "kakao.com",
]

# Regex for bare URLs / domains. Used as a secondary pass to flag hardcoded
# site hosts that slipped past the brand denylist.
URL_PATTERN = re.compile(
    r"https?://[\w\.-]+|[\w-]+\.(?:com|net|org|co\.kr|kr|io)\b",
    re.IGNORECASE,
)

# Generic / neutral hosts that are allowed anywhere (examples, specs, stdlib,
# and domains that legitimately appear as non-site-specific referrers / test
# fixtures — Google search as a generic Referer strategy, httpbin for transport
# tests, etc.). Anything in this set must be provably unrelated to a specific
# target-site preference.
URL_ALLOWLIST = {
    "example.com", "example.org", "example.net",
    "localhost", "127.0.0.1",
    # Official API / documentation sources cited in code comments.
    "curl.se", "playwright.dev", "nodejs.org", "npmjs.com",
    # Generic Referer strategy target (used as a neutral off-site referer).
    "www.google.com", "google.com",
    # Generic HTTP test endpoint for infrastructure / transport tests.
    "httpbin.org",
}

# Files / dirs that must be clean.
SCAN_ROOTS_STRICT_OFF = ["engine"]
SCAN_ROOTS_STRICT_ON = ["engine", "references"]

# Directory names skipped during scan (third-party code, build artefacts).
# `tests` is excluded because test fixtures legitimately use concrete hosts and
# IP literals (e.g. SSRF/redirect cases, per-host session keys) — same exemption
# rationale as SKILL.md examples; tests are not the generic fetch path.
EXCLUDED_DIR_NAMES = {
    "node_modules", "__pycache__", ".git", ".venv", "dist", "build", "tests",
}

# Comment markers within which a brand mention is OK (explanation).
# Keyed per-extension; any line containing these is skipped.
# Every extension the scanner reads needs an entry here, otherwise lines in
# such files can never be exempted; `.ts` / `.mjs` use the `//` form like `.js`.
COMMENT_OK_MARKERS = {
    ".py": ("# NOTE-BIAS-OK", "# EXAMPLE-ONLY"),
    ".js": ("// NOTE-BIAS-OK", "// EXAMPLE-ONLY"),
    ".mjs": ("// NOTE-BIAS-OK", "// EXAMPLE-ONLY"),
    ".ts": ("// NOTE-BIAS-OK", "// EXAMPLE-ONLY"),
    ".yaml": ("# NOTE-BIAS-OK", "# EXAMPLE-ONLY"),
    ".yml": ("# NOTE-BIAS-OK", "# EXAMPLE-ONLY"),
    ".md": ("<!-- NOTE-BIAS-OK -->", "<!-- EXAMPLE-ONLY -->"),
}

# File paths explicitly exempted (full match against relative path from scan root).
EXPLICIT_ALLOW_FILES = {
    # Phase 0 official-API router. Per SKILL.md R5, naming platform hosts here is
    # the SANCTIONED exception — these are official no-auth public endpoints, not
    # a bias toward one target. This is the ONLY engine/ file allowed to do so;
    # keeping it isolated is precisely why the rest of engine/ stays site-agnostic.
    # NOTE: rel paths are computed against skill_root.parent, so they include the
    # skill dir name (e.g. "insane-search/engine/phase0.py").
    "insane-search/engine/phase0.py",
}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _line_is_exempt(line: str, ext: str) -> bool:
    """Return True when ``line`` contains an opt-out marker registered for ``ext``.

    Extensions without an entry in ``COMMENT_OK_MARKERS`` are never exempt.
    """
    for marker in COMMENT_OK_MARKERS.get(ext, ()):
        if marker in line:
            return True
    return False
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _scan_file(path: Path, root: Path) -> list[str]:
    """Return list of violation strings for this file.

    Each line is checked first against the brand denylist and then, if no
    brand matched, against the URL/domain regex. At most one violation is
    reported per line. Lines carrying an exemption marker for the file's
    extension are skipped, and files listed in ``EXPLICIT_ALLOW_FILES`` are
    skipped entirely.

    Args:
        path: File to scan.
        root: Skill root; reported paths are relative to ``root.parent`` so
            they include the skill directory name.

    Returns:
        Human-readable violation strings; empty when the file is clean or
        explicitly allowed. A read failure is reported as a single entry.
    """
    rel = path.relative_to(root.parent)
    # EXPLICIT_ALLOW_FILES holds forward-slash paths. str(rel) uses backslashes
    # on Windows and would never match, so compare the POSIX form instead.
    if rel.as_posix() in EXPLICIT_ALLOW_FILES:
        return []

    ext = path.suffix.lower()
    try:
        text = path.read_text(encoding="utf-8", errors="ignore")
    except Exception as e:
        return [f"{rel}:0 — read error: {e}"]

    violations: list[str] = []
    for lineno, line in enumerate(text.splitlines(), start=1):
        if _line_is_exempt(line, ext):
            continue
        lowered = line.lower()
        # 1) Brand / domain denylist (case-insensitive substring match).
        hit_brand = next(
            (brand for brand in BRAND_SUBSTRINGS if brand.lower() in lowered),
            None,
        )
        if hit_brand:
            violations.append(f"{rel}:{lineno} — brand `{hit_brand}` in: {line.strip()[:120]}")
            continue  # one violation per line
        # 2) URL/domain regex scan — catches hosts that aren't in the denylist.
        for match in URL_PATTERN.finditer(line):
            host = match.group(0).lower()
            # Drop any scheme prefix and path suffix so only the host remains.
            host = host.split("//", 1)[-1].split("/", 1)[0]
            if host in URL_ALLOWLIST:
                continue
            if host.endswith(".example.com") or host.endswith(".example.org"):
                continue
            violations.append(f"{rel}:{lineno} — hardcoded host `{host}` in: {line.strip()[:120]}")
            break
    return violations
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def main(argv: list[str] | None = None) -> int:
    """Scan the skill tree for site-name bias and print a report.

    Args:
        argv: Argument list to parse; ``None`` is passed through to argparse.

    Returns:
        0 when no violations were found, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Scan engine for site-name bias")
    parser.add_argument("--strict", action="store_true",
                        help="Also scan references/*.md (usually noisy — off by default)")
    parser.add_argument("--root", default=None,
                        help="Skill root directory. Defaults to parent of this file.")
    args = parser.parse_args(argv)

    # Resolve to an absolute path. With a relative root such as "." the parent
    # of the root is the root itself, so reported paths would lose the skill
    # directory name and EXPLICIT_ALLOW_FILES entries could never match.
    skill_root = (Path(args.root) if args.root else Path(__file__).parent.parent).resolve()
    scan_roots = SCAN_ROOTS_STRICT_ON if args.strict else SCAN_ROOTS_STRICT_OFF
    scanned_suffixes = (".py", ".js", ".yaml", ".yml", ".md", ".ts", ".mjs")

    total_violations: list[str] = []
    scanned = 0
    for name in scan_roots:
        root = skill_root / name
        if not root.exists():
            continue
        for dirpath, dirnames, filenames in os.walk(root):
            # In-place filter so os.walk skips these subtrees.
            dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIR_NAMES]
            for fname in filenames:
                p = Path(dirpath) / fname
                if p.suffix.lower() not in scanned_suffixes:
                    continue
                if p.name == "bias_check.py":
                    continue  # self-exempt (this file lists the brands)
                scanned += 1
                total_violations.extend(_scan_file(p, skill_root))

    print(f"[bias-check] scanned {scanned} files under {skill_root}")
    if total_violations:
        print(f"[bias-check] ❌ {len(total_violations)} violation(s):")
        for v in total_violations:
            print(f" - {v}")
        print()
        print("Fix options:")
        print(" 1) Remove the brand name (preferred)")
        print(" 2) If genuinely explanatory, add '# NOTE-BIAS-OK' on the same line")
        print(" 3) If this is a Phase 0 official API reference, move it to references/*.md and rerun without --strict")
        return 1

    print("[bias-check] ✅ clean")
    return 0
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# Script entry point: run the checker and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|