shellward 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ // src/rules/overseas-llm.ts — 境外大模型端点识别(数据出境检测)
2
+ //
3
+ // 中国差异化能力:识别请求是否指向「境外大模型 / 境外 AI 服务」端点。
4
+ // 在中国监管下,向境外大模型 API 发送个人信息/重要数据 = 数据出境,受
5
+ // 《促进数据跨境流动规定》《数据出境安全评估办法》约束。英文工具没有
6
+ // "出境"概念,这是 ShellWard 面向中国市场的护城河之一。
7
+ //
8
+ // 用途:
9
+ // - 体检引擎据此判断"是否存在数据出境风险"
10
+ // - 网关层据此标记"数据出境"事件 / 触发出境前脱敏(路线图)
11
+ //
12
+ // 边界:仅做端点归属判断(不做 IP 归属解析),覆盖主流境外大模型与聚合网关。
13
+
14
/** One overseas LLM / AI-service provider together with the host patterns that identify it. */
export interface OverseasEndpoint {
  /** Stable identifier of the provider; reported to callers as `endpointId`. */
  id: string
  /**
   * Host patterns (lowercase, without port). A hit on any one of them marks
   * the inspected text as pointing at an overseas LLM endpoint.
   */
  hosts: string[]
  /** Provider display name for Chinese output. */
  provider_zh: string
  /** Provider display name for English output. */
  provider_en: string
}

/**
 * Known overseas LLM providers and aggregation gateways.
 * Order matters to the detector: the first entry with a matching pattern wins.
 */
export const OVERSEAS_LLM_ENDPOINTS: OverseasEndpoint[] = [
  { id: 'openai', hosts: ['api.openai.com', 'openai.com'], provider_zh: 'OpenAI', provider_en: 'OpenAI' },
  { id: 'anthropic', hosts: ['api.anthropic.com', 'anthropic.com'], provider_zh: 'Anthropic Claude', provider_en: 'Anthropic Claude' },
  { id: 'google', hosts: ['generativelanguage.googleapis.com', 'aiplatform.googleapis.com'], provider_zh: 'Google Gemini', provider_en: 'Google Gemini' },
  { id: 'azure-openai', hosts: ['openai.azure.com'], provider_zh: 'Azure OpenAI', provider_en: 'Azure OpenAI' },
  // Only Bedrock-specific labels are listed. The bare `amazonaws.com` suffix is
  // deliberately absent: it is shared by every AWS service (S3, EC2, ...) and is
  // a substring of the China-partition domain `amazonaws.com.cn`, so it would
  // report unrelated or in-country traffic as AWS Bedrock.
  { id: 'aws-bedrock', hosts: ['bedrock-runtime', 'bedrock.'], provider_zh: 'AWS Bedrock', provider_en: 'AWS Bedrock' },
  { id: 'cohere', hosts: ['api.cohere.ai', 'api.cohere.com'], provider_zh: 'Cohere', provider_en: 'Cohere' },
  { id: 'mistral', hosts: ['api.mistral.ai'], provider_zh: 'Mistral AI', provider_en: 'Mistral AI' },
  { id: 'groq', hosts: ['api.groq.com'], provider_zh: 'Groq', provider_en: 'Groq' },
  { id: 'together', hosts: ['api.together.xyz', 'api.together.ai'], provider_zh: 'Together AI', provider_en: 'Together AI' },
  { id: 'perplexity', hosts: ['api.perplexity.ai'], provider_zh: 'Perplexity', provider_en: 'Perplexity' },
  { id: 'openrouter', hosts: ['openrouter.ai'], provider_zh: 'OpenRouter (聚合网关)', provider_en: 'OpenRouter (gateway)' },
  { id: 'huggingface', hosts: ['api-inference.huggingface.co', 'huggingface.co'], provider_zh: 'HuggingFace', provider_en: 'HuggingFace' },
  { id: 'xai', hosts: ['api.x.ai'], provider_zh: 'xAI Grok', provider_en: 'xAI Grok' },
]
37
+
38
/** Result of checking a piece of text for overseas LLM endpoints. */
export interface OverseasMatch {
  /** True when a known overseas LLM endpoint was recognised in the text. */
  isOverseas: boolean
  /** `id` of the matching endpoint table entry; set only on a hit. */
  endpointId?: string
  /** Provider display name (Chinese); set only on a hit. */
  provider_zh?: string
  /** Provider display name (English); set only on a hit. */
  provider_en?: string
  /** The table pattern that matched (an entry of `hosts`), not the literal host found in the text. */
  host?: string
}

/**
 * Tests one host-like token against one pattern from the endpoint table.
 *
 * Pattern kinds:
 * - full domain (`api.openai.com`): the candidate must END with it;
 * - trailing-dot fragment (`bedrock.`): must be followed by further labels;
 * - dot-less fragment (`bedrock-runtime`): must be delimited on the right too.
 *
 * In every case the pattern has to start at the beginning of the candidate or
 * right after a `.` / `-`. The hyphen is accepted because regional hosts such
 * as `us-central1-aiplatform.googleapis.com` prefix the service host with one.
 * Plain substring hits (`notopenai.com`, `api.openai.com.evil.cn`,
 * `amazonaws.com.cn`) are therefore rejected.
 */
function hostMatchesPattern(candidate: string, pattern: string): boolean {
  if (!pattern) return false
  const isBoundary = (ch: string | undefined): boolean => ch === undefined || ch === '.' || ch === '-'
  const openEnded = pattern.endsWith('.')
  const anchoredAtEnd = !openEnded && pattern.includes('.')

  let at = candidate.indexOf(pattern)
  while (at !== -1) {
    const end = at + pattern.length
    const leftOk = at === 0 || isBoundary(candidate[at - 1])
    let rightOk: boolean
    if (openEnded) rightOk = true
    else if (anchoredAtEnd) rightOk = end === candidate.length
    else rightOk = isBoundary(candidate[end])
    if (leftOk && rightOk) return true
    at = candidate.indexOf(pattern, at + 1)
  }
  return false
}

/**
 * Collects every host-like token of the (already lower-cased) text.
 * URL hosts come from `extractHosts`; the token scan additionally covers
 * scheme-less mentions such as `base_url: api.openai.com` or the CLI form
 * `aws bedrock-runtime invoke-model`. Trailing dots/hyphens (sentence
 * punctuation, FQDN root dot) are stripped and duplicates removed.
 */
function collectHostCandidates(lowerText: string): string[] {
  const rawTokens = [...extractHosts(lowerText), ...(lowerText.match(/[a-z0-9][a-z0-9.\-]*/g) || [])]
  const unique = new Set<string>()
  for (const token of rawTokens) {
    const host = token.replace(/[.\-]+$/, '')
    if (host) unique.add(host)
  }
  return Array.from(unique)
}

/**
 * Decides whether an arbitrary string (URL, base_url, command line, config
 * snippet) refers to an overseas LLM endpoint.
 *
 * Both full URLs (`https://...`) and bare host names are recognised. Matching
 * is case-insensitive and respects host-label boundaries, so look-alike
 * domains that merely contain a known host as a substring are not reported.
 *
 * @param text Text to inspect; empty input yields a negative result.
 * @returns `{ isOverseas: false }` when nothing matches; otherwise the first
 *   matching entry of `OVERSEAS_LLM_ENDPOINTS` (table order), with `host` set
 *   to the table pattern that matched.
 */
export function detectOverseasLLM(text: string): OverseasMatch {
  if (!text) return { isOverseas: false }

  const candidates = collectHostCandidates(text.toLowerCase())

  for (const ep of OVERSEAS_LLM_ENDPOINTS) {
    const pattern = ep.hosts.find(h => candidates.some(c => hostMatchesPattern(c, h)))
    if (pattern !== undefined) {
      return {
        isOverseas: true,
        endpointId: ep.id,
        provider_zh: ep.provider_zh,
        provider_en: ep.provider_en,
        host: pattern,
      }
    }
  }
  return { isOverseas: false }
}
76
+
77
+ // ===== 境外大模型 SDK 依赖检测 =====
78
+ // 几乎每个国产 AI 项目的依赖清单里都躺着 openai/anthropic 等境外 SDK —— 这是
79
+ // 一条"数据出境通道",命中率远高于扫 URL 字符串。英文工具不会把它框定为"出境"。
80
+
81
/** Package name (lowercase) → provider display names (`zh` / `en`). */
export const OVERSEAS_LLM_PACKAGES: Record<string, { zh: string; en: string }> = {
  // npm
  'openai': { zh: 'OpenAI', en: 'OpenAI' },
  '@anthropic-ai/sdk': { zh: 'Anthropic Claude', en: 'Anthropic Claude' },
  '@google/generative-ai': { zh: 'Google Gemini', en: 'Google Gemini' },
  '@google-cloud/aiplatform': { zh: 'Google Vertex AI', en: 'Google Vertex AI' },
  'cohere-ai': { zh: 'Cohere', en: 'Cohere' },
  '@mistralai/mistralai': { zh: 'Mistral AI', en: 'Mistral AI' },
  'groq-sdk': { zh: 'Groq', en: 'Groq' },
  '@huggingface/inference': { zh: 'HuggingFace', en: 'HuggingFace' },
  'replicate': { zh: 'Replicate', en: 'Replicate' },
  '@langchain/openai': { zh: 'OpenAI (LangChain)', en: 'OpenAI (LangChain)' },
  '@langchain/anthropic': { zh: 'Anthropic (LangChain)', en: 'Anthropic (LangChain)' },
  '@langchain/google-genai': { zh: 'Gemini (LangChain)', en: 'Gemini (LangChain)' },
  // python
  'anthropic': { zh: 'Anthropic Claude', en: 'Anthropic Claude' },
  'google-generativeai': { zh: 'Google Gemini', en: 'Google Gemini' },
  'google-cloud-aiplatform': { zh: 'Google Vertex AI', en: 'Google Vertex AI' },
  'cohere': { zh: 'Cohere', en: 'Cohere' },
  'mistralai': { zh: 'Mistral AI', en: 'Mistral AI' },
  'groq': { zh: 'Groq', en: 'Groq' },
  'together': { zh: 'Together AI', en: 'Together AI' },
  'huggingface-hub': { zh: 'HuggingFace', en: 'HuggingFace' },
  'langchain-openai': { zh: 'OpenAI (LangChain)', en: 'OpenAI (LangChain)' },
  'langchain-anthropic': { zh: 'Anthropic (LangChain)', en: 'Anthropic (LangChain)' },
  'langchain-google-genai': { zh: 'Gemini (LangChain)', en: 'Gemini (LangChain)' },
  // go
  'github.com/sashabaranov/go-openai': { zh: 'OpenAI (Go)', en: 'OpenAI (Go)' },
  'github.com/anthropics/anthropic-sdk-go': { zh: 'Anthropic (Go)', en: 'Anthropic (Go)' },
}

/** One overseas LLM SDK found in a dependency manifest. */
export interface DepMatch {
  /** Package name exactly as keyed in `OVERSEAS_LLM_PACKAGES`. */
  pkg: string
  /** Provider display name (Chinese). */
  provider_zh: string
  /** Provider display name (English). */
  provider_en: string
}

/** File names (lowercase) that are treated as dependency manifests. */
const DEPENDENCY_MANIFEST_NAMES = new Set(['package.json', 'requirements.txt', 'pyproject.toml', 'go.mod'])

/** Lower-cased last path segment of `filename`; accepts both `/` and `\` separators. */
function manifestBaseName(filename: string): string {
  const segments = filename.split(/[\\/]/)
  return (segments[segments.length - 1] || '').toLowerCase()
}

/**
 * Tells whether `filename` names a supported dependency manifest
 * (package.json, requirements.txt, pyproject.toml, go.mod).
 * The check is case-insensitive and ignores any leading directory path.
 */
export function isDependencyManifest(filename: string): boolean {
  return DEPENDENCY_MANIFEST_NAMES.has(manifestBaseName(filename))
}

/**
 * Extracts overseas LLM SDK dependencies from the content of a dependency manifest.
 *
 * - `package.json` is parsed as JSON and only the keys of `dependencies`,
 *   `devDependencies`, `peerDependencies` and `optionalDependencies` count.
 *   Malformed JSON yields an empty result (best-effort scan, never throws).
 * - Any other file is scanned line by line for package-name tokens. Comments
 *   (`#` / `//` at line start or after whitespace) are ignored, and each token
 *   is also compared in its PEP 503 normalised form (runs of `-`, `_`, `.`
 *   collapsed to `-`), so `huggingface_hub` is recognised as `huggingface-hub`.
 *
 * @param filename Manifest file name; a leading directory path is ignored.
 * @param content Raw text content of the manifest.
 * @returns One entry per known package found, in `OVERSEAS_LLM_PACKAGES` order.
 */
export function detectOverseasDeps(filename: string, content: string): DepMatch[] {
  const names = new Set<string>()

  if (manifestBaseName(filename) === 'package.json') {
    try {
      const parsed: unknown = JSON.parse(content)
      if (parsed && typeof parsed === 'object') {
        const manifest = parsed as Record<string, unknown>
        for (const field of ['dependencies', 'devDependencies', 'peerDependencies', 'optionalDependencies']) {
          const deps = manifest[field]
          if (deps && typeof deps === 'object') {
            for (const key of Object.keys(deps)) names.add(key.toLowerCase())
          }
        }
      }
    } catch {
      // Malformed package.json: deliberately report nothing rather than fail the scan.
    }
  } else {
    // requirements.txt / pyproject.toml / go.mod: loose, line-based token extraction.
    for (const raw of content.split(/\r?\n/)) {
      // Requiring whitespace (or line start) before the comment marker keeps
      // `https://...` URLs and `#egg=name` fragments intact.
      const line = raw.trim().replace(/(^|\s)(#|\/\/).*$/, '').trim().toLowerCase()
      if (!line) continue
      for (const token of line.match(/[a-z0-9_.\-\/@]+/g) || []) {
        names.add(token) // raw form: needed for Go module paths such as github.com/...
        names.add(token.replace(/[-_.]+/g, '-')) // PEP 503 normalised form for Python names
      }
    }
  }

  const matches: DepMatch[] = []
  for (const [pkg, prov] of Object.entries(OVERSEAS_LLM_PACKAGES)) {
    if (names.has(pkg.toLowerCase())) {
      matches.push({ pkg, provider_zh: prov.zh, provider_en: prov.en })
    }
  }
  return matches
}
164
+
165
/**
 * Roughly extracts the host of every http(s) URL found in the text.
 *
 * - An optional `user:password@` prefix is skipped, so the real host is
 *   returned rather than the user name.
 * - The port is not part of the result; trailing dots (FQDN root dot or
 *   sentence punctuation) are removed.
 * - Matching is case-insensitive and hosts are returned lower-cased, so the
 *   function also works when the caller has not lower-cased the text.
 *
 * @param lowerText Text to scan (callers pass it already lower-cased).
 * @returns Hosts in order of appearance; duplicates are kept.
 */
function extractHosts(lowerText: string): string[] {
  const hosts: string[] = []
  // The userinfo group may not cross '/', '?' or '#', so an '@' inside a path
  // or query string is never mistaken for a credentials separator.
  const urlRe = /https?:\/\/(?:[^\s\/?#@]*@)?([a-z0-9.\-]+)/gi
  let m: RegExpExecArray | null
  while ((m = urlRe.exec(lowerText)) !== null) {
    const host = (m[1] || '').toLowerCase().replace(/\.+$/, '')
    if (host) hosts.push(host)
  }
  return hosts
}