@agentgazer/shared 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/__tests__/normalize.test.d.ts +2 -0
- package/dist/__tests__/normalize.test.d.ts.map +1 -0
- package/dist/__tests__/normalize.test.js +159 -0
- package/dist/__tests__/normalize.test.js.map +1 -0
- package/dist/__tests__/pricing.test.js +14 -27
- package/dist/__tests__/pricing.test.js.map +1 -1
- package/dist/__tests__/providers.test.js +59 -32
- package/dist/__tests__/providers.test.js.map +1 -1
- package/dist/__tests__/simhash.test.d.ts +2 -0
- package/dist/__tests__/simhash.test.d.ts.map +1 -0
- package/dist/__tests__/simhash.test.js +107 -0
- package/dist/__tests__/simhash.test.js.map +1 -0
- package/dist/index.d.ts +6 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +19 -1
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/logger.js +3 -2
- package/dist/logger.js.map +1 -1
- package/dist/models.d.ts +4 -1
- package/dist/models.d.ts.map +1 -1
- package/dist/models.js +22 -34
- package/dist/models.js.map +1 -1
- package/dist/normalize.d.ts +35 -0
- package/dist/normalize.d.ts.map +1 -0
- package/dist/normalize.js +119 -0
- package/dist/normalize.js.map +1 -0
- package/dist/parsers.d.ts.map +1 -1
- package/dist/parsers.js +0 -1
- package/dist/parsers.js.map +1 -1
- package/dist/pricing.d.ts +6 -0
- package/dist/pricing.d.ts.map +1 -1
- package/dist/pricing.js +38 -8
- package/dist/pricing.js.map +1 -1
- package/dist/provider-validator.d.ts +19 -0
- package/dist/provider-validator.d.ts.map +1 -0
- package/dist/provider-validator.js +321 -0
- package/dist/provider-validator.js.map +1 -0
- package/dist/providers.d.ts +30 -2
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +105 -19
- package/dist/providers.js.map +1 -1
- package/dist/simhash.d.ts +28 -0
- package/dist/simhash.d.ts.map +1 -0
- package/dist/simhash.js +98 -0
- package/dist/simhash.js.map +1 -0
- package/package.json +1 -1
package/dist/providers.js
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.KNOWN_PROVIDER_NAMES = void 0;
|
|
3
|
+
exports.SELECTABLE_PROVIDER_NAMES = exports.KNOWN_PROVIDER_NAMES = void 0;
|
|
4
4
|
exports.detectProvider = detectProvider;
|
|
5
5
|
exports.detectProviderByHostname = detectProviderByHostname;
|
|
6
6
|
exports.getProviderBaseUrl = getProviderBaseUrl;
|
|
7
|
+
exports.getProviderRootUrl = getProviderRootUrl;
|
|
8
|
+
exports.providerUsesPathRouting = providerUsesPathRouting;
|
|
9
|
+
exports.getProviderChatEndpoint = getProviderChatEndpoint;
|
|
7
10
|
exports.getProviderAuthHeader = getProviderAuthHeader;
|
|
8
11
|
exports.parsePathPrefix = parsePathPrefix;
|
|
12
|
+
exports.rewriteProviderPath = rewriteProviderPath;
|
|
9
13
|
exports.parseAgentPath = parseAgentPath;
|
|
10
14
|
/** All known provider names (excludes "unknown"). Single source of truth. */
|
|
11
15
|
exports.KNOWN_PROVIDER_NAMES = [
|
|
@@ -18,9 +22,23 @@ exports.KNOWN_PROVIDER_NAMES = [
|
|
|
18
22
|
"moonshot",
|
|
19
23
|
"zhipu",
|
|
20
24
|
"minimax",
|
|
21
|
-
"baichuan",
|
|
22
25
|
"yi",
|
|
23
26
|
];
|
|
27
|
+
/**
|
|
28
|
+
* Provider names available for user selection in UI/CLI.
|
|
29
|
+
* Excludes providers without active API access.
|
|
30
|
+
*/
|
|
31
|
+
exports.SELECTABLE_PROVIDER_NAMES = [
|
|
32
|
+
"openai",
|
|
33
|
+
"anthropic",
|
|
34
|
+
"google",
|
|
35
|
+
"mistral",
|
|
36
|
+
"cohere",
|
|
37
|
+
"deepseek",
|
|
38
|
+
"moonshot",
|
|
39
|
+
"zhipu",
|
|
40
|
+
"minimax",
|
|
41
|
+
];
|
|
24
42
|
const PROVIDER_PATTERNS = [
|
|
25
43
|
{
|
|
26
44
|
name: "openai",
|
|
@@ -50,7 +68,7 @@ const PROVIDER_PATTERNS = [
|
|
|
50
68
|
},
|
|
51
69
|
{
|
|
52
70
|
name: "moonshot",
|
|
53
|
-
hostPatterns: [/^api\.moonshot\.cn$/],
|
|
71
|
+
hostPatterns: [/^api\.moonshot\.ai$/, /^api\.moonshot\.cn$/],
|
|
54
72
|
},
|
|
55
73
|
{
|
|
56
74
|
name: "zhipu",
|
|
@@ -58,15 +76,11 @@ const PROVIDER_PATTERNS = [
|
|
|
58
76
|
},
|
|
59
77
|
{
|
|
60
78
|
name: "minimax",
|
|
61
|
-
hostPatterns: [/^api\.minimax\.chat$/],
|
|
62
|
-
},
|
|
63
|
-
{
|
|
64
|
-
name: "baichuan",
|
|
65
|
-
hostPatterns: [/^api\.baichuan-ai\.com$/],
|
|
79
|
+
hostPatterns: [/^api\.minimax\.io$/, /^api\.minimax\.chat$/],
|
|
66
80
|
},
|
|
67
81
|
{
|
|
68
82
|
name: "yi",
|
|
69
|
-
hostPatterns: [/^api\.lingyiwanwu\.com$/],
|
|
83
|
+
hostPatterns: [/^api\.01\.ai$/, /^api\.lingyiwanwu\.com$/],
|
|
70
84
|
},
|
|
71
85
|
];
|
|
72
86
|
function detectProvider(url) {
|
|
@@ -123,26 +137,76 @@ function detectProviderByHostname(url) {
|
|
|
123
137
|
return "unknown";
|
|
124
138
|
}
|
|
125
139
|
/**
 * Returns the versioned base URL for a provider.
 *
 * @param provider - Provider name (e.g. "openai").
 * @returns The base URL string, or null when the name is not in the table.
 */
function getProviderBaseUrl(provider) {
    // Base URLs include version path so users can set OPENAI_BASE_URL=http://localhost:4000/openai
    // and the SDK will send /openai/chat/completions which becomes /v1/chat/completions
    const urls = {
        openai: "https://api.openai.com/v1",
        anthropic: "https://api.anthropic.com/v1",
        google: "https://generativelanguage.googleapis.com/v1beta/openai",
        mistral: "https://api.mistral.ai/v1",
        cohere: "https://api.cohere.com/v2",
        deepseek: "https://api.deepseek.com/v1",
        moonshot: "https://api.moonshot.ai/v1",
        zhipu: "https://api.z.ai/api/paas/v4",
        minimax: "https://api.minimax.io/v1",
        yi: "https://api.01.ai/v1",
    };
    // Own-property check: a bare `urls[provider]` would also resolve inherited
    // keys such as "constructor" or "__proto__" to non-null Object.prototype
    // members, so an arbitrary string could yield a non-URL value.
    return Object.prototype.hasOwnProperty.call(urls, provider) ? urls[provider] : null;
}
|
|
156
|
+
/**
 * Returns the root API URL for a provider for path-based routing.
 * Includes version prefix where needed (e.g., /v1beta for Google).
 * Used for providers that need path-based routing (e.g., Google's native API).
 *
 * @param provider - Provider name (e.g. "google").
 * @returns The root URL string, or null when the name is not in the table.
 */
function getProviderRootUrl(provider) {
    const urls = {
        openai: "https://api.openai.com",
        anthropic: "https://api.anthropic.com",
        google: "https://generativelanguage.googleapis.com/v1beta", // Include version prefix
        mistral: "https://api.mistral.ai",
        cohere: "https://api.cohere.com",
        deepseek: "https://api.deepseek.com",
        moonshot: "https://api.moonshot.ai",
        zhipu: "https://api.z.ai",
        minimax: "https://api.minimax.io",
        yi: "https://api.01.ai",
    };
    // Own-property check: a bare `urls[provider]` would also resolve inherited
    // keys such as "constructor" or "__proto__" to non-null Object.prototype
    // members, so an arbitrary string could yield a non-URL value.
    return Object.prototype.hasOwnProperty.call(urls, provider) ? urls[provider] : null;
}
|
|
176
|
+
/**
 * Tells whether the client-supplied trailing path must be forwarded as-is
 * for this provider, rather than being replaced with a fixed endpoint.
 *
 * @param provider - Provider name.
 * @returns true only for "google"; false for every other value.
 */
function providerUsesPathRouting(provider) {
    switch (provider) {
        // Google's native API uses paths like /v1beta/models/{model}:generateContent
        case "google":
            return true;
        default:
            return false;
    }
}
|
|
184
|
+
/**
 * Returns the complete chat endpoint URL for a provider.
 * This is the full URL including path - no additional path construction needed.
 *
 * @param provider - Provider name (e.g. "anthropic").
 * @returns The endpoint URL string, or null for unknown providers.
 */
function getProviderChatEndpoint(provider) {
    const endpoints = {
        openai: "https://api.openai.com/v1/chat/completions",
        anthropic: "https://api.anthropic.com/v1/messages",
        google: "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
        mistral: "https://api.mistral.ai/v1/chat/completions",
        cohere: "https://api.cohere.com/v2/chat",
        deepseek: "https://api.deepseek.com/v1/chat/completions",
        moonshot: "https://api.moonshot.ai/v1/chat/completions",
        zhipu: "https://api.z.ai/api/paas/v4/chat/completions",
        minimax: "https://api.minimax.io/v1/text/chatcompletion_v2",
        yi: "https://api.01.ai/v1/chat/completions",
    };
    // Own-property check: a bare `endpoints[provider]` would also resolve
    // inherited keys such as "constructor" or "__proto__" to non-null
    // Object.prototype members, so an arbitrary string could yield a non-URL value.
    return Object.prototype.hasOwnProperty.call(endpoints, provider) ? endpoints[provider] : null;
}
|
|
141
204
|
/**
|
|
142
205
|
* Returns the auth header name and value for a given provider.
|
|
143
206
|
* Different providers use different header conventions.
|
|
207
|
+
* @param useNativeApi - For Google, whether to use native API auth (x-goog-api-key) vs OpenAI-compatible (Bearer)
|
|
144
208
|
*/
|
|
145
|
-
function getProviderAuthHeader(provider, apiKey) {
|
|
209
|
+
function getProviderAuthHeader(provider, apiKey, useNativeApi = false) {
|
|
146
210
|
switch (provider) {
|
|
147
211
|
case "openai":
|
|
148
212
|
case "mistral":
|
|
@@ -151,13 +215,16 @@ function getProviderAuthHeader(provider, apiKey) {
|
|
|
151
215
|
case "moonshot":
|
|
152
216
|
case "zhipu":
|
|
153
217
|
case "minimax":
|
|
154
|
-
case "baichuan":
|
|
155
218
|
case "yi":
|
|
156
219
|
return { name: "authorization", value: `Bearer ${apiKey}` };
|
|
157
220
|
case "anthropic":
|
|
158
221
|
return { name: "x-api-key", value: apiKey };
|
|
159
222
|
case "google":
|
|
160
|
-
|
|
223
|
+
// Google native API uses x-goog-api-key, OpenAI-compatible uses Bearer
|
|
224
|
+
if (useNativeApi) {
|
|
225
|
+
return { name: "x-goog-api-key", value: apiKey };
|
|
226
|
+
}
|
|
227
|
+
return { name: "authorization", value: `Bearer ${apiKey}` };
|
|
161
228
|
default:
|
|
162
229
|
return null;
|
|
163
230
|
}
|
|
@@ -174,10 +241,29 @@ function parsePathPrefix(path) {
|
|
|
174
241
|
const segment = match[1].toLowerCase();
|
|
175
242
|
const rest = match[2] ?? "/";
|
|
176
243
|
if (exports.KNOWN_PROVIDER_NAMES.includes(segment)) {
|
|
177
|
-
|
|
244
|
+
// Apply path rewriting for non-OpenAI-compatible providers
|
|
245
|
+
const rewrittenPath = rewriteProviderPath(segment, rest);
|
|
246
|
+
return { provider: segment, remainingPath: rewrittenPath };
|
|
178
247
|
}
|
|
179
248
|
return null;
|
|
180
249
|
}
|
|
250
|
+
/**
 * Translates an OpenAI-style request path into the path a specific provider
 * expects. Only MiniMax is rewritten; for every other provider the path is
 * returned untouched.
 *
 * @param provider - Provider name taken from the path prefix.
 * @param path - Remaining request path (may carry a query string).
 * @returns The provider-specific path.
 */
function rewriteProviderPath(provider, path) {
    const openAiChatPath = "/v1/chat/completions";
    const miniMaxChatPath = "/v1/text/chatcompletion_v2";
    // No rewrite is applied to any provider other than MiniMax.
    if (provider !== "minimax") {
        return path;
    }
    // MiniMax uses /v1/text/chatcompletion_v2 instead of /v1/chat/completions.
    // Match the exact path, or the path followed directly by a query string.
    const isChatPath = path === openAiChatPath || path.startsWith(`${openAiChatPath}?`);
    if (!isChatPath) {
        return path;
    }
    // The match is anchored at index 0, so swapping the prefix keeps any query string.
    return miniMaxChatPath + path.slice(openAiChatPath.length);
}
|
|
181
267
|
/**
|
|
182
268
|
* Parse a URL path to extract an agent ID from /agents/{id}/... format.
|
|
183
269
|
* Given "/agents/my-bot/openai/v1/chat/completions" returns:
|
package/dist/providers.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"providers.js","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":";;;
|
|
1
|
+
{"version":3,"file":"providers.js","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":";;;AA8FA,wCA4BC;AAMD,4DAmBC;AAED,gDAgBC;AAOD,gDAcC;AAMD,0DAGC;AAOD,0DAcC;AAOD,sDA0BC;AAOD,0CAaC;AAMD,kDAYC;AAQD,wCAQC;AA5RD,6EAA6E;AAChE,QAAA,oBAAoB,GAAmB;IAClD,QAAQ;IACR,WAAW;IACX,QAAQ;IACR,SAAS;IACT,QAAQ;IACR,UAAU;IACV,UAAU;IACV,OAAO;IACP,SAAS;IACT,IAAI;CACL,CAAC;AAEF;;;GAGG;AACU,QAAA,yBAAyB,GAAmB;IACvD,QAAQ;IACR,WAAW;IACX,QAAQ;IACR,SAAS;IACT,QAAQ;IACR,UAAU;IACV,UAAU;IACV,OAAO;IACP,SAAS;CACV,CAAC;AAEF,MAAM,iBAAiB,GAAsB;IAC3C;QACE,IAAI,EAAE,QAAQ;QACd,YAAY,EAAE,CAAC,oBAAoB,CAAC;QACpC,YAAY,EAAE,CAAC,yBAAyB,EAAE,mBAAmB,CAAC;KAC/D;IACD;QACE,IAAI,EAAE,WAAW;QACjB,YAAY,EAAE,CAAC,uBAAuB,CAAC;QACvC,YAAY,EAAE,CAAC,gBAAgB,CAAC;KACjC;IACD;QACE,IAAI,EAAE,QAAQ;QACd,YAAY,EAAE,CAAC,uCAAuC,CAAC;KACxD;IACD;QACE,IAAI,EAAE,SAAS;QACf,YAAY,EAAE,CAAC,oBAAoB,CAAC;KACrC;IACD;QACE,IAAI,EAAE,QAAQ;QACd,YAAY,EAAE,CAAC,oBAAoB,EAAE,mBAAmB,CAAC;KAC1D;IACD;QACE,IAAI,EAAE,UAAU;QAChB,YAAY,EAAE,CAAC,sBAAsB,CAAC;KACvC;IACD;QACE,IAAI,EAAE,UAAU;QAChB,YAAY,EAAE,CAAC,qBAAqB,EAAE,qBAAqB,CAAC;KAC7D;IACD;QACE,IAAI,EAAE,OAAO;QACb,YAAY,EAAE,CAAC,sBAAsB,EAAE,cAAc,CAAC;KACvD;IACD;QACE,IAAI,EAAE,SAAS;QACf,YAAY,EAAE,CAAC,oBAAoB,EAAE,sBAAsB,CAAC;KAC7D;IACD;QACE,IAAI,EAAE,IAAI;QACV,YAAY,EAAE,CAAC,eAAe,EAAE,yBAAyB,CAAC;KAC3D;CACF,CAAC;AAEF,SAAgB,cAAc,CAAC,GAAW;IACxC,IAAI,QAAQ,GAAG,EAAE,CAAC;IAClB,IAAI,QAAQ,GAAG,GAAG,CAAC;IACnB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;QAC3B,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,6DAA6D;IAC/D,CAAC;IAED,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;QACzC,IAAI,QAAQ,EAAE,CAAC;YACb,KAAK,MAAM,WAAW,IAAI,QAAQ,CAAC,YAAY,EAAE,CAAC;gBAChD,IAAI,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAC/B,OAAO,QAAQ,CAAC,IAAI,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;QACD,IAAI,QAAQ,CAAC,YAAY,EAAE,CAAC;YAC1B,KAAK,MAAM,WAAW,IAAI,QAAQ,CAAC,YAAY,EAAE,CAAC;gBAChD,IAAI,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAC/B,OAAO,QAAQ,CAAC,IAAI,CAAC;gBACvB,CAA
C;YACH,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;GAGG;AACH,SAAgB,wBAAwB,CAAC,GAAW;IAClD,IAAI,QAAQ,GAAG,EAAE,CAAC;IAClB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAEhC,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;QACzC,KAAK,MAAM,WAAW,IAAI,QAAQ,CAAC,YAAY,EAAE,CAAC;YAChD,IAAI,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC/B,OAAO,QAAQ,CAAC,IAAI,CAAC;YACvB,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAgB,kBAAkB,CAAC,QAAsB;IACvD,+FAA+F;IAC/F,oFAAoF;IACpF,MAAM,IAAI,GAA2B;QACnC,MAAM,EAAE,2BAA2B;QACnC,SAAS,EAAE,8BAA8B;QACzC,MAAM,EAAE,yDAAyD;QACjE,OAAO,EAAE,2BAA2B;QACpC,MAAM,EAAE,2BAA2B;QACnC,QAAQ,EAAE,6BAA6B;QACvC,QAAQ,EAAE,4BAA4B;QACtC,KAAK,EAAE,8BAA8B;QACrC,OAAO,EAAE,2BAA2B;QACpC,EAAE,EAAE,sBAAsB;KAC3B,CAAC;IACF,OAAO,IAAI,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC;AAChC,CAAC;AAED;;;;GAIG;AACH,SAAgB,kBAAkB,CAAC,QAAsB;IACvD,MAAM,IAAI,GAA2B;QACnC,MAAM,EAAE,wBAAwB;QAChC,SAAS,EAAE,2BAA2B;QACtC,MAAM,EAAE,kDAAkD,EAAG,yBAAyB;QACtF,OAAO,EAAE,wBAAwB;QACjC,MAAM,EAAE,wBAAwB;QAChC,QAAQ,EAAE,0BAA0B;QACpC,QAAQ,EAAE,yBAAyB;QACnC,KAAK,EAAE,kBAAkB;QACzB,OAAO,EAAE,wBAAwB;QACjC,EAAE,EAAE,mBAAmB;KACxB,CAAC;IACF,OAAO,IAAI,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC;AAChC,CAAC;AAED;;;GAGG;AACH,SAAgB,uBAAuB,CAAC,QAAsB;IAC5D,6EAA6E;IAC7E,OAAO,QAAQ,KAAK,QAAQ,CAAC;AAC/B,CAAC;AAED;;;;GAIG;AACH,SAAgB,uBAAuB,CAAC,QAAsB;IAC5D,MAAM,SAAS,GAA2B;QACxC,MAAM,EAAE,4CAA4C;QACpD,SAAS,EAAE,uCAAuC;QAClD,MAAM,EAAE,0EAA0E;QAClF,OAAO,EAAE,4CAA4C;QACrD,MAAM,EAAE,gCAAgC;QACxC,QAAQ,EAAE,8CAA8C;QACxD,QAAQ,EAAE,6CAA6C;QACvD,KAAK,EAAE,+CAA+C;QACtD,OAAO,EAAE,kDAAkD;QAC3D,EAAE,EAAE,uCAAuC;KAC5C,CAAC;IACF,OAAO,SAAS,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC;AACrC,CAAC;AAED;;;;GAIG;AACH,SAAgB,qBAAqB,CACnC,QAAsB,EACtB,MAAc,EACd,eAAwB,KAAK;IAE7B,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,QAAQ,CAAC;QACd,KAAK,SAAS,CAAC;QACf,KAAK,QAAQ,CAAC;QACd,KAAK,UAAU,CAAC;QAChB,KAAK,UAAU,CAAC;QAChB,KAAK,OAAO,CAAC;QACb,
KAAK,SAAS,CAAC;QACf,KAAK,IAAI;YACP,OAAO,EAAE,IAAI,EAAE,eAAe,EAAE,KAAK,EAAE,UAAU,MAAM,EAAE,EAAE,CAAC;QAC9D,KAAK,WAAW;YACd,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;QAC9C,KAAK,QAAQ;YACX,uEAAuE;YACvE,IAAI,YAAY,EAAE,CAAC;gBACjB,OAAO,EAAE,IAAI,EAAE,gBAAgB,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;YACnD,CAAC;YACD,OAAO,EAAE,IAAI,EAAE,eAAe,EAAE,KAAK,EAAE,UAAU,MAAM,EAAE,EAAE,CAAC;QAC9D;YACE,OAAO,IAAI,CAAC;IAChB,CAAC;AACH,CAAC;AAED;;;;GAIG;AACH,SAAgB,eAAe,CAC7B,IAAY;IAEZ,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;IAC/C,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;IACvC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;IAC7B,IAAI,4BAAoB,CAAC,QAAQ,CAAC,OAAuB,CAAC,EAAE,CAAC;QAC3D,2DAA2D;QAC3D,MAAM,aAAa,GAAG,mBAAmB,CAAC,OAAuB,EAAE,IAAI,CAAC,CAAC;QACzE,OAAO,EAAE,QAAQ,EAAE,OAAuB,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC;IAC7E,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,SAAgB,mBAAmB,CAAC,QAAsB,EAAE,IAAY;IACtE,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,SAAS;YACZ,0EAA0E;YAC1E,IAAI,IAAI,KAAK,sBAAsB,IAAI,IAAI,CAAC,UAAU,CAAC,uBAAuB,CAAC,EAAE,CAAC;gBAChF,OAAO,IAAI,CAAC,OAAO,CAAC,sBAAsB,EAAE,4BAA4B,CAAC,CAAC;YAC5E,CAAC;YACD,MAAM;QACR,oFAAoF;QACpF,uDAAuD;IACzD,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;GAKG;AACH,SAAgB,cAAc,CAC5B,IAAY;IAEZ,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IACvD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IACzB,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;IAC7B,OAAO,EAAE,OAAO,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC;AAC1C,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SimHash - Locality Sensitive Hashing for text similarity detection.
|
|
3
|
+
* Used by Google for web page deduplication.
|
|
4
|
+
*
|
|
5
|
+
* Similar texts produce hashes with small Hamming distance.
|
|
6
|
+
* Different texts produce hashes with large Hamming distance.
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Compute 64-bit SimHash for text.
|
|
10
|
+
*
|
|
11
|
+
* Algorithm:
|
|
12
|
+
* 1. Tokenize text into n-grams
|
|
13
|
+
* 2. Hash each token to 64-bit value
|
|
14
|
+
* 3. For each bit position, sum +1 if bit is 1, -1 if bit is 0
|
|
15
|
+
* 4. Final hash: bit is 1 if sum > 0, else 0
|
|
16
|
+
*/
|
|
17
|
+
export declare function computeSimHash(text: string): bigint;
|
|
18
|
+
/**
|
|
19
|
+
* Compute Hamming distance between two SimHash values.
|
|
20
|
+
* Returns the number of differing bits.
|
|
21
|
+
*/
|
|
22
|
+
export declare function hammingDistance(a: bigint, b: bigint): number;
|
|
23
|
+
/**
|
|
24
|
+
* Check if two SimHash values are similar.
|
|
25
|
+
* Default threshold of 3 bits difference is commonly used.
|
|
26
|
+
*/
|
|
27
|
+
export declare function isSimilar(a: bigint, b: bigint, threshold?: number): boolean;
|
|
28
|
+
//# sourceMappingURL=simhash.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"simhash.d.ts","sourceRoot":"","sources":["../src/simhash.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAiCH;;;;;;;;GAQG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CA8BnD;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAW5D;AAED;;;GAGG;AACH,wBAAgB,SAAS,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,SAAI,GAAG,OAAO,CAEtE"}
|
package/dist/simhash.js
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* SimHash - Locality Sensitive Hashing for text similarity detection.
|
|
4
|
+
* Used by Google for web page deduplication.
|
|
5
|
+
*
|
|
6
|
+
* Similar texts produce hashes with small Hamming distance.
|
|
7
|
+
* Different texts produce hashes with large Hamming distance.
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.computeSimHash = computeSimHash;
|
|
11
|
+
exports.hammingDistance = hammingDistance;
|
|
12
|
+
exports.isSimilar = isSimilar;
|
|
13
|
+
// 64-bit SimHash using bigint
|
|
14
|
+
const HASH_BITS = 64n;
|
|
15
|
+
/**
 * Hashes a string to an unsigned 64-bit bigint using an FNV-1a style loop:
 * each UTF-16 code unit is XORed into the accumulator, which is then
 * multiplied by the FNV prime and truncated to 64 bits.
 *
 * @param token - String to hash.
 * @returns The hash as a non-negative bigint below 2^64.
 */
function hashToken(token) {
    const offsetBasis = 0xcbf29ce484222325n; // FNV offset basis
    const prime = 0x100000001b3n; // FNV prime
    let acc = offsetBasis;
    let pos = 0;
    while (pos < token.length) {
        const mixed = acc ^ BigInt(token.charCodeAt(pos));
        // Keep only the low 64 bits, since bigint multiplication never overflows.
        acc = BigInt.asUintN(64, mixed * prime);
        pos += 1;
    }
    return acc;
}
|
|
26
|
+
/**
 * Tokenize text into overlapping character n-grams (default: 3-grams).
 *
 * The text is lower-cased, whitespace runs are collapsed to a single space,
 * and leading/trailing whitespace is removed before slicing.
 *
 * @param text - Input text.
 * @param n - N-gram length in characters (default 3).
 * @returns The n-grams in order. Text shorter than n yields a single token
 *   holding the whole normalized text; empty or whitespace-only text yields
 *   an empty array.
 */
function tokenize(text, n = 3) {
    const normalized = text.toLowerCase().replace(/\s+/g, " ").trim();
    // Nothing to tokenize: return no tokens rather than a single "" token,
    // so callers can detect empty input by checking the array length.
    if (normalized.length === 0) {
        return [];
    }
    if (normalized.length < n) {
        return [normalized];
    }
    const tokens = [];
    for (let i = 0; i <= normalized.length - n; i++) {
        tokens.push(normalized.slice(i, i + n));
    }
    return tokens;
}
|
|
40
|
+
/**
 * Compute 64-bit SimHash for text.
 *
 * Steps:
 * 1. Split the text into tokens via tokenize()
 * 2. Hash every token with hashToken()
 * 3. Keep one tally per bit position: +1 when the token hash has that bit
 *    set, -1 when it does not
 * 4. Output bit i is 1 when tally i ends up strictly positive, else 0
 *
 * @param text - Input text.
 * @returns The SimHash as a bigint; 0n when tokenization yields no tokens.
 */
function computeSimHash(text) {
    const grams = tokenize(text);
    if (grams.length === 0) {
        return 0n;
    }
    const bitCount = 64;
    // One signed tally per bit position.
    const tallies = Array.from({ length: bitCount }, () => 0);
    for (const gram of grams) {
        // Consume the hash from the least-significant bit upward.
        let remaining = hashToken(gram);
        for (let pos = 0; pos < bitCount; pos++) {
            tallies[pos] += (remaining & 1n) === 1n ? 1 : -1;
            remaining >>= 1n;
        }
    }
    // Set bit `pos` for every strictly positive tally.
    return tallies.reduce((acc, tally, pos) => (tally > 0 ? acc | (1n << BigInt(pos)) : acc), 0n);
}
|
|
77
|
+
/**
 * Compute Hamming distance between two SimHash values.
 *
 * @param a - First hash (bigint).
 * @param b - Second hash (bigint).
 * @returns The number of differing bits. When exactly one operand is
 *   negative (e.g. a hash that round-tripped through a signed 64-bit
 *   integer), the difference is interpreted as a 64-bit two's-complement
 *   pattern.
 */
function hammingDistance(a, b) {
    let xor = a ^ b;
    // A negative XOR has infinitely many leading one bits as a bigint; the
    // loop below would exit immediately and report 0. Reinterpret it as an
    // unsigned 64-bit pattern so the differing bits are actually counted.
    if (xor < 0n) {
        xor = BigInt.asUintN(64, xor);
    }
    let count = 0;
    // Count set bits (Brian Kernighan's algorithm): each step clears the lowest set bit.
    while (xor > 0n) {
        xor &= xor - 1n;
        count++;
    }
    return count;
}
|
|
91
|
+
/**
 * Decides whether two SimHash values count as similar, i.e. whether their
 * Hamming distance does not exceed the threshold.
 *
 * @param a - First hash (bigint).
 * @param b - Second hash (bigint).
 * @param threshold - Maximum number of differing bits still considered
 *   similar (default 3).
 * @returns true when hammingDistance(a, b) <= threshold.
 */
function isSimilar(a, b, threshold = 3) {
    const differingBits = hammingDistance(a, b);
    const withinThreshold = differingBits <= threshold;
    return withinThreshold;
}
|
|
98
|
+
//# sourceMappingURL=simhash.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"simhash.js","sourceRoot":"","sources":["../src/simhash.ts"],"names":[],"mappings":";AAAA;;;;;;GAMG;;AA0CH,wCA8BC;AAMD,0CAWC;AAMD,8BAEC;AA/FD,8BAA8B;AAC9B,MAAM,SAAS,GAAG,GAAG,CAAC;AAEtB;;GAEG;AACH,SAAS,SAAS,CAAC,KAAa;IAC9B,IAAI,IAAI,GAAG,mBAAmB,CAAC,CAAC,mBAAmB;IACnD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,IAAI,IAAI,MAAM,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;QACpC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,EAAE,EAAE,IAAI,GAAG,cAAc,CAAC,CAAC,CAAC,YAAY;IAChE,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,CAAC,GAAG,CAAC;IACnC,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAClE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,OAAO,CAAC,UAAU,CAAC,CAAC;IACtB,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAChD,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IAC1C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;;GAQG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,+CAA+C;IAC/C,MAAM,MAAM,GAAa,IAAI,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAE/C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC;QAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5B,wBAAwB;YACxB,IAAI,CAAC,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC;gBAC7B,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;YACd,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;YACd,CAAC;QACH,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,IAAI,MAAM,GAAG,EAAE,CAAC;IAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,IAAI,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YAClB,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAgB,eAAe,CAAC,CAAS,EAAE,CAAS;IACl
D,IAAI,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;IAChB,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,+CAA+C;IAC/C,OAAO,GAAG,GAAG,EAAE,EAAE,CAAC;QAChB,GAAG,IAAI,GAAG,GAAG,EAAE,CAAC;QAChB,KAAK,EAAE,CAAC;IACV,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;GAGG;AACH,SAAgB,SAAS,CAAC,CAAS,EAAE,CAAS,EAAE,SAAS,GAAG,CAAC;IAC3D,OAAO,eAAe,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC;AAC5C,CAAC"}
|