toonify-mcp 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.es.md +270 -0
- package/README.id.md +270 -0
- package/README.ja.md +270 -0
- package/README.md +21 -10
- package/README.vi.md +270 -0
- package/README.zh-CN.md +270 -0
- package/README.zh-TW.md +27 -16
- package/dist/metrics/metrics-collector.d.ts +2 -0
- package/dist/metrics/metrics-collector.d.ts.map +1 -1
- package/dist/metrics/metrics-collector.js +43 -8
- package/dist/metrics/metrics-collector.js.map +1 -1
- package/dist/optimizer/caching/cache-optimizer.d.ts +53 -0
- package/dist/optimizer/caching/cache-optimizer.d.ts.map +1 -0
- package/dist/optimizer/caching/cache-optimizer.js +176 -0
- package/dist/optimizer/caching/cache-optimizer.js.map +1 -0
- package/dist/optimizer/caching/cache-strategies.d.ts +19 -0
- package/dist/optimizer/caching/cache-strategies.d.ts.map +1 -0
- package/dist/optimizer/caching/cache-strategies.js +62 -0
- package/dist/optimizer/caching/cache-strategies.js.map +1 -0
- package/dist/optimizer/caching/cache-types.d.ts +36 -0
- package/dist/optimizer/caching/cache-types.d.ts.map +1 -0
- package/dist/optimizer/caching/cache-types.js +5 -0
- package/dist/optimizer/caching/cache-types.js.map +1 -0
- package/dist/optimizer/caching/index.d.ts +7 -0
- package/dist/optimizer/caching/index.d.ts.map +1 -0
- package/dist/optimizer/caching/index.js +7 -0
- package/dist/optimizer/caching/index.js.map +1 -0
- package/dist/optimizer/multilingual/index.d.ts +7 -0
- package/dist/optimizer/multilingual/index.d.ts.map +1 -0
- package/dist/optimizer/multilingual/index.js +7 -0
- package/dist/optimizer/multilingual/index.js.map +1 -0
- package/dist/optimizer/multilingual/language-detector.d.ts +43 -0
- package/dist/optimizer/multilingual/language-detector.d.ts.map +1 -0
- package/dist/optimizer/multilingual/language-detector.js +161 -0
- package/dist/optimizer/multilingual/language-detector.js.map +1 -0
- package/dist/optimizer/multilingual/language-profiles.d.ts +34 -0
- package/dist/optimizer/multilingual/language-profiles.d.ts.map +1 -0
- package/dist/optimizer/multilingual/language-profiles.js +196 -0
- package/dist/optimizer/multilingual/language-profiles.js.map +1 -0
- package/dist/optimizer/multilingual/tokenizer-adapter.d.ts +47 -0
- package/dist/optimizer/multilingual/tokenizer-adapter.d.ts.map +1 -0
- package/dist/optimizer/multilingual/tokenizer-adapter.js +96 -0
- package/dist/optimizer/multilingual/tokenizer-adapter.js.map +1 -0
- package/dist/optimizer/token-optimizer.d.ts +11 -1
- package/dist/optimizer/token-optimizer.d.ts.map +1 -1
- package/dist/optimizer/token-optimizer.js +49 -8
- package/dist/optimizer/token-optimizer.js.map +1 -1
- package/dist/optimizer/types.d.ts +15 -0
- package/dist/optimizer/types.d.ts.map +1 -1
- package/package.json +2 -2
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache-optimizer.js","sourceRoot":"","sources":["../../../src/optimizer/caching/cache-optimizer.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAEpD,MAAM,OAAO,cAAc;IACjB,MAAM,CAAc;IACpB,OAAO,CAAe;IAE9B,YAAY,SAA+B,EAAE;QAC3C,IAAI,CAAC,MAAM,GAAG;YACZ,OAAO,EAAE,IAAI;YACb,QAAQ,EAAE,MAAM;YAChB,GAAG,EAAE,OAAO;YACZ,kBAAkB,EAAE,IAAI;YACxB,kBAAkB,EAAE,IAAI;YACxB,GAAG,MAAM;SACV,CAAC;QAEF,IAAI,CAAC,OAAO,GAAG;YACb,SAAS,EAAE,CAAC;YACZ,WAAW,EAAE,CAAC;YACd,YAAY,EAAE,CAAC;YACf,qBAAqB,EAAE,CAAC;YACxB,oBAAoB,EAAE,CAAC;YACvB,sBAAsB,EAAE,CAAC;SAC1B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,eAAe,CACb,WAAmB,EACnB,QAAgB,EAChB,MAA+B,EAC/B,cAAsB,EACtB,eAAuB;QAEvB,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,kBAAkB,EAAE,CAAC;YAC5D,8BAA8B;YAC9B,OAAO;gBACL,YAAY,EAAE,EAAE;gBAChB,cAAc,EAAE,IAAI,CAAC,gBAAgB,CAAC,WAAW,EAAE,QAAQ,EAAE,MAAM,CAAC;gBACpE,eAAe,EAAE,KAAK;aACvB,CAAC;QACJ,CAAC;QAED,+CAA+C;QAC/C,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QACnD,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,WAAW,EAAE,eAAe,CAAC,EAAE,CAAC;YACxD,OAAO;gBACL,YAAY,EAAE,EAAE;gBAChB,cAAc,EAAE,IAAI,CAAC,gBAAgB,CAAC,WAAW,EAAE,QAAQ,EAAE,MAAM,CAAC;gBACpE,eAAe,EAAE,KAAK;aACvB,CAAC;QACJ,CAAC;QAED,iCAAiC;QACjC,MAAM,YAAY,GAAG,IAAI,CAAC,kBAAkB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;QAE/D,0CAA0C;QAC1C,MAAM,cAAc,GAAG,aAAa,WAAW,EAAE,CAAC;QAElD,sBAAsB;QACtB,MAAM,kBAAkB,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,iBAAiB;QAErE,MAAM,QAAQ,GAAkB;YAC9B,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ,KAAK,MAAM,CAAC,CAAC;gBACzC,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;gBAC1D,IAAI,CAAC,MAAM,CAAC,QAAQ;YACtB,kBAAkB;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,GAAG;SACrB,CAAC;QAEF,iBAAiB;QACjB,IAAI,CAAC,OAAO,CAAC,oBAAoB,IAAI,kBAAkB,CAAC;QAExD,OAAO;YACL,YAAY;YACZ,cAAc;YACd,eAAe,EAAE,IAAI;YACrB,aAAa,EAAE,QAAQ;SACxB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,QAAgB,EAAE,MAAc;QACzD,OAAO,sCAAsC,QAAQ;;;;;;;;;;;;;;;;iBAgBxC,MAAM,CAAC,WAAW,EAAE;;;;;;;;;IASjC,CAAC;IACH,CAAC;IAE
D;;OAEG;IACK,gBAAgB,CAAC,WAAmB,EAAE,QAAgB,EAAE,MAAc;QAC5E,OAAO,SAAS,MAAM,CAAC,WAAW,EAAE,MAAM,WAAW,EAAE,CAAC;IAC1D,CAAC;IAED;;OAEG;IACH,kBAAkB,CAAC,MAAqB;QACtC,IAAI,CAAC,MAAM,CAAC,eAAe,EAAE,CAAC;YAC5B,OAAO,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,CAAC,cAAc,EAAE,CAAC,CAAC;QACzD,CAAC;QAED,OAAO;YACL;gBACE,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,MAAM,CAAC,YAAY;gBACzB,aAAa,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE;aACrC;YACD;gBACE,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,MAAM,CAAC,cAAc;aAC5B;SACF,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,MAAqB;QACnC,OAAO;YACL,MAAM,EAAE,MAAM,CAAC,YAAY,IAAI,4CAA4C;YAC3E,IAAI,EAAE,MAAM,CAAC,cAAc;SAC5B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,EAAE,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,WAAmB;QAChC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;QACzB,IAAI,CAAC,OAAO,CAAC,qBAAqB,IAAI,WAAW,CAAC;QAClD,IAAI,CAAC,aAAa,EAAE,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,eAAe;QACb,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAC3B,IAAI,CAAC,aAAa,EAAE,CAAC;IACvB,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC;QAChE,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7E,CAAC;IAED;;OAEG;IACH,YAAY;QACV,IAAI,CAAC,OAAO,GAAG;YACb,SAAS,EAAE,CAAC;YACZ,WAAW,EAAE,CAAC;YACd,YAAY,EAAE,CAAC;YACf,qBAAqB,EAAE,CAAC;YACxB,oBAAoB,EAAE,CAAC;YACvB,sBAAsB,EAAE,CAAC;SAC1B,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Caching strategies for different providers
|
|
3
|
+
*/
|
|
4
|
+
import type { CacheStrategy } from './cache-types.js';
|
|
5
|
+
/**
|
|
6
|
+
* Anthropic Prompt Caching Strategy
|
|
7
|
+
* https://docs.anthropic.com/claude/docs/prompt-caching
*
* In the compiled implementation, shouldCache() returns true from 1024 tokens
* upward and formatCacheStructure() yields two text parts: the static prefix
* tagged with cache_control { type: 'ephemeral' }, then the dynamic content.
|
|
8
|
+
*/
|
|
9
|
+
export declare const anthropicStrategy: CacheStrategy;
|
|
10
|
+
/**
|
|
11
|
+
* OpenAI Prompt Caching Strategy (Placeholder)
|
|
12
|
+
* Note: OpenAI's caching is automatic and not configurable via API
*
* In the compiled implementation, shouldCache() returns true from 500 tokens
* upward and formatCacheStructure() returns { system, user } built from the
* static prefix and the dynamic content, with no cache markers.
|
|
13
|
+
*/
|
|
14
|
+
export declare const openaiStrategy: CacheStrategy;
|
|
15
|
+
/**
|
|
16
|
+
* Get strategy by provider name
*
* 'auto' resolves to the Anthropic strategy when the ANTHROPIC_API_KEY
* environment variable is set to a non-empty value, and to the OpenAI
* strategy otherwise. 'anthropic' returns the Anthropic strategy; any other
* value returns the OpenAI strategy.
|
|
17
|
+
*/
|
|
18
|
+
export declare function getStrategy(provider: 'anthropic' | 'openai' | 'auto'): CacheStrategy;
|
|
19
|
+
//# sourceMappingURL=cache-strategies.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache-strategies.d.ts","sourceRoot":"","sources":["../../../src/optimizer/caching/cache-strategies.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAiB,MAAM,kBAAkB,CAAC;AAErE;;;GAGG;AACH,eAAO,MAAM,iBAAiB,EAAE,aAsB/B,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,cAAc,EAAE,aAiB5B,CAAC;AAEF;;GAEG;AACH,wBAAgB,WAAW,CAAC,QAAQ,EAAE,WAAW,GAAG,QAAQ,GAAG,MAAM,GAAG,aAAa,CAUpF"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Caching strategies for different providers
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Prompt-caching strategy for the Anthropic provider.
 * https://docs.anthropic.com/claude/docs/prompt-caching
 *
 * Splits a prompt into a cacheable static part and an uncached dynamic part.
 */
export const anthropicStrategy = {
    name: 'anthropic',

    /**
     * Decide whether a prompt is large enough to receive a cache breakpoint.
     *
     * @param content prompt text (not inspected by this strategy)
     * @param tokens estimated token count of the prompt
     * @returns true when `tokens` is at least 1024
     */
    shouldCache: function shouldCache(content, tokens) {
        // NOTE(review): 1024 is the threshold this strategy enforces; the
        // provider's documented minimum may differ per model - confirm.
        const minimumTokens = 1024;
        return tokens >= minimumTokens;
    },

    /**
     * Build the two-part content array for a request.
     *
     * @param cached object carrying `staticPrefix` and `dynamicContent`
     * @returns array of two text parts; only the first carries `cache_control`
     */
    formatCacheStructure: function formatCacheStructure(cached) {
        const cacheablePart = {
            type: 'text',
            text: cached.staticPrefix,
            cache_control: { type: 'ephemeral' }
        };
        const volatilePart = {
            type: 'text',
            text: cached.dynamicContent
        };
        return [cacheablePart, volatilePart];
    }
};
|
|
29
|
+
/**
 * Prompt-caching strategy for the OpenAI provider (placeholder).
 *
 * OpenAI applies caching automatically, so this strategy only arranges the
 * prompt so that the reusable part comes first.
 */
export const openaiStrategy = {
    name: 'openai',

    /**
     * Decide whether a prompt should be split into static and dynamic parts.
     *
     * @param content prompt text (not inspected by this strategy)
     * @param tokens estimated token count of the prompt
     * @returns true when `tokens` is at least 500
     */
    shouldCache: function shouldCache(content, tokens) {
        // Caching itself is automatic; the threshold only gates restructuring.
        const minimumTokens = 500;
        return tokens >= minimumTokens;
    },

    /**
     * Build the structured prompt for a request.
     *
     * @param cached object carrying `staticPrefix` and `dynamicContent`
     * @returns `{ system, user }`; no explicit cache markers are emitted
     */
    formatCacheStructure: function formatCacheStructure(cached) {
        const system = cached.staticPrefix;
        const user = cached.dynamicContent;
        return { system, user };
    }
};
|
|
49
|
+
/**
 * Resolve a provider name to its caching strategy.
 *
 * @param provider 'anthropic', 'openai' or 'auto'
 * @returns the Anthropic strategy for 'anthropic', or for 'auto' when the
 *          ANTHROPIC_API_KEY environment variable is set to a non-empty
 *          value; the OpenAI strategy in every other case
 */
export function getStrategy(provider) {
    let useAnthropic;
    if (provider === 'auto') {
        // Auto-detection: the presence of an Anthropic key decides the provider.
        useAnthropic = Boolean(process.env.ANTHROPIC_API_KEY);
    }
    else {
        useAnthropic = provider === 'anthropic';
    }
    if (useAnthropic) {
        return anthropicStrategy;
    }
    return openaiStrategy;
}
|
|
62
|
+
//# sourceMappingURL=cache-strategies.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache-strategies.js","sourceRoot":"","sources":["../../../src/optimizer/caching/cache-strategies.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH;;;GAGG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAkB;IAC9C,IAAI,EAAE,WAAW;IAEjB,WAAW,CAAC,OAAe,EAAE,MAAc;QACzC,+DAA+D;QAC/D,gDAAgD;QAChD,OAAO,MAAM,IAAI,IAAI,CAAC;IACxB,CAAC;IAED,oBAAoB,CAAC,MAAqB;QACxC,OAAO;YACL;gBACE,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,MAAM,CAAC,YAAY;gBACzB,aAAa,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE;aACrC;YACD;gBACE,IAAI,EAAE,MAAM;gBACZ,IAAI,EAAE,MAAM,CAAC,cAAc;aAC5B;SACF,CAAC;IACJ,CAAC;CACF,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,cAAc,GAAkB;IAC3C,IAAI,EAAE,QAAQ;IAEd,WAAW,CAAC,OAAe,EAAE,MAAc;QACzC,gCAAgC;QAChC,8CAA8C;QAC9C,OAAO,MAAM,IAAI,GAAG,CAAC;IACvB,CAAC;IAED,oBAAoB,CAAC,MAAqB;QACxC,6CAA6C;QAC7C,iCAAiC;QACjC,OAAO;YACL,MAAM,EAAE,MAAM,CAAC,YAAY;YAC3B,IAAI,EAAE,MAAM,CAAC,cAAc;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,QAAyC;IACnE,IAAI,QAAQ,KAAK,MAAM,EAAE,CAAC;QACxB,mCAAmC;QACnC,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;YAClC,OAAO,iBAAiB,CAAC;QAC3B,CAAC;QACD,OAAO,cAAc,CAAC;IACxB,CAAC;IAED,OAAO,QAAQ,KAAK,WAAW,CAAC,CAAC,CAAC,iBAAiB,CAAC,CAAC,CAAC,cAAc,CAAC;AACvE,CAAC"}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions for prompt caching integration
|
|
3
|
+
*/
|
|
4
|
+
/** Options for the prompt-caching integration. */
export interface CacheConfig {
|
|
5
|
+
/** NOTE(review): presumably the master on/off switch; the consumer is not visible here - confirm. */
enabled: boolean;
|
|
6
|
+
/** Strategy selector; 'auto' is resolved by getStrategy() from the ANTHROPIC_API_KEY environment variable. */
provider: 'anthropic' | 'openai' | 'auto';
|
|
7
|
+
/** Optional cache lifetime label; the type admits only '5min' and '1hour'. */
ttl?: '5min' | '1hour';
|
|
8
|
+
/** NOTE(review): presumably enables caching of the static prompt prefix - confirm against the optimizer. */
cacheStaticPrompts: boolean;
|
|
9
|
+
/** NOTE(review): the bundled strategies hard-code 1024 / 500 token thresholds; how this option relates to them is not visible here. */
minCacheableTokens?: number;
|
|
10
|
+
}
|
|
11
|
+
/** A prompt split into a reusable prefix and a per-request remainder; input to CacheStrategy.formatCacheStructure. */
export interface CachedContent {
|
|
12
|
+
/** Reusable leading part; the Anthropic strategy attaches cache_control to it, the OpenAI strategy emits it as `system`. */
staticPrefix: string;
|
|
13
|
+
/** Per-request part; emitted after the static prefix without cache markers (`user` for OpenAI). */
dynamicContent: string;
|
|
14
|
+
/** NOTE(review): presumably true when the static prefix should be cached - confirm against the optimizer. */
cacheBreakpoint: boolean;
|
|
15
|
+
/** Optional descriptive details about the cache entry. */
cacheMetadata?: CacheMetadata;
|
|
16
|
+
}
|
|
17
|
+
/** Descriptive details attached to a CachedContent value. */
export interface CacheMetadata {
|
|
18
|
+
/** Concrete provider; unlike CacheConfig.provider, 'auto' is not allowed here. */
provider: 'anthropic' | 'openai';
|
|
19
|
+
/** NOTE(review): unit is not visible here (presumably tokens) - confirm. */
estimatedCacheSize: number;
|
|
20
|
+
/** Optional identifier for the cache entry. */
cacheKey?: string;
|
|
21
|
+
/** Optional lifetime label; typed as a free-form string here, whereas CacheConfig.ttl is '5min' | '1hour'. */
ttl?: string;
|
|
22
|
+
}
|
|
23
|
+
/**
 * Counters describing cache usage.
 * NOTE(review): the code that maintains these values is not visible here; the
 * per-field notes below are assumptions to confirm.
 */
export interface CacheMetrics {
|
|
24
|
+
/** Number of recorded cache hits. */
cacheHits: number;
|
|
25
|
+
/** Number of recorded cache misses. */
cacheMisses: number;
|
|
26
|
+
/** Presumably hits / (hits + misses) - confirm. */
cacheHitRate: number;
|
|
27
|
+
/** Presumably accumulated tokens saved by cache hits - confirm unit. */
estimatedCacheSavings: number;
|
|
28
|
+
/** Presumably accumulated size of content marked cacheable - confirm unit. */
totalCacheableTokens: number;
|
|
29
|
+
/** Presumably mean number of reuses per cached entry - confirm. */
averageCacheReuseCount: number;
|
|
30
|
+
}
|
|
31
|
+
/** Provider-specific caching behaviour; implemented by anthropicStrategy and openaiStrategy. */
export interface CacheStrategy {
|
|
32
|
+
/** Provider identifier ('anthropic' and 'openai' in the bundled strategies). */
name: string;
|
|
33
|
+
/** Returns whether content of the given token count should be cached; both bundled strategies ignore `content` and compare `tokens` to a fixed threshold. */
shouldCache: (content: string, tokens: number) => boolean;
|
|
34
|
+
/** Converts split content into the provider's request shape; typed `any` because the shape differs (array of text parts for Anthropic, { system, user } for OpenAI). */
formatCacheStructure: (content: CachedContent) => any;
|
|
35
|
+
}
|
|
36
|
+
//# sourceMappingURL=cache-types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache-types.d.ts","sourceRoot":"","sources":["../../../src/optimizer/caching/cache-types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,OAAO,CAAC;IACjB,QAAQ,EAAE,WAAW,GAAG,QAAQ,GAAG,MAAM,CAAC;IAC1C,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC;IACvB,kBAAkB,EAAE,OAAO,CAAC;IAC5B,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,aAAa;IAC5B,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,eAAe,EAAE,OAAO,CAAC;IACzB,aAAa,CAAC,EAAE,aAAa,CAAC;CAC/B;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,EAAE,WAAW,GAAG,QAAQ,CAAC;IACjC,kBAAkB,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,qBAAqB,EAAE,MAAM,CAAC;IAC9B,oBAAoB,EAAE,MAAM,CAAC;IAC7B,sBAAsB,EAAE,MAAM,CAAC;CAChC;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,KAAK,OAAO,CAAC;IAC1D,oBAAoB,EAAE,CAAC,OAAO,EAAE,aAAa,KAAK,GAAG,CAAC;CACvD"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache-types.js","sourceRoot":"","sources":["../../../src/optimizer/caching/cache-types.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/optimizer/caching/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,kBAAkB,CAAC;AACjC,cAAc,uBAAuB,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/optimizer/caching/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,kBAAkB,CAAC;AACjC,cAAc,uBAAuB,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/optimizer/multilingual/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,wBAAwB,CAAC;AACvC,cAAc,wBAAwB,CAAC;AACvC,cAAc,wBAAwB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/optimizer/multilingual/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,wBAAwB,CAAC;AACvC,cAAc,wBAAwB,CAAC;AACvC,cAAc,wBAAwB,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language detection for token estimation
|
|
3
|
+
*/
|
|
4
|
+
import { type LanguageProfile } from './language-profiles.js';
|
|
5
|
+
/** Outcome of LanguageDetector.detect(). */
export interface LanguageDetectionResult {
|
|
6
|
+
/** Best-scoring profile, or the first entry of LANGUAGE_PROFILES (English) when detection falls back. */
language: LanguageProfile;
|
|
7
|
+
/** 0 for empty input, 0.5 for the low-score fallback, otherwise the winning score (capped at the profile's own confidence). */
confidence: number;
|
|
8
|
+
/** How many of the chosen profile's patterns matched the sample; 0 on either fallback path. */
detectedPatterns: number;
|
|
9
|
+
}
|
|
10
|
+
/** Scores text against LANGUAGE_PROFILES to pick a language and scale token estimates. */
export declare class LanguageDetector {
|
|
11
|
+
/** Maximum number of leading characters inspected per call. */
private sampleSize;
|
|
12
|
+
/** @param sampleSize number of leading characters to sample; the implementation defaults to 500. */
constructor(sampleSize?: number);
|
|
13
|
+
/**
|
|
14
|
+
* Detect language from text content
* Scores every profile on the leading sample and returns the best one.
* Empty or whitespace-only text yields the first profile with confidence 0;
* a best score below 0.1 yields the first profile with confidence 0.5.
|
|
15
|
+
*/
|
|
16
|
+
detect(text: string): LanguageDetectionResult;
|
|
17
|
+
/**
|
|
18
|
+
* Detect if text is mixed-language
* Returns every profile with at least one pattern matching the sample, in
* LANGUAGE_PROFILES order. When both 'zh' and 'ja' match but the sample has
* no Hiragana or Katakana, 'ja' is removed from the result.
|
|
19
|
+
*/
|
|
20
|
+
detectMixed(text: string): LanguageProfile[];
|
|
21
|
+
/**
|
|
22
|
+
* Estimate token multiplier for mixed-language content
* Returns 1.0 for an empty list, the sole profile's multiplier for one
* entry, and otherwise 0.4 * average + 0.6 * maximum of the multipliers.
|
|
23
|
+
*/
|
|
24
|
+
estimateMultiplierForMixed(languages: LanguageProfile[]): number;
|
|
25
|
+
/**
|
|
26
|
+
* Estimate tokens with language awareness
* Returns baseTokens unchanged when detection confidence is below 0.3,
* otherwise baseTokens times the detected multiplier, rounded up.
|
|
27
|
+
*/
|
|
28
|
+
estimateTokens(text: string, baseTokens: number): number;
|
|
29
|
+
/**
|
|
30
|
+
* Estimate tokens for mixed-language content
* Returns baseTokens times estimateMultiplierForMixed(detectMixed(text)), rounded up.
|
|
31
|
+
*/
|
|
32
|
+
estimateTokensMixed(text: string, baseTokens: number): number;
|
|
33
|
+
/**
|
|
34
|
+
* Get detailed language breakdown
* Combines detect() and detectMixed(); isMixed is true when more than one
* profile was detected, and estimatedMultiplier is the mixed multiplier in
* that case or the primary language's multiplier otherwise.
|
|
35
|
+
*/
|
|
36
|
+
analyze(text: string): {
|
|
37
|
+
primary: LanguageDetectionResult;
|
|
38
|
+
all: LanguageProfile[];
|
|
39
|
+
estimatedMultiplier: number;
|
|
40
|
+
isMixed: boolean;
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
//# sourceMappingURL=language-detector.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"language-detector.d.ts","sourceRoot":"","sources":["../../../src/optimizer/multilingual/language-detector.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAqB,KAAK,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAEjF,MAAM,WAAW,uBAAuB;IACtC,QAAQ,EAAE,eAAe,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,UAAU,CAAS;gBAEf,UAAU,GAAE,MAAY;IAIpC;;OAEG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,uBAAuB;IA+E7C;;OAEG;IACH,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,eAAe,EAAE;IAgC5C;;OAEG;IACH,0BAA0B,CAAC,SAAS,EAAE,eAAe,EAAE,GAAG,MAAM;IAahE;;OAEG;IACH,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,MAAM;IAYxD;;OAEG;IACH,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,MAAM;IAM7D;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG;QACrB,OAAO,EAAE,uBAAuB,CAAC;QACjC,GAAG,EAAE,eAAe,EAAE,CAAC;QACvB,mBAAmB,EAAE,MAAM,CAAC;QAC5B,OAAO,EAAE,OAAO,CAAC;KAClB;CAeF"}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language detection for token estimation
|
|
3
|
+
*/
|
|
4
|
+
import { LANGUAGE_PROFILES } from './language-profiles.js';
|
|
5
|
+
/**
 * Count the non-overlapping matches of `pattern` in `sample`.
 *
 * The pattern's own flags (notably `i`) are kept and `g` is added when it is
 * missing. Passing a RegExp together with a flags string to `new RegExp`
 * REPLACES the original flags, which would silently turn every
 * case-insensitive profile pattern into a case-sensitive one.
 *
 * @param {string} sample text to scan
 * @param {RegExp|string} pattern detection pattern (never mutated)
 * @returns {number} number of matches, 0 when there are none
 */
function countPatternMatches(sample, pattern) {
    const source = typeof pattern === 'string' ? pattern : pattern.source;
    const ownFlags = typeof pattern === 'string' ? '' : pattern.flags;
    const flags = ownFlags.includes('g') ? ownFlags : `${ownFlags}g`;
    const matches = sample.match(new RegExp(source, flags));
    return matches ? matches.length : 0;
}

/**
 * Detects the language of a text by scoring it against LANGUAGE_PROFILES and
 * scales token estimates by the detected profile's multiplier.
 *
 * The first entry of LANGUAGE_PROFILES is used as the default/fallback.
 */
export class LanguageDetector {
    sampleSize;

    /**
     * @param {number} [sampleSize=500] number of leading characters inspected per call
     */
    constructor(sampleSize = 500) {
        this.sampleSize = sampleSize;
    }

    /**
     * Detect language from text content.
     *
     * @param {string} text text to classify
     * @returns {{language: object, confidence: number, detectedPatterns: number}}
     *   the best-scoring profile; the first profile with confidence 0 for
     *   empty/whitespace-only text, or with confidence 0.5 when the best
     *   score is below 0.1
     */
    detect(text) {
        if (!text || text.trim().length === 0) {
            return {
                language: LANGUAGE_PROFILES[0], // Default to English
                confidence: 0,
                detectedPatterns: 0
            };
        }
        // Use first N characters for detection (performance optimization)
        const sample = text.slice(0, this.sampleSize);
        const sampleLength = Math.max(sample.length, 1);
        // Score each language profile
        const scores = LANGUAGE_PROFILES.map((profile) => {
            let matchedPatterns = 0;
            let totalMatches = 0;
            for (const pattern of profile.detectionPatterns) {
                const count = countPatternMatches(sample, pattern);
                if (count > 0) {
                    matchedPatterns++;
                    totalMatches += count;
                }
            }
            // Share of the profile's patterns that matched at least once.
            const patternScore = matchedPatterns / profile.detectionPatterns.length;
            // Density: expect ~1 match per 5 chars; character-based scripts
            // (CJK, Arabic, ...) can exceed that, so cap at 1.
            const densityScore = Math.min(totalMatches / (sampleLength / 5), 1.0);
            // Adaptive weighting: a very high density is a strong signal even
            // if only one pattern matched, so trust it more.
            const patternWeight = densityScore > 0.9 ? 0.4 : 0.7;
            const densityWeight = 1.0 - patternWeight;
            // Boost confidence for high-confidence scenarios
            let boost = 1.0;
            if (patternScore === 1.0) {
                boost = 1.1; // All patterns matched
            }
            if (densityScore > 0.95) {
                boost = Math.max(boost, 1.05); // Very high density
            }
            const rawConfidence = (patternScore * patternWeight + densityScore * densityWeight) * boost;
            // Cap at 1 before applying the profile's own confidence weight.
            const confidence = Math.min(rawConfidence, 1.0) * profile.confidence;
            return {
                profile,
                confidence,
                matchedPatterns
            };
        });
        // Find the highest scoring language (later entries win ties)
        const best = scores.reduce((a, b) => a.confidence > b.confidence ? a : b);
        // If confidence is too low, default to English
        if (best.confidence < 0.1) {
            return {
                language: LANGUAGE_PROFILES[0], // English
                confidence: 0.5, // Low confidence fallback
                detectedPatterns: 0
            };
        }
        return {
            language: best.profile,
            confidence: best.confidence,
            detectedPatterns: best.matchedPatterns
        };
    }

    /**
     * Detect every language present in the text.
     *
     * @param {string} text text to inspect
     * @returns {object[]} profiles with at least one pattern matching the
     *   sample, in LANGUAGE_PROFILES order; empty for empty/missing text
     */
    detectMixed(text) {
        // Same guard as detect(), so analyze() cannot throw on missing text.
        if (!text) {
            return [];
        }
        const sample = text.slice(0, this.sampleSize);
        // String.prototype.search ignores the `g` flag and lastIndex, so shared
        // pattern objects are never left in a stateful position.
        const detected = LANGUAGE_PROFILES.filter((profile) => profile.detectionPatterns.some((pattern) => sample.search(pattern) !== -1));
        // Handle CJK overlap: Japanese Kanji overlaps with Chinese.
        const hasChinese = detected.some((p) => p.code === 'zh');
        const hasJapanese = detected.some((p) => p.code === 'ja');
        if (hasChinese && hasJapanese) {
            // Hiragana or Katakana are Japanese-specific.
            const hasHiragana = /[\u3040-\u309f]/.test(sample);
            const hasKatakana = /[\u30a0-\u30ff]/.test(sample);
            if (!hasHiragana && !hasKatakana) {
                // No Japanese-specific characters, remove Japanese
                return detected.filter((p) => p.code !== 'ja');
            }
        }
        return detected;
    }

    /**
     * Estimate token multiplier for mixed-language content.
     *
     * @param {object[]} languages detected profiles
     * @returns {number} 1.0 for none, the sole multiplier for one entry,
     *   otherwise 0.4 * average + 0.6 * maximum (conservative weighting)
     */
    estimateMultiplierForMixed(languages) {
        if (languages.length === 0) {
            return 1.0;
        }
        if (languages.length === 1) {
            return languages[0].tokenMultiplier;
        }
        const multipliers = languages.map((l) => l.tokenMultiplier);
        const max = Math.max(...multipliers);
        const avg = multipliers.reduce((a, b) => a + b) / multipliers.length;
        // Weight toward max to be conservative
        return avg * 0.4 + max * 0.6;
    }

    /**
     * Estimate tokens with language awareness.
     *
     * @param {string} text text the estimate is for
     * @param {number} baseTokens language-agnostic token estimate
     * @returns {number} baseTokens when confidence is below 0.3, otherwise
     *   baseTokens times the detected multiplier, rounded up
     */
    estimateTokens(text, baseTokens) {
        const detection = this.detect(text);
        // If very low confidence, use base tokens
        if (detection.confidence < 0.3) {
            return baseTokens;
        }
        return Math.ceil(baseTokens * detection.language.tokenMultiplier);
    }

    /**
     * Estimate tokens for mixed-language content.
     *
     * @param {string} text text the estimate is for
     * @param {number} baseTokens language-agnostic token estimate
     * @returns {number} baseTokens times the mixed multiplier, rounded up
     */
    estimateTokensMixed(text, baseTokens) {
        const languages = this.detectMixed(text);
        const multiplier = this.estimateMultiplierForMixed(languages);
        return Math.ceil(baseTokens * multiplier);
    }

    /**
     * Get detailed language breakdown.
     *
     * @param {string} text text to analyze
     * @returns {{primary: object, all: object[], estimatedMultiplier: number, isMixed: boolean}}
     *   the detect() result, the detectMixed() list, the multiplier to apply,
     *   and whether more than one language was found
     */
    analyze(text) {
        const primary = this.detect(text);
        const all = this.detectMixed(text);
        const isMixed = all.length > 1;
        const estimatedMultiplier = isMixed
            ? this.estimateMultiplierForMixed(all)
            : primary.language.tokenMultiplier;
        return {
            primary,
            all,
            estimatedMultiplier,
            isMixed
        };
    }
}
|
|
161
|
+
//# sourceMappingURL=language-detector.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"language-detector.js","sourceRoot":"","sources":["../../../src/optimizer/multilingual/language-detector.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,iBAAiB,EAAwB,MAAM,wBAAwB,CAAC;AAQjF,MAAM,OAAO,gBAAgB;IACnB,UAAU,CAAS;IAE3B,YAAY,aAAqB,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,IAAY;QACjB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtC,OAAO;gBACL,QAAQ,EAAE,iBAAiB,CAAC,CAAC,CAAC,EAAE,qBAAqB;gBACrD,UAAU,EAAE,CAAC;gBACb,gBAAgB,EAAE,CAAC;aACpB,CAAC;QACJ,CAAC;QAED,kEAAkE;QAClE,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAE9C,8BAA8B;QAC9B,MAAM,MAAM,GAAG,iBAAiB,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE;YAC7C,IAAI,eAAe,GAAG,CAAC,CAAC;YACxB,IAAI,YAAY,GAAG,CAAC,CAAC;YAErB,KAAK,MAAM,OAAO,IAAI,OAAO,CAAC,iBAAiB,EAAE,CAAC;gBAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,CAAC;gBACvD,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAClC,eAAe,EAAE,CAAC;oBAClB,YAAY,IAAI,OAAO,CAAC,MAAM,CAAC;gBACjC,CAAC;YACH,CAAC;YAED,iCAAiC;YACjC,gCAAgC;YAChC,+DAA+D;YAC/D,mCAAmC;YACnC,MAAM,YAAY,GAAG,eAAe,GAAG,OAAO,CAAC,iBAAiB,CAAC,MAAM,CAAC;YAExE,4DAA4D;YAC5D,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAChD,8EAA8E;YAC9E,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,YAAY,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,8BAA8B;YACrG,MAAM,YAAY,GAAG,YAAY,CAAC;YAElC,6DAA6D;YAC7D,0EAA0E;YAC1E,MAAM,aAAa,GAAG,YAAY,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;YACrD,MAAM,aAAa,GAAG,GAAG,GAAG,aAAa,CAAC;YAE1C,iDAAiD;YACjD,IAAI,KAAK,GAAG,GAAG,CAAC;YAChB,IAAI,YAAY,KAAK,GAAG;gBAAE,KAAK,GAAG,GAAG,CAAC,CAAC,uBAAuB;YAC9D,IAAI,YAAY,GAAG,IAAI;gBAAE,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,oBAAoB;YAE5E,6BAA6B;YAC7B,MAAM,aAAa,GAAG,CAAC,YAAY,GAAG,aAAa,GAAG,YAAY,GAAG,aAAa,CAAC,GAAG,KAAK,CAAC;YAC5F,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,aAAa,EAAE,GAAG,CAAC,GAAG,OAAO,CAAC,UAAU,CAAC;YAErE,OAAO;gBACL,OAAO;gBACP,UAAU;gBACV,eAAe;aAChB,CAAC;QACJ,
CAAC,CAAC,CAAC;QAEH,oCAAoC;QACpC,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAClC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CACpC,CAAC;QAEF,+CAA+C;QAC/C,IAAI,IAAI,CAAC,UAAU,GAAG,GAAG,EAAE,CAAC;YAC1B,OAAO;gBACL,QAAQ,EAAE,iBAAiB,CAAC,CAAC,CAAC,EAAE,UAAU;gBAC1C,UAAU,EAAE,GAAG,EAAE,0BAA0B;gBAC3C,gBAAgB,EAAE,CAAC;aACpB,CAAC;QACJ,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,IAAI,CAAC,OAAO;YACtB,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,gBAAgB,EAAE,IAAI,CAAC,eAAe;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,WAAW,CAAC,IAAY;QACtB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAC9C,MAAM,QAAQ,GAAsB,EAAE,CAAC;QAEvC,KAAK,MAAM,OAAO,IAAI,iBAAiB,EAAE,CAAC;YACxC,KAAK,MAAM,OAAO,IAAI,OAAO,CAAC,iBAAiB,EAAE,CAAC;gBAChD,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;oBACzB,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACvB,MAAM;gBACR,CAAC;YACH,CAAC;QACH,CAAC;QAED,2DAA2D;QAC3D,oFAAoF;QACpF,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;QACvD,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;QAExD,IAAI,UAAU,IAAI,WAAW,EAAE,CAAC;YAC9B,qDAAqD;YACrD,MAAM,WAAW,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACnD,MAAM,WAAW,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAEnD,IAAI,CAAC,WAAW,IAAI,CAAC,WAAW,EAAE,CAAC;gBACjC,mDAAmD;gBACnD,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;YAC/C,CAAC;QACH,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACH,0BAA0B,CAAC,SAA4B;QACrD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,GAAG,CAAC;QACvC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC;QAEhE,6DAA6D;QAC7D,MAAM,WAAW,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC;QAC1D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC,CAAC;QACrC,MAAM,GAAG,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC,MAAM,CAAC;QAErE,uCAAuC;QACvC,OAAO,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,CAAC;IAC/B,CAAC;IAED;;OAE
G;IACH,cAAc,CAAC,IAAY,EAAE,UAAkB;QAC7C,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAEpC,0CAA0C;QAC1C,IAAI,SAAS,CAAC,UAAU,GAAG,GAAG,EAAE,CAAC;YAC/B,OAAO,UAAU,CAAC;QACpB,CAAC;QAED,4BAA4B;QAC5B,OAAO,IAAI,CAAC,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;IACpE,CAAC;IAED;;OAEG;IACH,mBAAmB,CAAC,IAAY,EAAE,UAAkB;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,UAAU,GAAG,IAAI,CAAC,0BAA0B,CAAC,SAAS,CAAC,CAAC;QAC9D,OAAO,IAAI,CAAC,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,IAAY;QAMlB,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAClC,MAAM,GAAG,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,OAAO,GAAG,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC;QAC/B,MAAM,mBAAmB,GAAG,OAAO;YACjC,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,GAAG,CAAC;YACtC,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC;QAErC,OAAO;YACL,OAAO;YACP,GAAG;YACH,mBAAmB;YACnB,OAAO;SACR,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language profiles with token multipliers
|
|
3
|
+
* Based on research: different languages require different numbers of tokens
|
|
4
|
+
*/
|
|
5
|
+
/** Description of one language used for detection and token scaling. */
export interface LanguageProfile {
|
|
6
|
+
/** Short language identifier, e.g. 'en', 'zh', 'ja'. */
code: string;
|
|
7
|
+
/** English name of the language. */
name: string;
|
|
8
|
+
/** Name of the language written in that language. */
nativeName: string;
|
|
9
|
+
/** Factor applied to a base token estimate for this language (English is 1.0). */
tokenMultiplier: number;
|
|
10
|
+
/** Regular expressions tested against a text sample to recognise the language. */
detectionPatterns: RegExp[];
|
|
11
|
+
/** Weight multiplied into the detection score for this profile (1.0 for English, lower for the others). */
confidence: number;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Token multipliers based on research and testing
|
|
15
|
+
*
|
|
16
|
+
* Sources:
|
|
17
|
+
* - Anthropic: https://docs.anthropic.com/claude/docs/models-overview
|
|
18
|
+
* - OpenAI tokenizer analysis
|
|
19
|
+
* - Community research on multilingual token efficiency
*
* Order matters: LanguageDetector uses the first entry (English) as its
* default and low-confidence fallback.
|
|
20
|
+
*/
|
|
21
|
+
export declare const LANGUAGE_PROFILES: LanguageProfile[];
|
|
22
|
+
/**
|
|
23
|
+
* Get language profile by code
* Returns null rather than throwing when no profile is returned.
* NOTE(review): presumably matches `code` against LanguageProfile.code; the
* implementation is not visible here - confirm.
|
|
24
|
+
*/
|
|
25
|
+
export declare function getLanguageProfile(code: string): LanguageProfile | null;
|
|
26
|
+
/**
|
|
27
|
+
* Get all supported language codes
* NOTE(review): presumably the `code` of every LANGUAGE_PROFILES entry; the
* implementation is not visible here - confirm.
|
|
28
|
+
*/
|
|
29
|
+
export declare function getSupportedLanguages(): string[];
|
|
30
|
+
/**
|
|
31
|
+
* Get token multiplier for a language code
* Always returns a number.
* NOTE(review): the value used for an unknown code is not visible here
* (presumably a default such as 1.0) - confirm.
|
|
32
|
+
*/
|
|
33
|
+
export declare function getTokenMultiplier(code: string): number;
|
|
34
|
+
//# sourceMappingURL=language-profiles.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"language-profiles.d.ts","sourceRoot":"","sources":["../../../src/optimizer/multilingual/language-profiles.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,CAAC;IACxB,iBAAiB,EAAE,MAAM,EAAE,CAAC;IAC5B,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;;;;;;GAOG;AACH,eAAO,MAAM,iBAAiB,EAAE,eAAe,EAmK9C,CAAC;AAEF;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI,CAEvE;AAED;;GAEG;AACH,wBAAgB,qBAAqB,IAAI,MAAM,EAAE,CAEhD;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAGvD"}
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language profiles with token multipliers
|
|
3
|
+
* Based on research: different languages require different numbers of tokens
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Per-language profiles: a language code, display names, a token multiplier
 * (English is the 1.0 baseline), regular expressions that hint at the
 * language, and a confidence value. How `detectionPatterns` and `confidence`
 * are combined is up to the consumer of this table and is not decided here.
 *
 * Token multipliers based on research and testing
 *
 * Sources:
 * - Anthropic: https://docs.anthropic.com/claude/docs/models-overview
 * - OpenAI tokenizer analysis
 * - Community research on multilingual token efficiency
 */

// JavaScript's `\b` only treats ASCII [A-Za-z0-9_] as word characters, so a
// pattern such as /\b(и|в)\b/ or /\b(và|là)\b/ can never match a normal
// Russian or Vietnamese word. The word lists that contain non-ASCII letters
// therefore use explicit, script-aware boundaries built from these classes.
// They are written without the `u` flag on purpose so the patterns keep
// working even if a caller re-creates them with different flags.
const LATIN_WORD_CHAR = /[\w\u00c0-\u024f\u0300-\u036f\u1e00-\u1eff]/.source;
const CYRILLIC_WORD_CHAR = /[\w\u0400-\u04ff]/.source;

/**
 * Build a case-insensitive pattern that matches any of `words` only when it
 * is not directly preceded or followed by a character from `wordChar`.
 *
 * @param {string} words - Alternatives separated by `|` (regex syntax).
 * @param {string} wordChar - Source of a character class describing the
 *   letters of the script, e.g. LATIN_WORD_CHAR.
 * @returns {RegExp} Pattern with one capture group holding the matched word.
 */
function wholeWordPattern(words, wordChar) {
    return new RegExp(`(?<!${wordChar})(${words})(?!${wordChar})`, 'i');
}

export const LANGUAGE_PROFILES = [
    {
        code: 'en',
        name: 'English',
        nativeName: 'English',
        tokenMultiplier: 1.0,
        detectionPatterns: [
            /[a-zA-Z]{2,}/, // At least 2 consecutive English letters
            /\b(the|is|are|was|were|have|has|had|do|does|did|a|an|and|or|but|in|on|at|to|for|of|with|be|this|that|from|as|by|they|we|you|he|she|it|not|can|will|would|could|should|may|might|must|hello|world)\b/i
        ],
        confidence: 1.0
    },
    {
        code: 'es',
        name: 'Spanish',
        nativeName: 'Español',
        tokenMultiplier: 1.7,
        detectionPatterns: [
            /[áéíóúüñ¿¡]/i,
            /\b(el|la|los|las|un|una|de|en|que|y|es|por)\b/i
        ],
        confidence: 0.9
    },
    {
        code: 'fr',
        name: 'French',
        nativeName: 'Français',
        tokenMultiplier: 1.8,
        detectionPatterns: [
            /[àâäéèêëïîôùûüÿç]/i,
            // Contains "à", which `\b` cannot delimit.
            wholeWordPattern('le|la|les|un|une|de|et|est|à|en|que', LATIN_WORD_CHAR)
        ],
        confidence: 0.9
    },
    {
        code: 'de',
        name: 'German',
        nativeName: 'Deutsch',
        tokenMultiplier: 1.6,
        detectionPatterns: [
            /[äöüß]/i,
            /\b(der|die|das|den|dem|des|ein|eine|und|ist|in)\b/i
        ],
        confidence: 0.9
    },
    {
        code: 'zh',
        name: 'Chinese',
        nativeName: '中文',
        tokenMultiplier: 2.0,
        detectionPatterns: [
            /[\u4e00-\u9fff]/,
            /[\u3400-\u4dbf]/ // CJK Extension A
        ],
        confidence: 0.95
    },
    {
        code: 'ja',
        name: 'Japanese',
        nativeName: '日本語',
        tokenMultiplier: 2.5,
        detectionPatterns: [
            /[\u3040-\u309f]/, // Hiragana
            /[\u30a0-\u30ff]/, // Katakana
            /[\u4e00-\u9fff]/ // Kanji (overlaps with Chinese)
        ],
        confidence: 0.9
    },
    {
        code: 'ko',
        name: 'Korean',
        nativeName: '한국어',
        tokenMultiplier: 2.3,
        detectionPatterns: [
            /[\uac00-\ud7af]/, // Hangul Syllables
            /[\u1100-\u11ff]/ // Hangul Jamo
        ],
        confidence: 0.9
    },
    {
        code: 'ar',
        name: 'Arabic',
        nativeName: 'العربية',
        tokenMultiplier: 3.0,
        detectionPatterns: [
            /[\u0600-\u06ff]/, // Arabic
            /[\u0750-\u077f]/ // Arabic Supplement
        ],
        confidence: 0.85
    },
    {
        code: 'ta',
        name: 'Tamil',
        nativeName: 'தமிழ்',
        tokenMultiplier: 4.5,
        detectionPatterns: [
            /[\u0b80-\u0bff]/ // Tamil
        ],
        confidence: 0.8
    },
    {
        code: 'hi',
        name: 'Hindi',
        nativeName: 'हिन्दी',
        tokenMultiplier: 3.5,
        detectionPatterns: [
            /[\u0900-\u097f]/ // Devanagari
        ],
        confidence: 0.85
    },
    {
        code: 'ru',
        name: 'Russian',
        nativeName: 'Русский',
        tokenMultiplier: 1.9,
        detectionPatterns: [
            /[\u0400-\u04ff]/, // Cyrillic
            // Every word is Cyrillic, so `\b` would never match here.
            wholeWordPattern('и|в|не|на|я|что|он|с|как|а', CYRILLIC_WORD_CHAR)
        ],
        confidence: 0.9
    },
    {
        code: 'pt',
        name: 'Portuguese',
        nativeName: 'Português',
        tokenMultiplier: 1.7,
        detectionPatterns: [
            /[àáâãäèéêëìíîïòóôõöùúûü]/i,
            // Contains "é"; also stops "a"/"o" matching inside words like "ação".
            wholeWordPattern('o|a|os|as|de|em|que|e|é|do|da', LATIN_WORD_CHAR)
        ],
        confidence: 0.9
    },
    {
        code: 'th',
        name: 'Thai',
        nativeName: 'ไทย',
        tokenMultiplier: 4.0,
        detectionPatterns: [
            /[\u0e00-\u0e7f]/ // Thai
        ],
        confidence: 0.8
    },
    {
        code: 'vi',
        name: 'Vietnamese',
        nativeName: 'Tiếng Việt',
        tokenMultiplier: 1.5,
        detectionPatterns: [
            /[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]/i,
            // Most of these words start or end with an accented letter.
            wholeWordPattern('và|của|có|là|này|được|trong|cho|người|từ|để|với|một|những|các|không|khi|trên', LATIN_WORD_CHAR)
        ],
        confidence: 0.9
    },
    {
        code: 'id',
        name: 'Indonesian',
        nativeName: 'Bahasa Indonesia',
        tokenMultiplier: 1.4,
        detectionPatterns: [
            /\b(yang|dan|di|ke|dari|ini|itu|untuk|dengan|pada|adalah|tidak|ada|atau|akan|juga|oleh|dalam)\b/i
        ],
        confidence: 0.85
    }
];
|
|
177
|
+
/**
 * Look up one entry of LANGUAGE_PROFILES by its language code.
 *
 * @param {string} code - Value compared with strict equality against each
 *   profile's `code`, so the comparison is case-sensitive.
 * @returns {object|null} The first profile whose `code` equals the argument,
 *   or `null` when no profile matches.
 */
export function getLanguageProfile(code) {
    for (const profile of LANGUAGE_PROFILES) {
        if (profile.code === code) {
            return profile;
        }
    }
    return null;
}
|
|
183
|
+
/**
 * List the `code` of every entry in LANGUAGE_PROFILES.
 *
 * @returns {string[]} A new array of codes, in the same order as the profiles.
 */
export function getSupportedLanguages() {
    const codes = [];
    for (const profile of LANGUAGE_PROFILES) {
        codes.push(profile.code);
    }
    return codes;
}
|
|
189
|
+
/**
 * Resolve the token multiplier registered for a language code.
 *
 * @param {string} code - Language code passed on to getLanguageProfile.
 * @returns {number} The profile's `tokenMultiplier`, or 1.0 when
 *   getLanguageProfile finds no profile for the code.
 */
export function getTokenMultiplier(code) {
    const match = getLanguageProfile(code);
    if (!match) {
        // Default to English
        return 1.0;
    }
    return match.tokenMultiplier;
}
|
|
196
|
+
//# sourceMappingURL=language-profiles.js.map
|