modality-kit 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +718 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/util_text_compression.d.ts +3 -5
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -4283,6 +4283,723 @@ async function loadVersion(packageJsonPath) {
|
|
|
4283
4283
|
return "0.0.0";
|
|
4284
4284
|
}
|
|
4285
4285
|
}
|
|
4286
|
+
// src/util_text_compression.ts
|
|
4287
|
+
/**
 * Baseline settings for the text-compression utility.
 *
 * `TextCompressionUtility` spreads caller-supplied config over this object,
 * so every key listed here is guaranteed to be present at runtime.
 */
var DEFAULT_CONFIG = {
  // Token budget used by compress() when the call does not pass its own.
  maxTokens: 4000,

  // One of "light" | "moderate" | "aggressive"; selects the score threshold.
  compressionLevel: "moderate",

  // Swap code spans for placeholders before analysis, restore them afterwards.
  preserveCodeBlocks: true,

  // Run the language detector before segmenting sentences.
  autoDetectLanguage: true,

  // Handed to CompressionLogger; when false its methods print nothing.
  enableLogging: false,

  // Upper bound on sentences scored by the full importance analysis.
  maxSentencesForAnalysis: 500,

  // Upper bound on sentences scored by the fast importance analysis.
  fastModeMaxSentences: 200
};
|
|
4296
|
+
/**
 * Error raised when language detection cannot run on the given input.
 *
 * Besides the usual message it carries the language code the caller should
 * fall back to.
 */
class LanguageDetectionError extends Error {
  /** Language code to use in place of a detected one. */
  fallbackLanguage;

  /**
   * @param message Human-readable description of the failure.
   * @param fallbackLanguage Language code to fall back to.
   */
  constructor(message, fallbackLanguage) {
    super(message);
    // Assigned in this order so the instance's own keys match a plain
    // `this.fallbackLanguage = …; this.name = …` sequence.
    Object.assign(this, {
      fallbackLanguage,
      name: "LanguageDetectionError"
    });
  }
}
|
|
4304
|
+
|
|
4305
|
+
/**
 * Console logger that can be switched off as a whole.
 *
 * Every message is prefixed with "[TextCompression]". When the logger is
 * disabled all three methods return without touching the console.
 */
class CompressionLogger {
  /** Whether messages are forwarded to the console. */
  enabled;

  /**
   * @param enabled Pass true to print messages; defaults to silent.
   */
  constructor(enabled = false) {
    this.enabled = enabled;
  }

  /** Forwards to `console.info`. */
  info(message, data) {
    this.emit("info", message, data);
  }

  /** Forwards to `console.warn`. */
  warn(message, data) {
    this.emit("warn", message, data);
  }

  /** Forwards to `console.error`. */
  error(message, error) {
    this.emit("error", message, error);
  }

  /**
   * Shared implementation behind info/warn/error.
   *
   * @param level Name of the console method to call.
   * @param message Text placed after the prefix.
   * @param payload Second console argument; any falsy value is sent as "".
   */
  emit(level, message, payload) {
    if (!this.enabled) {
      return;
    }
    console[level](`[TextCompression] ${message}`, payload ? payload : "");
  }
}
|
|
4326
|
+
|
|
4327
|
+
/**
 * Heuristic language detector built on Unicode code-point ranges and the
 * runtime's `Intl` APIs.
 *
 * Detection reads only the first 500 UTF-16 units of the text. The result is
 * `{ code, locale, confidence, script?, region? }`; when nothing scores above
 * the 0.3 floor the code and locale are "und".
 */
class UniversalLanguageDetector {
  /**
   * Code-point ranges counted by analyzeUnicodeRanges, as
   * [label, first, last]. The ranges do not overlap. Note that "latin" is
   * printable ASCII (0x20-0x7E), so it also counts spaces, digits and ASCII
   * punctuation.
   */
  static SCRIPT_RANGES = [
    ["cjk", 0x4e00, 0x9fff],
    ["hiragana", 0x3040, 0x309f],
    ["katakana", 0x30a0, 0x30ff],
    ["hangul", 0xac00, 0xd7af],
    ["cyrillic", 0x0400, 0x04ff],
    ["arabic", 0x0600, 0x06ff],
    ["latin", 0x20, 0x7e]
  ];

  logger;

  /** Detection results keyed by the exact text sample that was analyzed. */
  cache = new Map();

  /**
   * @param logger2 Object exposing info/warn/error(message, data).
   */
  constructor(logger2) {
    this.logger = logger2;
  }

  /**
   * Detects the language of `text`.
   *
   * Never rejects: invalid input and any internal failure resolve to
   * `{ code: "und", locale: "und", confidence: 0.1 }`.
   *
   * @param text Text to inspect.
   * @returns Promise of the detection result.
   */
  async detectLanguage(text) {
    try {
      if (!text || typeof text !== "string") {
        throw new LanguageDetectionError("Invalid input text", "und");
      }
      // Detection only ever reads the first 500 units, so keying the cache on
      // that same slice makes a cache hit exact. A shorter prefix would let
      // two texts that diverge inside the sample share one result.
      const cacheKey = text.slice(0, 500);
      if (this.cache.has(cacheKey)) {
        return this.cache.get(cacheKey);
      }
      const result = await this.performDetection(text);
      // Crude bound on memory: drop everything once the cache grows past 50.
      if (this.cache.size > 50) {
        this.cache.clear();
      }
      this.cache.set(cacheKey, result);
      this.logger.info(`Language detected: ${result.code} (confidence: ${result.confidence})`);
      return result;
    } catch (error) {
      this.logger.error("Language detection failed", error);
      return {
        code: "und",
        locale: "und",
        confidence: 0.1
      };
    }
  }

  /**
   * Scores each candidate locale and returns the best one.
   *
   * A candidate replaces the current best only when its combined score is
   * strictly higher, so among ties the earliest candidate wins.
   *
   * @param text Text to inspect.
   * @returns Promise of the best match, or the "und" placeholder (0.3).
   */
  async performDetection(text) {
    const unicodeHints = this.analyzeUnicodeRanges(text);
    const testLocales = this.prioritizeLocalesBasedOnUnicode(unicodeHints);
    let bestMatch = {
      code: "und",
      locale: "und",
      confidence: 0.3
    };
    for (const testLocale of testLocales) {
      try {
        const intlScore = await this.testLocaleWithIntlAPIs(text, testLocale);
        const unicodeBoost = this.getUnicodeBoost(testLocale, unicodeHints);
        const combinedScore = Math.min(1, intlScore + unicodeBoost);
        if (combinedScore > bestMatch.confidence) {
          const locale = new Intl.Locale(testLocale);
          bestMatch = {
            code: locale.language,
            locale: testLocale,
            confidence: combinedScore,
            script: locale.script,
            region: locale.region
          };
        }
      } catch (error) {
        // A locale the runtime rejects is skipped, not fatal.
        this.logger.warn(`Skipping locale ${testLocale}:`, error);
      }
    }
    return bestMatch;
  }

  /**
   * Measures which scripts occur in the first 500 UTF-16 units of `text`.
   *
   * @param text Text to inspect.
   * @returns Map from range label to its share (0..1) of all characters that
   *   fell into any known range. Labels that never occurred are absent.
   */
  analyzeUnicodeRanges(text) {
    const counts = new Map();
    let totalRelevantChars = 0;
    for (const char of text.slice(0, 500)) {
      const codePoint = char.codePointAt(0);
      const hit = UniversalLanguageDetector.SCRIPT_RANGES.find(
        ([, first, last]) => codePoint >= first && codePoint <= last
      );
      if (hit) {
        counts.set(hit[0], (counts.get(hit[0]) || 0) + 1);
        totalRelevantChars++;
      }
    }
    const percentages = new Map();
    for (const [range, count] of counts) {
      percentages.set(range, totalRelevantChars > 0 ? count / totalRelevantChars : 0);
    }
    return percentages;
  }

  /**
   * Builds the candidate list and orders it by Unicode relevance, best first.
   *
   * @param unicodeHints Result of analyzeUnicodeRanges.
   * @returns Candidate locale tags.
   */
  prioritizeLocalesBasedOnUnicode(unicodeHints) {
    const candidates = this.getAvailableTestLocales(unicodeHints);
    return candidates.sort(
      (a, b) =>
        this.getUnicodeRelevanceScore(b, unicodeHints) -
        this.getUnicodeRelevanceScore(a, unicodeHints)
    );
  }

  /**
   * Collects locale tags worth testing: the system locale and its bare
   * language, locales implied by the scripts present, and English as a
   * universal last resort.
   *
   * @param unicodeHints Result of analyzeUnicodeRanges.
   * @returns De-duplicated tags in insertion order.
   */
  getAvailableTestLocales(unicodeHints) {
    const locales = new Set();
    if (typeof Intl !== "undefined" && Intl.DateTimeFormat) {
      try {
        const systemLocale = Intl.DateTimeFormat().resolvedOptions().locale;
        locales.add(systemLocale);
        const language = new Intl.Locale(systemLocale).language;
        if (language) {
          locales.add(language);
        }
      } catch (error) {
        // Best effort: the script-based candidates below still apply.
        this.logger.warn("Could not read the system locale:", error);
      }
    }
    const localesByRange = [
      [["cjk"], ["zh-Hant", "zh-Hans"]],
      [["hiragana", "katakana"], ["ja-JP", "ja"]],
      [["hangul"], ["ko-KR", "ko"]],
      [["cyrillic"], ["ru-RU", "ru"]],
      [["arabic"], ["ar-SA", "ar"]],
      [["latin"], ["en-US", "en", "fr-FR", "de-DE", "es-ES"]]
    ];
    for (const [ranges, tags] of localesByRange) {
      if (ranges.some((range) => unicodeHints.has(range))) {
        tags.forEach((tag) => locales.add(tag));
      }
    }
    locales.add("en-US");
    locales.add("en");
    return Array.from(locales);
  }

  /**
   * Rates how well `locale` fits the scripts found in the text.
   *
   * The match is made on the base language subtag, so "ja" and "ja-JP" (or
   * "en", "en-US" and "en-GB") rate identically. Matching only full tags
   * would rate the bare-language candidates produced by
   * getAvailableTestLocales at 0.
   *
   * @param locale Locale tag to rate.
   * @param unicodeHints Result of analyzeUnicodeRanges.
   * @returns Relevance score; 0 for languages with no rule.
   */
  getUnicodeRelevanceScore(locale, unicodeHints) {
    const share = (range) => unicodeHints.get(range) || 0;
    const language = String(locale).split("-")[0].toLowerCase();
    switch (language) {
      case "ja":
        return share("hiragana") * 10 + share("katakana") * 10 + share("cjk") * 2;
      case "ko":
        return share("hangul") * 10 + share("cjk") * 1;
      case "zh":
        // Kana or hangul in the text argues against Chinese.
        return share("cjk") * 5 - (share("hiragana") + share("katakana") + share("hangul")) * 2;
      case "ru":
        return share("cyrillic") * 8;
      case "ar":
        return share("arabic") * 8;
      case "en":
      case "fr":
      case "de":
      case "es":
        return share("latin") * 3;
      default:
        return 0;
    }
  }

  /**
   * Converts the relevance score into a confidence bonus capped at 0.4.
   *
   * @param locale Locale tag to rate.
   * @param unicodeHints Result of analyzeUnicodeRanges.
   * @returns Bonus added to the Intl-based score.
   */
  getUnicodeBoost(locale, unicodeHints) {
    return Math.min(0.4, this.getUnicodeRelevanceScore(locale, unicodeHints) * 0.1);
  }

  /**
   * Probes the runtime's Intl APIs with `locale`.
   *
   * Only the Segmenter probe looks at the text; the Collator, DisplayNames
   * and RelativeTimeFormat probes add a fixed amount whenever the runtime
   * accepts the locale.
   *
   * @param text Text to inspect (first 500 UTF-16 units are used).
   * @param locale Locale tag to probe.
   * @returns Promise of a score in 0..1; 0 if a Segmenter/Collator probe throws.
   */
  async testLocaleWithIntlAPIs(text, locale) {
    if (typeof Intl === "undefined") {
      return 0;
    }
    let score = 0;
    const sample = text.slice(0, 500);
    try {
      if (Intl.Segmenter) {
        const segmenter = new Intl.Segmenter(locale, { granularity: "word" });
        const segmentCount = Array.from(segmenter.segment(sample)).length;
        // Roughly one segment per ten units already earns the full 0.4.
        const segmentQuality = segmentCount > 0 ? Math.min(1, segmentCount / (sample.length / 10)) : 0;
        score += segmentQuality * 0.4;
      }
      if (Intl.Collator) {
        const collator = new Intl.Collator(locale, { sensitivity: "base" });
        const testChars = Array.from(sample).slice(0, 10);
        if (testChars.length > 1) {
          // Credit is given once the collator sorts without throwing.
          testChars.sort(collator.compare);
          score += 0.3;
        }
      }
      if (Intl.DisplayNames) {
        try {
          const displayNames = new Intl.DisplayNames([locale], { type: "language" });
          const langCode = new Intl.Locale(locale).language;
          score += displayNames.of(langCode) ? 0.2 : 0;
        } catch (displayError) {
          this.logger.warn(`DisplayNames probe failed for ${locale}:`, displayError);
        }
      }
      if (Intl.RelativeTimeFormat) {
        try {
          const rtf = new Intl.RelativeTimeFormat(locale);
          score += rtf ? 0.1 : 0;
        } catch (rtfError) {
          this.logger.warn(`RelativeTimeFormat probe failed for ${locale}:`, rtfError);
        }
      }
    } catch (error) {
      this.logger.warn(`Testing locale ${locale} failed:`, error);
      return 0;
    }
    return Math.min(1, score);
  }
}
|
|
4562
|
+
|
|
4563
|
+
/**
 * Scores sentences by how much they are worth keeping.
 *
 * Each sentence becomes `{ text, score, reasons }`. A score starts at 1 and is
 * multiplied by position, length-deviation and word-rarity factors; `reasons`
 * lists the labels of the factors that fired.
 */
class IntelligentImportanceAnalyzer {
  /** Word-frequency maps keyed by the exact text they were computed from. */
  wordFrequencyCache = new Map();
  logger;
  config;

  /**
   * @param logger2 Object exposing info/warn/error(message, data).
   * @param config Settings; `maxSentencesForAnalysis` and
   *   `fastModeMaxSentences` are read here.
   */
  constructor(logger2, config) {
    this.logger = logger2;
    this.config = config;
  }

  /**
   * Full analysis: position, length deviation and word rarity.
   *
   * Texts longer than 50000 characters are routed to fastAnalyzeImportance.
   *
   * @param text Text to analyze.
   * @param detectedLanguage Locale tag used for sentence segmentation.
   * @returns Promise of scored sentences in document order.
   */
  async analyzeImportance(text, detectedLanguage) {
    if (text.length > 50000) {
      return this.fastAnalyzeImportance(text, detectedLanguage);
    }
    const sentences = this.segmentSentences(text, detectedLanguage).slice(
      0,
      this.config.maxSentencesForAnalysis
    );
    if (sentences.length === 0) {
      // Nothing to score; also avoids a 0/0 average below.
      return [];
    }
    const wordFrequencies = await this.calculateWordFrequencies(text);
    const totalLength = sentences.reduce((sum, s) => sum + s.length, 0);
    const avgSentenceLength = totalLength / sentences.length;
    return sentences.map((sentence, index) => {
      const reasons = [];
      let score = 1;
      score *= this.analyzePosition(index, sentences.length, reasons);
      score *= this.analyzeLengthDeviation(sentence, avgSentenceLength, reasons);
      score *= this.analyzeWordRarity(sentence, wordFrequencies, reasons);
      return {
        text: sentence.trim(),
        // Keep two decimals so scores compare and print predictably.
        score: Math.round(score * 100) / 100,
        reasons
      };
    });
  }

  /**
   * Cheap analysis for large inputs: looks at position and raw length only,
   * and at no more than `config.fastModeMaxSentences` sentences.
   *
   * @param text Text to analyze.
   * @param detectedLanguage Locale tag used for sentence segmentation.
   * @returns Promise of scored sentences in document order.
   */
  async fastAnalyzeImportance(text, detectedLanguage) {
    const sentences = this.segmentSentences(text, detectedLanguage).slice(
      0,
      this.config.fastModeMaxSentences
    );
    const lastThreeStart = sentences.length - 3;
    return sentences.map((sentence, index) => {
      const reasons = [];
      let score = 1;
      if (index < 3) {
        score = 2;
        reasons.push("first-sentences");
      } else if (index >= lastThreeStart) {
        score = 1.8;
        reasons.push("last-sentences");
      } else if (sentence.length > 200) {
        score = 1.3;
        reasons.push("long-sentence");
      }
      return { text: sentence.trim(), score, reasons };
    });
  }

  /**
   * Splits text into sentences, preferring `Intl.Segmenter` and falling back
   * to a split on runs of `.`, `!` and `?`.
   *
   * Fragments of 10 characters or fewer (after trimming) are discarded.
   *
   * @param text Text to split.
   * @param locale Locale tag for the segmenter; "en" when omitted.
   * @returns Trimmed sentences.
   */
  segmentSentences(text, locale) {
    const longEnough = (s) => s.length > 10;
    try {
      if (typeof Intl !== "undefined" && Intl.Segmenter) {
        const segmenter = new Intl.Segmenter(locale || "en", { granularity: "sentence" });
        return Array.from(segmenter.segment(text), (part) => part.segment.trim()).filter(longEnough);
      }
    } catch (error) {
      this.logger.warn("Sentence segmenter failed, using punctuation split:", error);
    }
    return text.split(/[.!?]+/).map((s) => s.trim()).filter(longEnough);
  }

  /**
   * Position factor: 2 for the first, last or only sentence, 1.5 for the
   * other sentences within three of either end, otherwise 1.
   *
   * @param index Zero-based sentence index.
   * @param totalSentences Number of sentences being scored.
   * @param reasons Receives the label of the rule that fired, if any.
   * @returns Multiplier for the sentence score.
   */
  analyzePosition(index, totalSentences, reasons) {
    if (totalSentences === 1) {
      return 2;
    }
    if (index === 0) {
      reasons.push("first-sentence");
      return 2;
    }
    if (index === totalSentences - 1) {
      reasons.push("last-sentence");
      return 2;
    }
    if (index < 3 || index >= totalSentences - 3) {
      reasons.push("near-boundary");
      return 1.5;
    }
    return 1;
  }

  /**
   * Length factor: sentences whose length differs from the average by more
   * than 80% get 1.3 (longer) or 1.2 (shorter, but over 20 characters).
   *
   * @param sentence Sentence text.
   * @param avgLength Average sentence length of the document.
   * @param reasons Receives the label of the rule that fired, if any.
   * @returns Multiplier for the sentence score.
   */
  analyzeLengthDeviation(sentence, avgLength, reasons) {
    if (avgLength === 0) {
      return 1;
    }
    const length = sentence.length;
    if (Math.abs(length - avgLength) / avgLength <= 0.8) {
      return 1;
    }
    if (length > avgLength) {
      reasons.push("unusually-long");
      return 1.3;
    }
    if (length > 20) {
      reasons.push("unusually-short");
      return 1.2;
    }
    return 1;
  }

  /**
   * Rarity factor: 1.3 when more than a quarter of the sentence's words are
   * rare in the document (relative frequency below 0.5%), 0.7 when more than
   * 80% are very common (above 5%), otherwise 1.
   *
   * @param sentence Sentence text.
   * @param wordFreqs Map from lower-cased word to its count in the document.
   * @param reasons Receives the label of the rule that fired, if any.
   * @returns Multiplier for the sentence score.
   */
  analyzeWordRarity(sentence, wordFreqs, reasons) {
    const words = this.tokenizeWords(sentence);
    if (words.length === 0) {
      return 1;
    }
    let totalWords = 0;
    for (const count of wordFreqs.values()) {
      totalWords += count;
    }
    if (totalWords === 0) {
      // No frequency data (e.g. the fallback empty map): stay neutral
      // instead of dividing by zero.
      return 1;
    }
    let rareWordCount = 0;
    let veryCommonCount = 0;
    for (const word of words) {
      const relativeFreq = (wordFreqs.get(word) || 0) / totalWords;
      if (relativeFreq < 0.005) {
        rareWordCount++;
      } else if (relativeFreq > 0.05) {
        veryCommonCount++;
      }
    }
    if (rareWordCount / words.length > 0.25) {
      reasons.push("has-rare-words");
      return 1.3;
    }
    if (veryCommonCount / words.length > 0.8) {
      reasons.push("mostly-common-words");
      return 0.7;
    }
    return 1;
  }

  /**
   * Counts word occurrences in the first 10000 characters of `text`.
   *
   * Results are cached under the exact text that was counted. Never rejects:
   * on failure the error is logged and an empty map is returned.
   *
   * @param text Text to count.
   * @returns Promise of a map from lower-cased word to count.
   */
  async calculateWordFrequencies(text) {
    try {
      const processText = text.length > 1e4 ? text.substring(0, 1e4) : text;
      // Key on what is actually counted. A shorter prefix would hand one
      // document's counts to another that merely starts the same way.
      const cacheKey = processText;
      if (this.wordFrequencyCache.has(cacheKey)) {
        return this.wordFrequencyCache.get(cacheKey);
      }
      const words = this.tokenizeWords(processText);
      const freq = new Map();
      const chunkSize = 1000;
      for (let start = 0; start < words.length; start += chunkSize) {
        for (const word of words.slice(start, start + chunkSize)) {
          freq.set(word, (freq.get(word) || 0) + 1);
        }
        // Yield to the event loop every 5000 words so long inputs do not
        // monopolize the thread.
        if (start > 0 && start % 5000 === 0) {
          await new Promise((resume) => setTimeout(resume, 0));
        }
      }
      if (this.wordFrequencyCache.size > 10) {
        this.wordFrequencyCache.clear();
      }
      this.wordFrequencyCache.set(cacheKey, freq);
      return freq;
    } catch (error) {
      this.logger.error("Word frequency calculation failed:", error);
      return new Map();
    }
  }

  /**
   * Lower-cases `text` and splits it into words longer than two characters.
   *
   * Letters, combining marks and digits of any script are word characters.
   * An ASCII-only `\w` class would blank out every non-ASCII letter and
   * leave Cyrillic, accented Latin and similar text with no words at all.
   *
   * @param text Text to tokenize.
   * @returns Words in order of appearance.
   */
  tokenizeWords(text) {
    return text
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, " ")
      .split(/\s+/)
      .filter((word) => word.length > 2);
  }
}
|
|
4730
|
+
|
|
4731
|
+
/**
 * Extractive text compressor: scores sentences, keeps the most important ones
 * that fit a token budget, and protects code spans from being altered.
 */
class TextCompressionUtility {
  languageDetector;
  importanceAnalyzer;
  logger;
  config;

  /**
   * @param config Partial settings; missing keys come from DEFAULT_CONFIG.
   */
  constructor(config = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.logger = new CompressionLogger(this.config.enableLogging);
    this.languageDetector = new UniversalLanguageDetector(this.logger);
    this.importanceAnalyzer = new IntelligentImportanceAnalyzer(this.logger, this.config);
  }

  /**
   * Compresses `text` to fit `maxTokens`.
   *
   * Text that already fits, or is shorter than 10 characters, is returned
   * trimmed and otherwise untouched.
   *
   * @param text Text to compress. Non-string or empty input yields an empty result.
   * @param options Per-call overrides. Read here: maxTokens,
   *   preserveCodeBlocks, compressionLevel, autoDetectLanguage,
   *   prioritizeFirst, prioritizeLast, bufferPercentage, maxSentences, fastMode.
   * @returns Promise of `{ compressedText, originalLength, compressedLength,
   *   compressionRatio, tokensEstimate, detectedLanguage, importanceScores? }`.
   * @throws Error when the effective maxTokens is not greater than 0.
   */
  async compress(text, options = {}) {
    if (!text || typeof text !== "string") {
      return {
        compressedText: "",
        originalLength: 0,
        compressedLength: 0,
        compressionRatio: 1,
        tokensEstimate: 0,
        detectedLanguage: "und"
      };
    }
    const maxTokens = options.maxTokens ?? this.config.maxTokens;
    const preserveCodeBlocks = options.preserveCodeBlocks ?? this.config.preserveCodeBlocks;
    const compressionLevel = options.compressionLevel ?? this.config.compressionLevel;
    const autoDetectLanguage = options.autoDetectLanguage ?? this.config.autoDetectLanguage;
    const {
      prioritizeFirst = true,
      prioritizeLast = true,
      bufferPercentage = 10,
      maxSentences = 500,
      fastMode = false
    } = options;
    if (maxTokens <= 0) {
      throw new Error("maxTokens must be greater than 0");
    }

    const trimmedText = text.trim();
    const originalLength = trimmedText.length;
    if (originalLength < 10) {
      return {
        compressedText: trimmedText,
        originalLength,
        compressedLength: originalLength,
        compressionRatio: 1,
        tokensEstimate: Math.max(1, Math.ceil(originalLength / 4)),
        detectedLanguage: "und"
      };
    }

    let detectedLanguage;
    if (autoDetectLanguage) {
      try {
        const detection = await this.languageDetector.detectLanguage(trimmedText);
        detectedLanguage = detection.code;
      } catch (error) {
        this.logger.warn("Language detection failed, using fallback:", error);
        detectedLanguage = "und";
      }
    }

    const initialTokens = this.estimateTokens(trimmedText);
    if (initialTokens <= maxTokens) {
      return {
        compressedText: trimmedText,
        originalLength,
        compressedLength: trimmedText.length,
        compressionRatio: 1,
        tokensEstimate: initialTokens,
        detectedLanguage
      };
    }

    const { text: textWithoutCode, codeMap } = preserveCodeBlocks
      ? this.extractCodeElements(trimmedText)
      : { text: trimmedText, codeMap: new Map() };
    if (textWithoutCode.trim().length < 10) {
      // Practically all code: there are no prose sentences to rank.
      return this.buildResult(
        this.trimToTokenLimit(trimmedText, maxTokens),
        originalLength,
        detectedLanguage
      );
    }

    let importanceScores;
    try {
      importanceScores =
        fastMode || textWithoutCode.length > 50000
          ? await this.importanceAnalyzer.fastAnalyzeImportance(textWithoutCode, detectedLanguage)
          : await this.importanceAnalyzer.analyzeImportance(textWithoutCode, detectedLanguage);
      if (prioritizeFirst || prioritizeLast) {
        importanceScores = this.applyUserPriorities(importanceScores, prioritizeFirst, prioritizeLast);
      }
      if (maxSentences && importanceScores.length > maxSentences) {
        importanceScores = importanceScores.sort((a, b) => b.score - a.score).slice(0, maxSentences);
      }
    } catch (error) {
      // Routed through the logger so the enableLogging switch is honored.
      this.logger.warn("Importance analysis failed, using fallback:", error);
      importanceScores = this.segmentSentences(textWithoutCode).map((sentence, index) => ({
        text: sentence.trim(),
        score: index < 3 ? 2 : 1,
        reasons: index < 3 ? ["first-sentences-fallback"] : ["fallback"]
      }));
    }
    if (importanceScores.length === 0) {
      return this.buildResult(
        this.trimToTokenLimit(trimmedText, maxTokens),
        originalLength,
        detectedLanguage
      );
    }

    let compressed;
    try {
      compressed = this.applyCompression(importanceScores, maxTokens, compressionLevel, codeMap, bufferPercentage);
    } catch (error) {
      this.logger.warn("Compression failed, using fallback:", error);
      compressed = this.trimToTokenLimit(trimmedText, maxTokens);
    }
    if (compressed.trim().length === 0) {
      compressed = this.trimToTokenLimit(trimmedText, Math.min(maxTokens, 100));
    }
    return this.buildResult(compressed, originalLength, detectedLanguage, importanceScores);
  }

  /**
   * Assembles the result object for a compressed text.
   *
   * An empty result is measured against a length of 1 so `compressionRatio`
   * stays finite instead of becoming Infinity.
   *
   * @param compressedText Final text.
   * @param originalLength Length of the trimmed input.
   * @param detectedLanguage Language code, or undefined when detection was off.
   * @param importanceScores Optional scored sentences to attach.
   * @returns The result object returned by compress().
   */
  buildResult(compressedText, originalLength, detectedLanguage, importanceScores) {
    const result = {
      compressedText,
      originalLength,
      compressedLength: compressedText.length,
      compressionRatio: originalLength / Math.max(1, compressedText.length),
      tokensEstimate: this.estimateTokens(compressedText),
      detectedLanguage
    };
    if (importanceScores !== undefined) {
      result.importanceScores = importanceScores;
    }
    return result;
  }

  /**
   * Replaces code spans with placeholders so analysis never splits them.
   *
   * Three passes run in order and share one counter: fenced blocks
   * (```…```), single-line inline code (`…`), then bare calls such as
   * `name()`.
   *
   * @param text Text to scan.
   * @returns `{ text, codeMap }` where codeMap maps placeholder to original code.
   */
  extractCodeElements(text) {
    const codeMap = new Map();
    let counter = 0;
    const passes = [
      [/```[\s\S]*?```/g, "CODE_BLOCK"],
      [/`[^`\n]+`/g, "INLINE_CODE"],
      [/\b[a-zA-Z_][a-zA-Z0-9_]*\(\)/g, "FUNC_CALL"]
    ];
    let result = text;
    for (const [pattern, label] of passes) {
      result = result.replace(pattern, (match) => {
        const placeholder = `__${label}_${counter++}__`;
        codeMap.set(placeholder, match);
        return placeholder;
      });
    }
    return { text: result, codeMap };
  }

  /**
   * Boosts the first three (x1.5) and/or last three (x1.4) sentences.
   * A sentence in both windows receives both boosts.
   *
   * @param importanceScores Scored sentences in document order.
   * @param prioritizeFirst Whether to boost the opening sentences.
   * @param prioritizeLast Whether to boost the closing sentences.
   * @returns New array; the input items are not modified.
   */
  applyUserPriorities(importanceScores, prioritizeFirst, prioritizeLast) {
    const tailStart = importanceScores.length - 3;
    return importanceScores.map((item, index) => {
      const reasons = [...item.reasons];
      let score = item.score;
      if (prioritizeFirst && index < 3) {
        score *= 1.5;
        reasons.push("user-prioritize-first");
      }
      if (prioritizeLast && index >= tailStart) {
        score *= 1.4;
        reasons.push("user-prioritize-last");
      }
      return { ...item, score, reasons };
    });
  }

  /**
   * Selects sentences by score until the token budget is used, then restores
   * the code placeholders.
   *
   * NOTE(review): sentences are emitted in descending score order, not in
   * document order — confirm this is the intended output shape.
   *
   * @param importanceScores Scored sentences.
   * @param maxTokens Hard token budget for the returned text.
   * @param level "light" | "moderate" | "aggressive".
   * @param codeMap Map from placeholder to original code.
   * @param bufferPercentage Share of the budget held back while selecting.
   * @returns Compressed text; may be empty if nothing fits.
   */
  applyCompression(importanceScores, maxTokens, level, codeMap, bufferPercentage = 10) {
    const ranked = [...importanceScores].sort((a, b) => b.score - a.score);
    const threshold = this.getCompressionThreshold(ranked, level);
    const bufferMultiplier = (100 - bufferPercentage) / 100;
    const effectiveMaxTokens = Math.floor(maxTokens * bufferMultiplier);

    const kept = [];
    let currentTokens = 0;
    for (const item of ranked.filter((entry) => entry.score >= threshold)) {
      const sentenceTokens = this.estimateTokens(item.text);
      if (currentTokens + sentenceTokens <= effectiveMaxTokens) {
        kept.push(this.terminateSentence(item.text));
        currentTokens += sentenceTokens;
      }
    }

    let result = kept.join(" ").trim();
    codeMap.forEach((code, placeholder) => {
      // split/join inserts the code literally. String.replace with a string
      // replacement would interpret "$&", "$1" or "$$" inside the code and
      // corrupt it.
      result = result.split(placeholder).join(code);
    });
    // Restored code can be far longer than its placeholder.
    if (this.estimateTokens(result) > maxTokens) {
      result = this.trimToTokenLimit(result, maxTokens);
    }
    return result;
  }

  /**
   * Picks the minimum score a sentence needs in order to be kept.
   *
   * The value never exceeds the highest score, so the top-ranked sentence
   * always qualifies. Without the cap a single sentence, or a document with
   * uniform scores, would have nothing selected.
   *
   * @param sortedScores Scored sentences sorted by descending score.
   * @param level "light" | "moderate" | "aggressive"; anything else behaves
   *   like "moderate".
   * @returns Score threshold; 1 for an empty list.
   */
  getCompressionThreshold(sortedScores, level) {
    if (sortedScores.length === 0) {
      return 1;
    }
    const scores = sortedScores.map((entry) => entry.score);
    const median = scores[Math.floor(scores.length / 2)];
    const max = scores.reduce((best, score) => Math.max(best, score), -Infinity);
    let threshold;
    switch (level) {
      case "light":
        threshold = Math.max(median * 0.8, 1);
        break;
      case "aggressive":
        threshold = Math.max(max * 0.7, median * 1.3);
        break;
      case "moderate":
      default:
        threshold = Math.max(median * 1.1, 1.2);
        break;
    }
    return Math.min(threshold, max);
  }

  /**
   * Splits text into sentences, preferring `Intl.Segmenter` and falling back
   * to a split on runs of `.`, `!` and `?`.
   *
   * Fragments of 10 characters or fewer (after trimming) are discarded.
   *
   * @param text Text to split.
   * @param locale Locale tag for the segmenter; "en" when omitted.
   * @returns Trimmed sentences.
   */
  segmentSentences(text, locale) {
    const longEnough = (s) => s.length > 10;
    try {
      if (typeof Intl !== "undefined" && Intl.Segmenter) {
        const segmenter = new Intl.Segmenter(locale || "en", { granularity: "sentence" });
        return Array.from(segmenter.segment(text), (part) => part.segment.trim()).filter(longEnough);
      }
    } catch (error) {
      this.logger.warn("Sentence segmenter failed, using punctuation split:", error);
    }
    return text.split(/[.!?]+/).map((s) => s.trim()).filter(longEnough);
  }

  /**
   * Keeps leading sentences of `text` while they fit `maxTokens`.
   *
   * If not even the first sentence fits (or no sentence survives
   * segmentation), the text is cut at a character boundary, so a non-empty
   * input with a budget of at least one token's worth of characters does not
   * collapse to an empty string.
   *
   * @param text Text to shorten.
   * @param maxTokens Token budget.
   * @returns Shortened text.
   */
  trimToTokenLimit(text, maxTokens) {
    const kept = [];
    let tokens = 0;
    for (const sentence of this.segmentSentences(text)) {
      const piece = this.terminateSentence(sentence);
      const sentenceTokens = this.estimateTokens(piece);
      if (tokens + sentenceTokens <= maxTokens) {
        kept.push(piece);
        tokens += sentenceTokens;
      } else {
        break;
      }
    }
    return kept.length > 0 ? kept.join(" ") : this.truncateToTokenBudget(text, maxTokens);
  }

  /**
   * Cuts `text` to the longest prefix whose token estimate fits `maxTokens`.
   *
   * @param text Text to cut.
   * @param maxTokens Token budget.
   * @returns Trimmed prefix; empty when nothing fits.
   */
  truncateToTokenBudget(text, maxTokens) {
    const source = text.trim();
    if (source.length === 0 || !(maxTokens > 0)) {
      return "";
    }
    // First guess from the text's own tokens-per-character rate, then shrink
    // until the estimate agrees.
    const tokensPerChar = this.estimateTokens(source) / source.length;
    let cut = source.slice(0, Math.max(1, Math.floor(maxTokens / tokensPerChar)));
    while (cut.length > 0 && this.estimateTokens(cut) > maxTokens) {
      cut = cut.slice(0, Math.floor(cut.length * 0.9));
    }
    // Do not leave half of a surrogate pair at the end.
    const last = cut.charCodeAt(cut.length - 1);
    if (last >= 0xd800 && last <= 0xdbff) {
      cut = cut.slice(0, -1);
    }
    return cut.trim();
  }

  /**
   * Appends "." unless the sentence already ends in terminal punctuation
   * (optionally followed by closing quotes or brackets). Segmenter output
   * keeps its punctuation, so appending blindly would produce "..".
   *
   * @param sentence Trimmed sentence.
   * @returns Sentence ending in terminal punctuation.
   */
  terminateSentence(sentence) {
    return /[.!?\u3002\uFF01\uFF1F\u2026]["'\u2019\u201D)\]]*$/.test(sentence)
      ? sentence
      : sentence + ".";
  }

  /**
   * Estimates the token count of `text`.
   *
   * The first 200 units are sampled; if more than 30% of the sampled
   * characters are CJK ideographs, kana or hangul, the estimate is 1.5
   * tokens per UTF-16 unit, otherwise one token per four units. The
   * heuristic does not use `Intl`, so it is applied on every runtime.
   *
   * @param text Text to measure.
   * @returns Estimated number of tokens.
   */
  estimateTokens(text) {
    let cjkCount = 0;
    let totalChars = 0;
    for (const char of text.slice(0, 200)) {
      const codePoint = char.codePointAt(0);
      totalChars++;
      if (
        (codePoint >= 0x4e00 && codePoint <= 0x9fff) ||
        (codePoint >= 0x3040 && codePoint <= 0x309f) ||
        (codePoint >= 0x30a0 && codePoint <= 0x30ff) ||
        (codePoint >= 0xac00 && codePoint <= 0xd7af)
      ) {
        cjkCount++;
      }
    }
    const cjkRatio = totalChars > 0 ? cjkCount / totalChars : 0;
    return cjkRatio > 0.3 ? Math.ceil(text.length * 1.5) : Math.ceil(text.length / 4);
  }
}
|
|
4994
|
+
/**
 * One-call convenience wrapper (exported as `compressText`): compresses
 * `text` with a fresh TextCompressionUtility using language detection, code
 * preservation and the "moderate" level.
 *
 * @param text Text to compress.
 * @param maxTokens Token budget for the result; defaults to 4000.
 * @returns Promise of the result object produced by `compress`.
 */
async function compressWithLanguageDetection(text, maxTokens = 4000) {
  const settings = {
    maxTokens,
    autoDetectLanguage: true,
    preserveCodeBlocks: true,
    compressionLevel: "moderate"
  };
  return await new TextCompressionUtility().compress(text, settings);
}
|
|
4286
5003
|
export {
|
|
4287
5004
|
withErrorHandling,
|
|
4288
5005
|
setupAITools,
|
|
@@ -4291,6 +5008,7 @@ export {
|
|
|
4291
5008
|
formatSuccessResponse,
|
|
4292
5009
|
formatErrorResponse,
|
|
4293
5010
|
emptySchema,
|
|
5011
|
+
compressWithLanguageDetection as compressText,
|
|
4294
5012
|
exports_schemas_symbol as SymbolType,
|
|
4295
5013
|
ErrorCode
|
|
4296
5014
|
};
|
package/dist/types/index.d.ts
CHANGED
|
@@ -7,3 +7,4 @@ export * as SymbolType from "./schemas/schemas_symbol";
|
|
|
7
7
|
export type { EmptyType } from "./schemas/schemas_empty";
|
|
8
8
|
export { emptySchema } from "./schemas/schemas_empty";
|
|
9
9
|
export { loadVersion } from "./util_version";
|
|
10
|
+
export { compressWithLanguageDetection as compressText } from "./util_text_compression";
|
package/dist/types/util_text_compression.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export interface CompressionConfig {
|
|
2
2
|
maxTokens: number;
|
|
3
|
-
compressionLevel:
|
|
3
|
+
compressionLevel: "light" | "moderate" | "aggressive";
|
|
4
4
|
preserveCodeBlocks: boolean;
|
|
5
5
|
autoDetectLanguage: boolean;
|
|
6
6
|
enableLogging: boolean;
|
|
@@ -10,7 +10,7 @@ export interface CompressionConfig {
|
|
|
10
10
|
export declare const DEFAULT_CONFIG: CompressionConfig;
|
|
11
11
|
export interface CompressionOptions {
|
|
12
12
|
maxTokens?: number;
|
|
13
|
-
compressionLevel?:
|
|
13
|
+
compressionLevel?: "light" | "moderate" | "aggressive";
|
|
14
14
|
preserveCodeBlocks?: boolean;
|
|
15
15
|
autoDetectLanguage?: boolean;
|
|
16
16
|
locale?: string;
|
|
@@ -23,7 +23,7 @@ export interface CompressionOptions {
|
|
|
23
23
|
enableLogging?: boolean;
|
|
24
24
|
sentenceSplitPattern?: RegExp;
|
|
25
25
|
importanceWeights?: ImportanceWeights;
|
|
26
|
-
tokenizationMethod?:
|
|
26
|
+
tokenizationMethod?: "simple" | "advanced";
|
|
27
27
|
}
|
|
28
28
|
export interface ImportanceWeights {
|
|
29
29
|
position: number;
|
|
@@ -82,8 +82,6 @@ export declare class UniversalLanguageDetector {
|
|
|
82
82
|
private getUnicodeRelevanceScore;
|
|
83
83
|
private getUnicodeBoost;
|
|
84
84
|
private testLocaleWithIntlAPIs;
|
|
85
|
-
private enhanceWithTextAnalysis;
|
|
86
|
-
private detectFromTextHeuristics;
|
|
87
85
|
}
|
|
88
86
|
export declare class IntelligentImportanceAnalyzer {
|
|
89
87
|
private wordFrequencyCache;
|
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "0.6.1",
|
|
2
|
+
"version": "0.6.2",
|
|
3
3
|
"name": "modality-kit",
|
|
4
4
|
"repository": {
|
|
5
5
|
"type": "git",
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
"build:src": "bun build src/index.ts --outdir dist",
|
|
33
33
|
"build": "bun run build:clean && bun run build:src && bun run build:types",
|
|
34
34
|
"test": "bun test",
|
|
35
|
-
"prepublishOnly": "npm run test"
|
|
35
|
+
"prepublishOnly": "npm run build && npm run test"
|
|
36
36
|
},
|
|
37
37
|
"types": "./dist/types/index.d.ts",
|
|
38
38
|
"files": [
|