modality-kit 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4283,6 +4283,712 @@ async function loadVersion(packageJsonPath) {
4283
4283
  return "0.0.0";
4284
4284
  }
4285
4285
  }
4286
+ // src/util_text_compression.ts
4287
/**
 * Baseline settings merged underneath any caller-supplied configuration
 * (see the `{ ...DEFAULT_CONFIG, ...config }` spread in TextCompressionUtility).
 */
var DEFAULT_CONFIG = {
  /** Token budget used when a call does not pass its own `maxTokens`. */
  maxTokens: 4000,
  /** One of "light", "moderate" or "aggressive". */
  compressionLevel: "moderate",
  /** Swap fenced/inline code for placeholders before scoring sentences. */
  preserveCodeBlocks: true,
  /** Run language detection before sentence segmentation. */
  autoDetectLanguage: true,
  /** When false the logger is created at "error" level instead of "info". */
  enableLogging: false,
  /** Sentence cap for the full importance analysis. */
  maxSentencesForAnalysis: 500,
  /** Sentence cap for the fast importance analysis. */
  fastModeMaxSentences: 200
};
4296
+
4297
/**
 * Error raised by the text-compression utilities.
 *
 * Carries a machine-readable `code` plus optional free-form `details`;
 * `message` and `originalError` are forwarded to the `ErrorCode` base class.
 */
class CompressionError extends ErrorCode {
  code;
  details;
  /**
   * @param message Human-readable description.
   * @param code Identifier for the failure (e.g. "INVALID_MAX_TOKENS").
   * @param details Optional extra context stored on the instance.
   * @param originalError Optional underlying error handed to the base class.
   */
  constructor(message, code, details, originalError) {
    super(message, originalError);
    Object.assign(this, { code, details });
  }
}
4306
+
4307
/**
 * Error describing a failed language detection.
 *
 * `code` is fixed to "LANGUAGE_DETECTION_ERROR"; `fallbackLanguage` records
 * the language tag the thrower suggests using instead.
 */
class LanguageDetectionError extends ErrorCode {
  code = "LANGUAGE_DETECTION_ERROR";
  fallbackLanguage;
  /**
   * @param message Human-readable description.
   * @param fallbackLanguage Language tag to fall back to (e.g. "und").
   * @param originalError Optional underlying error handed to the base class.
   */
  constructor(message, fallbackLanguage, originalError) {
    super(message, originalError);
    this.fallbackLanguage = fallbackLanguage;
  }
}
4315
+
4316
/**
 * Heuristic language detector combining Unicode block statistics with
 * probes of the runtime's `Intl` APIs.
 *
 * Only the first `SAMPLE_LENGTH` UTF-16 code units of the input are ever
 * inspected, so results are cached under exactly that prefix.
 */
class UniversalLanguageDetector {
  /** Number of leading code units every analysis step looks at. */
  static SAMPLE_LENGTH = 500;
  /** Upper bound on cached detection results. */
  static MAX_CACHE_ENTRIES = 50;
  /** Code point ranges counted by analyzeUnicodeRanges (inclusive bounds). */
  static SCRIPT_RANGES = [
    { name: "cjk", from: 0x4e00, to: 0x9fff },
    { name: "hiragana", from: 0x3040, to: 0x309f },
    { name: "katakana", from: 0x30a0, to: 0x30ff },
    { name: "hangul", from: 0xac00, to: 0xd7af },
    { name: "cyrillic", from: 0x0400, to: 0x04ff },
    { name: "arabic", from: 0x0600, to: 0x06ff }
  ];
  logger;
  cache = new Map;
  /**
   * @param logger2 Logger exposing `info`, `warn` and `error`.
   */
  constructor(logger2) {
    this.logger = logger2;
  }
  /**
   * Detects the language of `text`.
   *
   * Never rejects: invalid input or an unexpected failure is logged and
   * answered with `{ code: "und", locale: "und", confidence: 0.1 }`.
   *
   * @param text Text to classify.
   * @returns Detection result with `code`, `locale`, `confidence` and, when
   *   the winning locale carries them, `script` and `region`.
   */
  async detectLanguage(text) {
    try {
      if (!text || typeof text !== "string") {
        throw new LanguageDetectionError("Invalid input text", "und");
      }
      // The key must cover everything the analysis reads; a shorter key
      // would hand one text's result to another text with the same opening.
      const cacheKey = text.slice(0, UniversalLanguageDetector.SAMPLE_LENGTH);
      if (this.cache.has(cacheKey)) {
        return this.cache.get(cacheKey);
      }
      const result = await this.performDetection(text);
      if (this.cache.size >= UniversalLanguageDetector.MAX_CACHE_ENTRIES) {
        // Map iterates in insertion order, so the first key is the oldest.
        this.cache.delete(this.cache.keys().next().value);
      }
      this.cache.set(cacheKey, result);
      this.logger.info(`Language detected: ${result.code} (confidence: ${result.confidence})`);
      return result;
    } catch (error) {
      this.logger.error("Language detection failed", error);
      return {
        code: "und",
        locale: "und",
        confidence: 0.1
      };
    }
  }
  /**
   * Scores every candidate locale and returns the best one.
   *
   * A candidate replaces the current best only when its combined score is
   * strictly higher, so earlier (more script-relevant) candidates win ties.
   * The starting best is "und" with confidence 0.3.
   */
  async performDetection(text) {
    const unicodeHints = this.analyzeUnicodeRanges(text);
    const candidates = this.prioritizeLocalesBasedOnUnicode(unicodeHints);
    let bestMatch = {
      code: "und",
      locale: "und",
      confidence: 0.3
    };
    for (const candidate of candidates) {
      try {
        const intlScore = await this.testLocaleWithIntlAPIs(text, candidate);
        const unicodeBoost = this.getUnicodeBoost(candidate, unicodeHints);
        const combinedScore = Math.min(1, intlScore + unicodeBoost);
        if (combinedScore > bestMatch.confidence) {
          const parsed = new Intl.Locale(candidate);
          bestMatch = {
            code: parsed.language,
            locale: candidate,
            confidence: combinedScore,
            script: parsed.script,
            region: parsed.region
          };
        }
      } catch (error) {
        // A candidate the runtime cannot parse is skipped, not fatal.
        continue;
      }
    }
    return bestMatch;
  }
  /**
   * Counts how the sampled characters distribute over the known scripts.
   *
   * Printable ASCII (32-126, which includes spaces, digits and punctuation)
   * is counted as "latin". Characters outside every range are ignored.
   *
   * @returns Map from range name to its share (0..1) of the counted characters.
   */
  analyzeUnicodeRanges(text) {
    const sample = text.slice(0, UniversalLanguageDetector.SAMPLE_LENGTH);
    const counts = new Map;
    let totalRelevantChars = 0;
    const bump = (name) => {
      counts.set(name, (counts.get(name) || 0) + 1);
      totalRelevantChars++;
    };
    for (const char of sample) {
      const codePoint = char.codePointAt(0);
      const script = UniversalLanguageDetector.SCRIPT_RANGES.find((range) => codePoint >= range.from && codePoint <= range.to);
      if (script) {
        bump(script.name);
      } else if (codePoint >= 32 && codePoint <= 126) {
        bump("latin");
      }
    }
    const percentages = new Map;
    for (const [name, count] of counts) {
      percentages.set(name, totalRelevantChars > 0 ? count / totalRelevantChars : 0);
    }
    return percentages;
  }
  /**
   * Returns the candidate locales ordered by descending script relevance.
   */
  prioritizeLocalesBasedOnUnicode(unicodeHints) {
    const candidates = this.getAvailableTestLocales(unicodeHints);
    return candidates.sort((a, b) => this.getUnicodeRelevanceScore(b, unicodeHints) - this.getUnicodeRelevanceScore(a, unicodeHints));
  }
  /**
   * Builds the list of locales worth testing: the system locale and its bare
   * language, locales implied by the scripts that were seen, and always
   * "en-US" / "en".
   */
  getAvailableTestLocales(unicodeHints) {
    const locales = new Set;
    if (typeof Intl !== "undefined" && Intl.DateTimeFormat) {
      try {
        const systemLocale = Intl.DateTimeFormat().resolvedOptions().locale;
        locales.add(systemLocale);
        const parsed = new Intl.Locale(systemLocale);
        if (parsed.language) {
          locales.add(parsed.language);
        }
      } catch (error) {
        // Best effort: without a resolvable system locale the script-based
        // candidates below are still used.
      }
    }
    const seen = (range) => unicodeHints.has(range);
    const addAll = (...tags) => tags.forEach((tag) => locales.add(tag));
    if (seen("cjk")) {
      addAll("zh-Hant", "zh-Hans");
    }
    if (seen("hiragana") || seen("katakana")) {
      addAll("ja-JP", "ja");
    }
    if (seen("hangul")) {
      addAll("ko-KR", "ko");
    }
    if (seen("cyrillic")) {
      addAll("ru-RU", "ru");
    }
    if (seen("arabic")) {
      addAll("ar-SA", "ar");
    }
    if (seen("latin")) {
      addAll("en-US", "en", "fr-FR", "de-DE", "es-ES");
    }
    addAll("en-US", "en");
    return Array.from(locales);
  }
  /**
   * Scores how well `locale` matches the observed script distribution.
   *
   * The score is keyed on the language subtag so that "ja" and "ja-JP" (or a
   * system locale such as "en-GB") are treated like their region-tagged
   * siblings instead of falling through to 0.
   */
  getUnicodeRelevanceScore(locale, unicodeHints) {
    const share = (range) => unicodeHints.get(range) || 0;
    let language;
    try {
      language = new Intl.Locale(locale).language;
    } catch (error) {
      // Not a parseable tag: fall back to the text before the first separator.
      language = String(locale).split(/[-_]/)[0].toLowerCase();
    }
    switch (language) {
      case "ja":
        return share("hiragana") * 10 + share("katakana") * 10 + share("cjk") * 2;
      case "ko":
        return share("hangul") * 10 + share("cjk") * 1;
      case "zh":
        // Kana or hangul alongside ideographs argues against Chinese.
        return share("cjk") * 5 - (share("hiragana") + share("katakana") + share("hangul")) * 2;
      case "ru":
        return share("cyrillic") * 8;
      case "ar":
        return share("arabic") * 8;
      case "en":
      case "fr":
      case "de":
      case "es":
        return share("latin") * 3;
      default:
        return 0;
    }
  }
  /**
   * Converts the relevance score into a confidence bonus capped at 0.4.
   */
  getUnicodeBoost(locale, unicodeHints) {
    const relevanceScore = this.getUnicodeRelevanceScore(locale, unicodeHints);
    return Math.min(0.4, relevanceScore * 0.1);
  }
  /**
   * Probes the `Intl` APIs with `locale` and returns a score in 0..1.
   *
   * Contributions: up to 0.4 from word segmentation density, 0.3 when the
   * collator can sort a handful of sampled characters, 0.2 when a display
   * name resolves, and 0.1 when a RelativeTimeFormat can be constructed.
   * A failure in the segmenter or collator probe is logged and yields 0.
   */
  async testLocaleWithIntlAPIs(text, locale) {
    let score = 0;
    const sample = text.slice(0, UniversalLanguageDetector.SAMPLE_LENGTH);
    const hasIntl = typeof Intl !== "undefined";
    try {
      if (hasIntl && Intl.Segmenter) {
        const segmenter = new Intl.Segmenter(locale, { granularity: "word" });
        const segments = Array.from(segmenter.segment(sample));
        const segmentQuality = segments.length > 0 ? Math.min(1, segments.length / (sample.length / 10)) : 0;
        score += segmentQuality * 0.4;
      }
      if (hasIntl && Intl.Collator) {
        const collator = new Intl.Collator(locale, { sensitivity: "base" });
        const testChars = Array.from(sample).slice(0, 10);
        if (testChars.length > 1) {
          const sorted = testChars.sort(collator.compare);
          score += sorted.length > 0 ? 0.3 : 0;
        }
      }
      if (hasIntl && Intl.DisplayNames) {
        try {
          const displayNames = new Intl.DisplayNames([locale], {
            type: "language"
          });
          const langCode = new Intl.Locale(locale).language;
          const displayName = displayNames.of(langCode);
          score += displayName ? 0.2 : 0;
        } catch (displayError) {
          // Optional probe: a missing display name just earns no points.
        }
      }
      if (hasIntl && Intl.RelativeTimeFormat) {
        try {
          const rtf = new Intl.RelativeTimeFormat(locale);
          score += rtf ? 0.1 : 0;
        } catch (rtfError) {
          // Optional probe: an unsupported locale just earns no points.
        }
      }
    } catch (error) {
      this.logger.warn(`Testing locale ${locale} failed:`, error);
      return 0;
    }
    return Math.min(1, score);
  }
}
4551
+
4552
/**
 * Scores sentences by position, length deviation and word rarity so that a
 * compressor can decide which ones to keep.
 */
class IntelligentImportanceAnalyzer {
  wordFrequencyCache = new Map;
  logger;
  config;
  /**
   * @param logger2 Logger exposing `info`, `warn` and `error`.
   * @param config Object providing `maxSentencesForAnalysis` and
   *   `fastModeMaxSentences`.
   */
  constructor(logger2, config) {
    this.logger = logger2;
    this.config = config;
  }
  /**
   * Scores each sentence of `text`.
   *
   * Texts longer than 50000 characters are delegated to
   * fastAnalyzeImportance. Otherwise at most
   * `config.maxSentencesForAnalysis` sentences are scored as the product of
   * the position, length-deviation and word-rarity factors, rounded to two
   * decimals.
   *
   * @param text Text to analyse.
   * @param detectedLanguage Optional locale used for sentence segmentation.
   * @returns Array of `{ text, score, reasons }` in document order.
   */
  async analyzeImportance(text, detectedLanguage) {
    if (text.length > 50000) {
      return this.fastAnalyzeImportance(text, detectedLanguage);
    }
    // slice(0, undefined) keeps everything, so a missing limit no longer
    // collapses the sentence list to nothing.
    const sentences = this.segmentSentences(text, detectedLanguage).slice(0, this.config.maxSentencesForAnalysis);
    if (sentences.length === 0) {
      return [];
    }
    const wordFrequencies = await this.calculateWordFrequencies(text);
    // Summed once here instead of once per sentence.
    const totalWords = this.sumFrequencies(wordFrequencies);
    const avgSentenceLength = sentences.reduce((sum, s) => sum + s.length, 0) / sentences.length;
    return sentences.map((sentence, index) => {
      const reasons = [];
      let score = 1;
      score *= this.analyzePosition(index, sentences.length, reasons);
      score *= this.analyzeLengthDeviation(sentence, avgSentenceLength, reasons);
      score *= this.analyzeWordRarity(sentence, wordFrequencies, reasons, totalWords);
      return {
        text: sentence.trim(),
        score: Math.round(score * 100) / 100,
        reasons
      };
    });
  }
  /**
   * Cheap scoring based on position and length only.
   *
   * Considers at most `config.fastModeMaxSentences` sentences: the first
   * three score 2, the last three 1.8, other sentences longer than 200
   * characters 1.3, everything else 1.
   */
  async fastAnalyzeImportance(text, detectedLanguage) {
    const sentences = this.segmentSentences(text, detectedLanguage).slice(0, this.config.fastModeMaxSentences);
    const count = sentences.length;
    return sentences.map((sentence, index) => {
      const reasons = [];
      let score = 1;
      if (index < 3) {
        score = 2;
        reasons.push("first-sentences");
      } else if (index >= count - 3) {
        score = 1.8;
        reasons.push("last-sentences");
      } else if (sentence.length > 200) {
        score = 1.3;
        reasons.push("long-sentence");
      }
      return {
        text: sentence.trim(),
        score,
        reasons
      };
    });
  }
  /**
   * Splits `text` into trimmed sentences longer than 10 characters.
   *
   * Uses `Intl.Segmenter` (locale defaults to "en") when available and falls
   * back to splitting on runs of `.`, `!` and `?`.
   */
  segmentSentences(text, locale) {
    try {
      if (typeof Intl !== "undefined" && Intl.Segmenter) {
        const segmenter = new Intl.Segmenter(locale || "en", {
          granularity: "sentence"
        });
        return Array.from(segmenter.segment(text), (part) => part.segment.trim()).filter((s) => s.length > 10);
      }
    } catch (error) {
      // Best effort: an unsupported locale falls through to the regex split.
    }
    return text.split(/[.!?]+/).map((s) => s.trim()).filter((s) => s.length > 10);
  }
  /**
   * Position factor: 2 for the first, last or only sentence, 1.5 within
   * three sentences of either end, otherwise 1.
   */
  analyzePosition(index, totalSentences, reasons) {
    if (totalSentences === 1)
      return 2;
    if (index === 0) {
      reasons.push("first-sentence");
      return 2;
    }
    if (index === totalSentences - 1) {
      reasons.push("last-sentence");
      return 2;
    }
    if (index < 3 || index >= totalSentences - 3) {
      reasons.push("near-boundary");
      return 1.5;
    }
    return 1;
  }
  /**
   * Length factor: 1.3 for sentences more than 80% longer than average,
   * 1.2 for sentences more than 80% shorter that still exceed 20 characters,
   * otherwise 1.
   */
  analyzeLengthDeviation(sentence, avgLength, reasons) {
    if (avgLength === 0)
      return 1;
    const length = sentence.length;
    const deviation = Math.abs(length - avgLength) / avgLength;
    if (deviation > 0.8) {
      if (length > avgLength) {
        reasons.push("unusually-long");
        return 1.3;
      } else if (length > 20) {
        reasons.push("unusually-short");
        return 1.2;
      }
    }
    return 1;
  }
  /**
   * Rarity factor: 1.3 when more than a quarter of the sentence's words have
   * a relative frequency below 0.5%, 0.7 when more than 80% of them exceed
   * 5%, otherwise 1.
   *
   * @param sentence Sentence to inspect.
   * @param wordFreqs Map from lower-cased word to occurrence count.
   * @param reasons Array that receives the reason tag, if any.
   * @param totalWords Optional precomputed sum of all counts in `wordFreqs`.
   */
  analyzeWordRarity(sentence, wordFreqs, reasons, totalWords = this.sumFrequencies(wordFreqs)) {
    const words = this.tokenizeWords(sentence);
    if (words.length === 0)
      return 1;
    let rareWordCount = 0;
    let veryCommonCount = 0;
    for (const word of words) {
      const relativeFreq = (wordFreqs.get(word) || 0) / totalWords;
      if (relativeFreq < 0.005) {
        rareWordCount++;
      } else if (relativeFreq > 0.05) {
        veryCommonCount++;
      }
    }
    if (rareWordCount / words.length > 0.25) {
      reasons.push("has-rare-words");
      return 1.3;
    }
    if (veryCommonCount / words.length > 0.8) {
      reasons.push("mostly-common-words");
      return 0.7;
    }
    return 1;
  }
  /**
   * Counts word occurrences in the first 10000 characters of `text`.
   *
   * Results are cached under exactly the text that was counted, so two
   * inputs that merely share an opening never receive each other's counts.
   * On failure the error is logged and an empty Map is returned.
   *
   * @returns Map from lower-cased word to occurrence count.
   */
  async calculateWordFrequencies(text) {
    try {
      const processText = text.length > 1e4 ? text.substring(0, 1e4) : text;
      if (this.wordFrequencyCache.has(processText)) {
        return this.wordFrequencyCache.get(processText);
      }
      const freq = new Map;
      for (const word of this.tokenizeWords(processText)) {
        freq.set(word, (freq.get(word) || 0) + 1);
      }
      if (this.wordFrequencyCache.size > 10) {
        this.wordFrequencyCache.clear();
      }
      this.wordFrequencyCache.set(processText, freq);
      return freq;
    } catch (error) {
      this.logger.error("Word frequency calculation failed:", error);
      return new Map;
    }
  }
  /**
   * Lower-cases `text` and returns its whitespace-separated words longer
   * than two characters. Letters, combining marks, digits and `_` from any
   * script are kept; everything else is treated as a separator.
   */
  tokenizeWords(text) {
    return text.toLowerCase().replace(/[^\p{L}\p{M}\p{N}_\s]/gu, " ").split(/\s+/).filter((w) => w.length > 2);
  }
  /** Sums all occurrence counts in a frequency map. */
  sumFrequencies(wordFreqs) {
    let total = 0;
    for (const count of wordFreqs.values()) {
      total += count;
    }
    return total;
  }
}
4719
+
4720
/**
 * Shrinks text to a token budget by keeping its highest-scoring sentences.
 *
 * Code fragments can be swapped for placeholders before scoring and are put
 * back afterwards. Token counts are estimates (see estimateTokens).
 */
class TextCompressionUtility {
  languageDetector;
  importanceAnalyzer;
  logger;
  config;
  /**
   * @param config Partial configuration merged over DEFAULT_CONFIG.
   */
  constructor(config = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.logger = ModalityLogger.getInstance("TextCompression", this.config.enableLogging ? "info" : "error");
    this.languageDetector = new UniversalLanguageDetector(this.logger);
    this.importanceAnalyzer = new IntelligentImportanceAnalyzer(this.logger, this.config);
  }
  /**
   * Compresses `text` to roughly `maxTokens` estimated tokens.
   *
   * Text that is empty, shorter than 10 characters, or already within budget
   * is returned (trimmed) without compression.
   *
   * @param text Text to compress.
   * @param options Per-call overrides: `maxTokens`, `preserveCodeBlocks`,
   *   `compressionLevel`, `autoDetectLanguage`, `prioritizeFirst`,
   *   `prioritizeLast`, `bufferPercentage`, `maxSentences`, `fastMode`.
   * @returns Object with `compressedText`, `originalLength`,
   *   `compressedLength`, `compressionRatio`, `tokensEstimate`,
   *   `detectedLanguage` and, when sentence scoring ran, `importanceScores`.
   * @throws CompressionError with code "INVALID_MAX_TOKENS" when the
   *   effective `maxTokens` is not a number greater than 0.
   */
  async compress(text, options = {}) {
    if (!text || typeof text !== "string") {
      return {
        compressedText: "",
        originalLength: 0,
        compressedLength: 0,
        compressionRatio: 1,
        tokensEstimate: 0,
        detectedLanguage: "und"
      };
    }
    const maxTokens = options.maxTokens ?? this.config.maxTokens;
    const preserveCodeBlocks = options.preserveCodeBlocks ?? this.config.preserveCodeBlocks;
    const compressionLevel = options.compressionLevel ?? this.config.compressionLevel;
    const autoDetectLanguage = options.autoDetectLanguage ?? this.config.autoDetectLanguage;
    const {
      prioritizeFirst = true,
      prioritizeLast = true,
      bufferPercentage = 10,
      maxSentences = 500,
      fastMode = false
    } = options;
    // Written as a negated comparison so NaN and undefined are rejected too.
    if (!(maxTokens > 0)) {
      throw new CompressionError("maxTokens must be greater than 0", "INVALID_MAX_TOKENS");
    }
    const trimmedText = text.trim();
    const originalLength = trimmedText.length;
    if (originalLength < 10) {
      return {
        compressedText: trimmedText,
        originalLength,
        compressedLength: originalLength,
        compressionRatio: 1,
        tokensEstimate: Math.max(1, Math.ceil(originalLength / 4)),
        detectedLanguage: "und"
      };
    }
    let detectedLanguage;
    if (autoDetectLanguage) {
      try {
        const detection = await this.languageDetector.detectLanguage(trimmedText);
        detectedLanguage = detection.code;
      } catch (error) {
        this.logger.warn("Language detection failed, using fallback:", error);
        detectedLanguage = "und";
      }
    }
    const initialTokens = this.estimateTokens(trimmedText);
    if (initialTokens <= maxTokens) {
      return {
        compressedText: trimmedText,
        originalLength,
        compressedLength: trimmedText.length,
        compressionRatio: 1,
        tokensEstimate: initialTokens,
        detectedLanguage
      };
    }
    const { text: textWithoutCode, codeMap } = preserveCodeBlocks ? this.extractCodeElements(trimmedText) : { text: trimmedText, codeMap: new Map };
    if (textWithoutCode.trim().length < 10) {
      // Practically nothing but code: there are no sentences to rank.
      return this.buildResult(this.trimToTokenLimit(trimmedText, maxTokens), originalLength, detectedLanguage);
    }
    let importanceScores;
    try {
      importanceScores = fastMode || textWithoutCode.length > 50000 ? await this.importanceAnalyzer.fastAnalyzeImportance(textWithoutCode, detectedLanguage) : await this.importanceAnalyzer.analyzeImportance(textWithoutCode, detectedLanguage);
      if (prioritizeFirst || prioritizeLast) {
        importanceScores = this.applyUserPriorities(importanceScores, prioritizeFirst, prioritizeLast);
      }
      if (maxSentences && importanceScores.length > maxSentences) {
        // Keep the top scorers but leave them in document order, which
        // applyCompression relies on when it assembles the output.
        const keep = new Set([...importanceScores].sort((a, b) => b.score - a.score).slice(0, maxSentences));
        importanceScores = importanceScores.filter((item) => keep.has(item));
      }
    } catch (error) {
      this.logger.warn("Importance analysis failed, using fallback:", error);
      importanceScores = this.segmentSentences(textWithoutCode).map((sentence, index) => ({
        text: sentence.trim(),
        score: index < 3 ? 2 : 1,
        reasons: index < 3 ? ["first-sentences-fallback"] : ["fallback"]
      }));
    }
    if (importanceScores.length === 0) {
      return this.buildResult(this.trimToTokenLimit(trimmedText, maxTokens), originalLength, detectedLanguage);
    }
    let compressed;
    try {
      compressed = this.applyCompression(importanceScores, maxTokens, compressionLevel, codeMap, bufferPercentage);
    } catch (error) {
      this.logger.warn("Compression failed, using fallback:", error);
      compressed = this.trimToTokenLimit(trimmedText, maxTokens);
    }
    if (compressed.trim().length === 0) {
      compressed = this.trimToTokenLimit(trimmedText, Math.min(maxTokens, 100));
    }
    return this.buildResult(compressed, originalLength, detectedLanguage, importanceScores);
  }
  /**
   * Assembles the result object. An empty output reports a ratio of 1
   * rather than dividing by zero.
   */
  buildResult(compressedText, originalLength, detectedLanguage, importanceScores) {
    const compressedLength = compressedText.length;
    const result = {
      compressedText,
      originalLength,
      compressedLength,
      compressionRatio: compressedLength > 0 ? originalLength / compressedLength : 1,
      tokensEstimate: this.estimateTokens(compressedText),
      detectedLanguage
    };
    if (importanceScores !== undefined) {
      result.importanceScores = importanceScores;
    }
    return result;
  }
  /**
   * Replaces fenced code blocks, inline code spans and empty-argument calls
   * such as `foo()` with numbered placeholders.
   *
   * @returns `{ text, codeMap }` where `codeMap` maps each placeholder to
   *   the source text it replaced, in extraction order.
   */
  extractCodeElements(text) {
    const codeMap = new Map;
    let counter = 0;
    const stash = (label) => (match) => {
      const placeholder = `__${label}_${counter++}__`;
      codeMap.set(placeholder, match);
      return placeholder;
    };
    const result = text
      .replace(/```[\s\S]*?```/g, stash("CODE_BLOCK"))
      .replace(/`[^`\n]+`/g, stash("INLINE_CODE"))
      .replace(/\b[a-zA-Z_][a-zA-Z0-9_]*\(\)/g, stash("FUNC_CALL"));
    return { text: result, codeMap };
  }
  /**
   * Boosts the first three sentences by 1.5 and/or the last three by 1.4
   * and records the matching reason tag. Returns new objects.
   */
  applyUserPriorities(importanceScores, prioritizeFirst, prioritizeLast) {
    const tailStart = importanceScores.length - 3;
    return importanceScores.map((item, index) => {
      const reasons = [...item.reasons];
      let score = item.score;
      if (prioritizeFirst && index < 3) {
        score *= 1.5;
        reasons.push("user-prioritize-first");
      }
      if (prioritizeLast && index >= tailStart) {
        score *= 1.4;
        reasons.push("user-prioritize-last");
      }
      return { ...item, score, reasons };
    });
  }
  /**
   * Picks sentences by descending score until the buffered budget is used,
   * then emits the picks in their input (document) order and restores code
   * placeholders.
   *
   * @param importanceScores Scored sentences in document order.
   * @param maxTokens Hard token budget for the returned text.
   * @param level "light", "moderate" or "aggressive".
   * @param codeMap Placeholder → code map from extractCodeElements.
   * @param bufferPercentage Share of the budget held back while selecting.
   * @returns The compressed text (possibly empty).
   */
  applyCompression(importanceScores, maxTokens, level, codeMap, bufferPercentage = 10) {
    const ranked = importanceScores.map((item, position) => ({ item, position })).sort((a, b) => b.item.score - a.item.score);
    const threshold = this.getCompressionThreshold(ranked.map((entry) => entry.item), level);
    const bufferMultiplier = (100 - bufferPercentage) / 100;
    const effectiveMaxTokens = Math.floor(maxTokens * bufferMultiplier);
    const selected = [];
    let currentTokens = 0;
    for (const entry of ranked) {
      if (!(entry.item.score >= threshold)) {
        continue;
      }
      const sentenceTokens = this.estimateTokens(entry.item.text);
      if (currentTokens + sentenceTokens <= effectiveMaxTokens) {
        selected.push(entry);
        currentTokens += sentenceTokens;
      }
    }
    // Selection is by score; presentation follows the original order.
    selected.sort((a, b) => a.position - b.position);
    let result = selected.map((entry) => this.terminateSentence(entry.item.text)).filter((s) => s.length > 0).join(" ");
    // Restore newest placeholders first: a later placeholder can wrap an
    // earlier one. split/join inserts the code literally, whereas a
    // replacement string would expand sequences such as "$$" or "$&".
    for (const [placeholder, code] of [...codeMap].reverse()) {
      result = result.split(placeholder).join(code);
    }
    if (this.estimateTokens(result) > maxTokens) {
      result = this.trimToTokenLimit(result, maxTokens);
    }
    return result;
  }
  /**
   * Derives the minimum score a sentence needs for the given level from the
   * median and maximum of `sortedScores` (expected in descending order).
   */
  getCompressionThreshold(sortedScores, level) {
    if (sortedScores.length === 0)
      return 1;
    const scores = sortedScores.map((s) => s.score);
    const median = scores[Math.floor(scores.length / 2)];
    const max = Math.max(...scores);
    switch (level) {
      case "light":
        return Math.max(median * 0.8, 1);
      case "aggressive":
        return Math.max(max * 0.7, median * 1.3);
      case "moderate":
      default:
        return Math.max(median * 1.1, 1.2);
    }
  }
  /**
   * Splits `text` into trimmed sentences longer than 10 characters, using
   * `Intl.Segmenter` (locale defaults to "en") when available and a split on
   * runs of `.`, `!` and `?` otherwise.
   */
  segmentSentences(text, locale) {
    try {
      if (typeof Intl !== "undefined" && Intl.Segmenter) {
        const segmenter = new Intl.Segmenter(locale || "en", {
          granularity: "sentence"
        });
        return Array.from(segmenter.segment(text), (part) => part.segment.trim()).filter((s) => s.length > 10);
      }
    } catch (error) {
      // Best effort: an unsupported locale falls through to the regex split.
    }
    return text.split(/[.!?]+/).map((s) => s.trim()).filter((s) => s.length > 10);
  }
  /**
   * Keeps leading sentences of `text` while they fit into `maxTokens`.
   *
   * When not even the first sentence fits (or no sentence is found) the
   * text is cut by characters instead, so a non-empty input does not turn
   * into an empty result.
   */
  trimToTokenLimit(text, maxTokens) {
    const kept = [];
    let tokens = 0;
    for (const sentence of this.segmentSentences(text)) {
      const finished = this.terminateSentence(sentence);
      const sentenceTokens = this.estimateTokens(finished);
      if (tokens + sentenceTokens > maxTokens) {
        break;
      }
      kept.push(finished);
      tokens += sentenceTokens;
    }
    if (kept.length > 0) {
      return kept.join(" ");
    }
    return this.truncateToTokenLimit(text, maxTokens);
  }
  /**
   * Cuts `text` by characters until its estimated token count fits.
   * Starts from `maxTokens * 4` characters and halves while over budget.
   */
  truncateToTokenLimit(text, maxTokens) {
    let candidate = text.slice(0, Math.max(0, Math.floor(maxTokens * 4)));
    while (candidate.length > 0 && this.estimateTokens(candidate) > maxTokens) {
      candidate = candidate.slice(0, Math.floor(candidate.length / 2));
    }
    // Do not leave half of a surrogate pair at the cut.
    if (/[\uD800-\uDBFF]$/.test(candidate)) {
      candidate = candidate.slice(0, -1);
    }
    return candidate.trim();
  }
  /**
   * Returns the sentence with a trailing "." unless it already ends in
   * sentence punctuation (optionally followed by closing quotes/brackets).
   */
  terminateSentence(sentence) {
    const trimmed = sentence.trim();
    if (trimmed.length === 0) {
      return "";
    }
    return /[.!?。！？…]["'”’)\]]*$/.test(trimmed) ? trimmed : trimmed + ".";
  }
  /**
   * Estimates the token count of `text`.
   *
   * Samples the first 200 characters: when more than 30% are CJK
   * ideographs, kana or hangul syllables the estimate is 1.5 tokens per
   * UTF-16 code unit, otherwise one token per four code units.
   */
  estimateTokens(text) {
    let cjkCount = 0;
    let totalChars = 0;
    for (const char of text.slice(0, 200)) {
      const codePoint = char.codePointAt(0);
      totalChars++;
      if (codePoint >= 19968 && codePoint <= 40959 || codePoint >= 12352 && codePoint <= 12447 || codePoint >= 12448 && codePoint <= 12543 || codePoint >= 44032 && codePoint <= 55215) {
        cjkCount++;
      }
    }
    const cjkRatio = totalChars > 0 ? cjkCount / totalChars : 0;
    return cjkRatio > 0.3 ? Math.ceil(text.length * 1.5) : Math.ceil(text.length / 4);
  }
}
4983
/**
 * Convenience wrapper: compresses `text` with a freshly constructed
 * TextCompressionUtility using language auto-detection, code-block
 * preservation and the "moderate" compression level.
 *
 * @param text Text to compress.
 * @param maxTokens Token budget; defaults to `DEFAULT_CONFIG.maxTokens`.
 * @returns The result object produced by `TextCompressionUtility.compress`.
 */
async function compressWithLanguageDetection(text, maxTokens = DEFAULT_CONFIG.maxTokens) {
  const utility = new TextCompressionUtility();
  const presetOptions = {
    maxTokens,
    autoDetectLanguage: true,
    preserveCodeBlocks: true,
    compressionLevel: "moderate"
  };
  const outcome = await utility.compress(text, presetOptions);
  return outcome;
}
4286
4992
  export {
4287
4993
  withErrorHandling,
4288
4994
  setupAITools,
@@ -4291,6 +4997,7 @@ export {
4291
4997
  formatSuccessResponse,
4292
4998
  formatErrorResponse,
4293
4999
  emptySchema,
5000
+ compressWithLanguageDetection as compressText,
4294
5001
  exports_schemas_symbol as SymbolType,
4295
5002
  ErrorCode
4296
5003
  };
@@ -7,3 +7,4 @@ export * as SymbolType from "./schemas/schemas_symbol";
7
7
  export type { EmptyType } from "./schemas/schemas_empty";
8
8
  export { emptySchema } from "./schemas/schemas_empty";
9
9
  export { loadVersion } from "./util_version";
10
+ export { compressWithLanguageDetection as compressText } from "./util_text_compression";
@@ -0,0 +1,49 @@
1
+ /**
2
+ * Console Mock Utility
3
+ *
4
+ * Provides utilities for mocking console output during testing to keep test output clean.
5
+ * Can be used across multiple test files for consistent console mocking.
6
+ */
7
+ export declare class ConsoleMock {
8
+ private originalMethods;
9
+ private isMocked;
10
+ /**
11
+ * Mock all console methods to prevent output during tests
12
+ */
13
+ mock(): void;
14
+ /**
15
+ * Restore original console methods
16
+ */
17
+ restore(): void;
18
+ /**
19
+ * Check if console is currently mocked
20
+ */
21
+ get isActive(): boolean;
22
+ /**
23
+ * Temporarily restore console methods for debugging purposes
24
+ * Returns a function to re-mock console methods
25
+ */
26
+ temporaryRestore(): () => void;
27
+ }
28
+ export declare const consoleMock: ConsoleMock;
29
+ /**
30
+ * Convenience functions for common usage patterns
31
+ */
32
+ /**
33
+ * Setup console mocking for a test suite (use in beforeAll)
34
+ */
35
+ export declare function setupConsoleMock(): void;
36
+ /**
37
+ * Cleanup console mocking for a test suite (use in afterAll)
38
+ */
39
+ export declare function cleanupConsoleMock(): void;
40
+ /**
41
+ * Higher-order function to run a function with console temporarily restored
42
+ * Useful for debugging specific tests
43
+ */
44
+ export declare function withConsole<T>(fn: () => T): T;
45
+ /**
46
+ * Higher-order function to run an async function with console temporarily restored
47
+ * Useful for debugging specific async tests
48
+ */
49
+ export declare function withConsoleAsync<T>(fn: () => Promise<T>): Promise<T>;
@@ -1,6 +1,8 @@
1
+ import { ModalityLogger } from './util_logger.js';
2
+ import { ErrorCode } from './util_error.js';
1
3
  export interface CompressionConfig {
2
4
  maxTokens: number;
3
- compressionLevel: 'light' | 'moderate' | 'aggressive';
5
+ compressionLevel: "light" | "moderate" | "aggressive";
4
6
  preserveCodeBlocks: boolean;
5
7
  autoDetectLanguage: boolean;
6
8
  enableLogging: boolean;
@@ -10,7 +12,7 @@ export interface CompressionConfig {
10
12
  export declare const DEFAULT_CONFIG: CompressionConfig;
11
13
  export interface CompressionOptions {
12
14
  maxTokens?: number;
13
- compressionLevel?: 'light' | 'moderate' | 'aggressive';
15
+ compressionLevel?: "light" | "moderate" | "aggressive";
14
16
  preserveCodeBlocks?: boolean;
15
17
  autoDetectLanguage?: boolean;
16
18
  locale?: string;
@@ -23,7 +25,7 @@ export interface CompressionOptions {
23
25
  enableLogging?: boolean;
24
26
  sentenceSplitPattern?: RegExp;
25
27
  importanceWeights?: ImportanceWeights;
26
- tokenizationMethod?: 'simple' | 'advanced';
28
+ tokenizationMethod?: "simple" | "advanced";
27
29
  }
28
30
  export interface ImportanceWeights {
29
31
  position: number;
@@ -54,26 +56,20 @@ export interface LanguageDetectionResult {
54
56
  script?: string;
55
57
  region?: string;
56
58
  }
57
- export declare class CompressionError extends Error {
58
- code: string;
59
- details?: any | undefined;
60
- constructor(message: string, code: string, details?: any | undefined);
59
+ export declare class CompressionError extends ErrorCode {
60
+ readonly code: string;
61
+ details?: any;
62
+ constructor(message: string, code: string, details?: any, originalError?: unknown);
61
63
  }
62
- export declare class LanguageDetectionError extends Error {
64
+ export declare class LanguageDetectionError extends ErrorCode {
65
+ readonly code: string;
63
66
  fallbackLanguage: string;
64
- constructor(message: string, fallbackLanguage: string);
65
- }
66
- export declare class CompressionLogger {
67
- private enabled;
68
- constructor(enabled?: boolean);
69
- info(message: string, data?: any): void;
70
- warn(message: string, data?: any): void;
71
- error(message: string, error?: Error): void;
67
+ constructor(message: string, fallbackLanguage: string, originalError?: unknown);
72
68
  }
73
69
  export declare class UniversalLanguageDetector {
74
70
  private logger;
75
71
  private cache;
76
- constructor(logger: CompressionLogger);
72
+ constructor(logger: ModalityLogger);
77
73
  detectLanguage(text: string): Promise<LanguageDetectionResult>;
78
74
  private performDetection;
79
75
  private analyzeUnicodeRanges;
@@ -82,14 +78,12 @@ export declare class UniversalLanguageDetector {
82
78
  private getUnicodeRelevanceScore;
83
79
  private getUnicodeBoost;
84
80
  private testLocaleWithIntlAPIs;
85
- private enhanceWithTextAnalysis;
86
- private detectFromTextHeuristics;
87
81
  }
88
82
  export declare class IntelligentImportanceAnalyzer {
89
83
  private wordFrequencyCache;
90
84
  private logger;
91
85
  private config;
92
- constructor(logger: CompressionLogger, config: CompressionConfig);
86
+ constructor(logger: ModalityLogger, config: CompressionConfig);
93
87
  analyzeImportance(text: string, detectedLanguage?: string): Promise<Array<{
94
88
  text: string;
95
89
  score: number;
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "0.6.1",
2
+ "version": "0.6.3",
3
3
  "name": "modality-kit",
4
4
  "repository": {
5
5
  "type": "git",
@@ -32,7 +32,7 @@
32
32
  "build:src": "bun build src/index.ts --outdir dist",
33
33
  "build": "bun run build:clean && bun run build:src && bun run build:types",
34
34
  "test": "bun test",
35
- "prepublishOnly": "npm run test"
35
+ "prepublishOnly": "npm run build && npm run test"
36
36
  },
37
37
  "types": "./dist/types/index.d.ts",
38
38
  "files": [