codecritique 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1145 -0
- package/package.json +98 -0
- package/src/content-retrieval.js +747 -0
- package/src/custom-documents.js +597 -0
- package/src/embeddings/cache-manager.js +364 -0
- package/src/embeddings/constants.js +40 -0
- package/src/embeddings/database.js +921 -0
- package/src/embeddings/errors.js +208 -0
- package/src/embeddings/factory.js +447 -0
- package/src/embeddings/file-processor.js +851 -0
- package/src/embeddings/model-manager.js +337 -0
- package/src/embeddings/similarity-calculator.js +97 -0
- package/src/embeddings/types.js +113 -0
- package/src/feedback-loader.js +384 -0
- package/src/index.js +1418 -0
- package/src/llm.js +123 -0
- package/src/pr-history/analyzer.js +579 -0
- package/src/pr-history/bot-detector.js +123 -0
- package/src/pr-history/cli-utils.js +204 -0
- package/src/pr-history/comment-processor.js +549 -0
- package/src/pr-history/database.js +819 -0
- package/src/pr-history/github-client.js +629 -0
- package/src/project-analyzer.js +955 -0
- package/src/rag-analyzer.js +2764 -0
- package/src/rag-review.js +566 -0
- package/src/technology-keywords.json +753 -0
- package/src/utils/command.js +48 -0
- package/src/utils/constants.js +263 -0
- package/src/utils/context-inference.js +364 -0
- package/src/utils/document-detection.js +105 -0
- package/src/utils/file-validation.js +271 -0
- package/src/utils/git.js +232 -0
- package/src/utils/language-detection.js +170 -0
- package/src/utils/logging.js +24 -0
- package/src/utils/markdown.js +132 -0
- package/src/utils/mobilebert-tokenizer.js +141 -0
- package/src/utils/pr-chunking.js +276 -0
- package/src/utils/string-utils.js +28 -0
- package/src/zero-shot-classifier-open.js +392 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language Detection Module
|
|
3
|
+
*
|
|
4
|
+
* This module provides utilities for detecting programming languages
|
|
5
|
+
* and file types from file extensions and content analysis.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import path from 'path';
|
|
9
|
+
import { EXTENSION_TO_LANGUAGE_MAP, ALL_SUPPORTED_EXTENSIONS } from './constants.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Detect programming language from file extension
 *
 * @param {string} extension - File extension (with or without the leading dot)
 * @returns {string} Detected language, or 'unknown' if the extension is
 *   missing or not supported (note: this function never returns null)
 *
 * @example
 * const language = detectLanguageFromExtension('.ts');
 * // Returns: 'typescript'
 */
export function detectLanguageFromExtension(extension) {
  // Robustness: treat missing/non-string input as unknown instead of throwing.
  if (!extension || typeof extension !== 'string') {
    return 'unknown';
  }

  // Normalize to lowercase with a leading dot, without mutating the parameter.
  const lowered = extension.toLowerCase();
  const normalizedExt = lowered.startsWith('.') ? lowered : `.${lowered}`;

  // Check if the extension is supported
  if (!ALL_SUPPORTED_EXTENSIONS.includes(normalizedExt)) {
    return 'unknown';
  }

  // Use the centralized extension-to-language mapping from constants
  return EXTENSION_TO_LANGUAGE_MAP[normalizedExt] || 'unknown';
}
|
|
38
|
+
|
|
39
|
+
/**
 * Detect file type and framework from file path and content
 *
 * @param {string} filePath - Path to the file
 * @param {string} content - Content of the file (optional); when empty, only
 *   name/extension-based detection runs and `framework` stays null
 * @returns {Object} File type information: `path`, `extension`, `language`,
 *   `type`, `framework`, `isConfig`, `isTest`, `isTypeDefinition`
 *   (plus `isHook`/`isComponent` when React hook usage is detected)
 *
 * @example
 * const fileInfo = detectFileType('src/components/Button.tsx', 'import React from "react"');
 * // Returns: { path: '...', extension: '.tsx', language: 'typescript', framework: 'react', ... }
 */
export function detectFileType(filePath, content = '') {
  // Get file extension and base name
  const extension = path.extname(filePath);
  const baseName = path.basename(filePath);

  // Detect language from extension
  const language = detectLanguageFromExtension(extension);

  // Initialize result object
  const result = {
    path: filePath,
    extension,
    language,
    type: 'unknown',
    framework: null,
    isConfig: false,
    isTest: false,
    isTypeDefinition: false,
  };

  // Detect file type based on name patterns
  if (baseName.endsWith('.d.ts')) {
    result.type = 'type-definition';
    result.isTypeDefinition = true;
  } else if (baseName.match(/\.test\.|\.spec\.|_test\.|_spec\.|^test.*\.|^spec.*\./)) {
    // Covers suffix style (foo.test.js, foo_spec.rb) and prefix style (test_foo.py)
    result.type = 'test';
    result.isTest = true;
  } else if (baseName.match(/conf|settings|\.rc$/)) {
    // 'conf' already matches 'config', so a separate 'config' alternative is redundant
    result.type = 'config';
    result.isConfig = true;
  } else if (language) {
    result.type = language;
  }

  // If content is provided, perform deeper analysis
  if (content && content.length > 0) {
    // Detect React
    if (
      extension === '.jsx' ||
      extension === '.tsx' ||
      content.includes('import React') ||
      content.includes('from "react"') ||
      content.includes("from 'react'")
    ) {
      result.framework = 'react';

      // Check for specific React patterns (hook/component flags only exist
      // when hook usage is present — callers should not rely on them otherwise)
      if (content.includes('useState') || content.includes('useEffect') || content.includes('useContext')) {
        result.isHook = content.match(/^\s*function\s+use[A-Z]/m) !== null;
        result.isComponent = content.match(/^\s*function\s+[A-Z]/m) !== null || content.match(/^\s*const\s+[A-Z]\w+\s*=\s*\(/m) !== null;
      }
    }

    // Detect Vue
    else if (extension === '.vue' || (content.includes('<template>') && content.includes('<script>'))) {
      result.framework = 'vue';
    }

    // Detect Angular
    else if (
      content.includes('@Component') ||
      content.includes('@NgModule') ||
      content.includes('from "@angular/core"') ||
      content.includes("from '@angular/core'")
    ) {
      result.framework = 'angular';
    }

    // Detect Express.js
    else if (
      content.includes('express()') ||
      content.includes('require("express")') ||
      content.includes("require('express')") ||
      content.includes('from "express"') ||
      content.includes("from 'express'")
    ) {
      result.framework = 'express';
    }

    // Detect Next.js
    else if (
      content.includes('from "next"') ||
      content.includes("from 'next'") ||
      content.includes('next/app') ||
      content.includes('next/document')
    ) {
      result.framework = 'nextjs';
    }

    // Detect Django (Python)
    else if (language === 'python' && (content.includes('from django') || content.includes('import django'))) {
      result.framework = 'django';
    }

    // Detect Flask (Python)
    else if (language === 'python' && (content.includes('from flask import') || content.includes('import flask'))) {
      result.framework = 'flask';
    }

    // Detect Rails (Ruby)
    else if (language === 'ruby' && (content.includes('Rails') || content.includes('ActiveRecord'))) {
      result.framework = 'rails';
    }

    // Detect Spring (Java)
    else if (
      language === 'java' &&
      (content.includes('@Controller') ||
        content.includes('@Service') ||
        content.includes('@Repository') ||
        content.includes('@SpringBootApplication'))
    ) {
      result.framework = 'spring';
    }
  }

  return result;
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Logging Module
|
|
3
|
+
*
|
|
4
|
+
* This module provides debugging and logging utilities with support
|
|
5
|
+
* for environment-based and command-line argument-based log level control.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import chalk from 'chalk';
|
|
9
|
+
|
|
10
|
+
/**
 * Debug function for conditional logging based on environment variables and command line arguments
 *
 * @param {string} message - Debug message to log
 *
 * @example
 * debug('Processing file: example.js');
 * // Only logs if DEBUG=true, VERBOSE=true, or --verbose flag is present
 */
export function debug(message) {
  // Environment variables are strings, so compare against 'true' explicitly:
  // the previous truthiness check (`process.env.DEBUG || false`) treated
  // DEBUG=false and DEBUG=0 as enabled, contradicting the documented behavior
  // and the VERBOSE check below.
  const isEnabled =
    process.env.DEBUG === 'true' ||
    process.env.VERBOSE === 'true' ||
    process.argv.includes('--verbose');

  if (isEnabled) {
    console.log(chalk.cyan(`[DEBUG] ${message}`));
  }
}
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown Processing Module
|
|
3
|
+
*
|
|
4
|
+
* This module provides utilities for processing markdown content,
|
|
5
|
+
* including chunk extraction, heading analysis, and content parsing.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import path from 'path';
|
|
9
|
+
|
|
10
|
+
/**
 * Extracts chunks from Markdown content based on H2 and H3 headings,
 * and also extracts the first H1 heading as the document title.
 *
 * @param {string} filePath - The absolute path to the file
 * @param {string} content - The Markdown content of the file
 * @param {string} relativePath - The relative path of the file
 * @returns {Object} An object containing `chunks` (Array) and `documentH1` (string|null).
 *   Each chunk object contains:
 *   `content`, `heading` (H2/H3 text),
 *   `original_document_path`, `start_line_in_doc`, `language`.
 *   If no H1 is found in the first few lines, `documentH1` falls back to the
 *   file's base name without its extension.
 *
 * @example
 * const result = extractMarkdownChunks('/path/to/file.md', '# Title\n## Section\nContent...', 'docs/file.md');
 * // Returns: { chunks: [{ content: '...', heading: 'Section', ... }], documentH1: 'Title' }
 */
export function extractMarkdownChunks(filePath, content, relativePath) {
  const chunks = [];
  let documentH1 = null;
  if (!content || typeof content !== 'string') return { chunks, documentH1 };

  const lines = content.split('\n');
  let currentChunkLines = [];
  let currentH2H3Heading = null; // Stores the H2 or H3 heading for the current chunk
  let chunkStartLine = 1;
  let inCodeBlock = false;
  let h1Found = false;
  let linesScannedForH1 = 0; // Only the first few lines are scanned for the title

  // (?!#) prevents '##'/'###' lines from being mistaken for an H1 title
  // (the previous /^#\s*(.*)/ matched '## Section' and captured '# Section').
  // Zero or more spaces after '#' are allowed, so '#Title' still matches.
  const h1Regex = /^#(?!#)\s*(.*)/;
  const h2h3Regex = /^(##|###)\s+(.*)/; // Regex for H2 or H3

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const trimmedLine = line.trim();

    if (trimmedLine.startsWith('```')) {
      inCodeBlock = !inCodeBlock;
    }

    // Look for the document H1 within the first few content lines,
    // skipping anything inside a fenced code block.
    if (!h1Found && !inCodeBlock && linesScannedForH1 < 5) {
      linesScannedForH1++;
      const h1Match = trimmedLine.match(h1Regex);
      if (h1Match) {
        documentH1 = h1Match[1].trim();
        h1Found = true;
      }
    }

    const h2h3Match = !inCodeBlock && trimmedLine.match(h2h3Regex);

    if (h2h3Match) {
      // Found an H2 or H3 heading, finalize the previous chunk if it has content
      if (currentChunkLines.length > 0 && currentChunkLines.join('\n').trim().length > 0) {
        chunks.push({
          content: currentChunkLines.join('\n').trim(),
          heading: currentH2H3Heading, // Heading of the *previous* H2/H3 chunk
          original_document_path: relativePath,
          start_line_in_doc: chunkStartLine,
          language: 'markdown',
        });
      }
      // Start a new H2/H3 chunk
      currentH2H3Heading = h2h3Match[2].trim();
      currentChunkLines = [line]; // Include H2/H3 heading line in the new chunk's content
      chunkStartLine = i + 1;
    } else {
      // Not an H2/H3 heading line, add to current chunk.
      // This also correctly captures content before the first H2/H3 heading
      // (under an H1 or if no H1).
      currentChunkLines.push(line);
    }
  }

  // Add the last processed chunk if it has content
  if (currentChunkLines.length > 0 && currentChunkLines.join('\n').trim().length > 0) {
    chunks.push({
      content: currentChunkLines.join('\n').trim(),
      heading: currentH2H3Heading, // H2/H3 heading of the last chunk
      original_document_path: relativePath,
      start_line_in_doc: chunkStartLine,
      language: 'markdown',
    });
  }

  // If no H2/H3 chunks were created (e.g., file has only H1 and paragraphs, or just paragraphs)
  // treat the whole file content (minus the H1 line itself if H1 was first line) as a single chunk.
  if (chunks.length === 0 && content.trim().length > 0) {
    let initialContent = content.trim();
    // If H1 was the very first line and we captured it, remove it from this single chunk content
    if (documentH1 && lines.length > 0 && lines[0].trim().match(h1Regex)) {
      initialContent = lines.slice(1).join('\n').trim();
    }
    if (initialContent.length > 0) {
      chunks.push({
        content: initialContent,
        heading: null, // No H2/H3 heading for this single chunk
        original_document_path: relativePath,
        start_line_in_doc: h1Found && lines.length > 0 && lines[0].trim().match(h1Regex) ? 2 : 1,
        language: 'markdown',
      });
    }
  }

  // Fall back to the file name (sans extension) when no H1 title was found.
  if (!documentH1) {
    documentH1 = path.basename(filePath).replace(path.extname(filePath), '');
  }

  return { chunks: chunks.filter((chunk) => chunk.content.length > 0), documentH1 };
}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MobileBERT Tokenizer Utility
|
|
3
|
+
*
|
|
4
|
+
* Shared tokenizer functionality for MobileBERT models to handle token counting
|
|
5
|
+
* and text truncation while staying within the 512 token limit.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { AutoTokenizer } from '@huggingface/transformers';
|
|
9
|
+
import chalk from 'chalk';
|
|
10
|
+
|
|
11
|
+
// Shared tokenizer instance and initialization state.
// `tokenizer` caches the loaded singleton; `isInitializing` and
// `initializationPromise` let concurrent callers share one in-flight load
// instead of downloading/initializing the model multiple times.
let tokenizer = null;
let isInitializing = false;
let initializationPromise = null;

/**
 * Initialize and get the MobileBERT tokenizer (singleton pattern)
 *
 * Concurrent callers share a single in-flight initialization: the first
 * caller starts `_initializeTokenizer()`; later callers await the same
 * promise. On failure `_initializeTokenizer` resolves to null, so the cache
 * stays empty and a subsequent call will retry initialization.
 *
 * @returns {Promise<AutoTokenizer|null>} Tokenizer instance or null if initialization fails
 */
async function getTokenizer() {
  // If already initialized, return immediately
  if (tokenizer) return tokenizer;

  // If currently initializing, wait for the existing initialization
  // rather than kicking off a second model load.
  if (isInitializing && initializationPromise) {
    return await initializationPromise;
  }

  // Start initialization
  isInitializing = true;
  initializationPromise = _initializeTokenizer();

  try {
    tokenizer = await initializationPromise;
    return tokenizer;
  } finally {
    // Always clear the in-flight markers — even if initialization threw —
    // so future calls can retry instead of awaiting a dead promise.
    isInitializing = false;
    initializationPromise = null;
  }
}
|
|
41
|
+
|
|
42
|
+
/**
 * Internal tokenizer initialization.
 * Loads the MobileBERT tokenizer from the Hugging Face hub; on any failure
 * it logs a warning and resolves to null so callers can fall back to
 * character-based token estimation.
 *
 * @returns {Promise<AutoTokenizer|null>}
 */
async function _initializeTokenizer() {
  console.log(chalk.blue('Initializing MobileBERT tokenizer...'));
  try {
    const loaded = await AutoTokenizer.from_pretrained('Xenova/mobilebert-uncased-mnli');
    console.log(chalk.green('✓ MobileBERT tokenizer initialized successfully'));
    return loaded;
  } catch (error) {
    console.warn(chalk.yellow('⚠ Failed to initialize tokenizer, falling back to character estimation'), error.message);
    return null;
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Count exact tokens for MobileBERT model.
 * Falls back to a character-based estimate when the tokenizer is
 * unavailable or encoding fails.
 *
 * @param {string} text - Text to count tokens for
 * @returns {Promise<number>} Number of tokens
 */
async function countTokens(text) {
  // Non-string or empty input contributes no tokens.
  if (typeof text !== 'string' || !text) {
    return 0;
  }

  // Conservative estimate for MobileBERT: roughly one token per 3 characters.
  const estimate = () => Math.ceil(text.length / 3);

  try {
    const bert = await getTokenizer();
    if (!bert) {
      // Fallback to character estimation if tokenizer failed to initialize.
      return estimate();
    }
    const ids = await bert.encode(text);
    return ids.length;
  } catch (error) {
    console.warn(chalk.gray('Token counting failed, using character estimation'), error.message);
    return estimate();
  }
}
|
|
82
|
+
|
|
83
|
+
/**
 * Truncate text to fit within token limit while preserving important content.
 * Uses a binary search over character length to find the longest prefix
 * whose token count fits, then trims back to a word boundary when that
 * loses less than 20% of the kept text.
 *
 * @param {string} text - Text to truncate
 * @param {number} maxTokens - Maximum tokens allowed (default: 450 for MobileBERT safety)
 * @returns {Promise<string>} Truncated text
 */
export async function truncateToTokenLimit(text, maxTokens = 450) {
  if (!text) return '';

  // Fast path: nothing to do when the whole text already fits.
  const totalTokens = await countTokens(text);
  if (totalTokens <= maxTokens) {
    return text;
  }

  // Binary search for the longest character prefix within the token budget.
  let lo = 0;
  let hi = text.length;
  let bestLength = 0;

  while (lo <= hi) {
    const mid = Math.floor((lo + hi) / 2);
    const prefixTokens = await countTokens(text.substring(0, mid));

    if (prefixTokens <= maxTokens) {
      bestLength = mid;
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }

  // Avoid cutting mid-word: back up to the last space, but only when doing
  // so keeps more than 80% of the truncated length.
  let result = text.substring(0, bestLength);
  const lastSpace = result.lastIndexOf(' ');
  if (lastSpace > bestLength * 0.8) {
    result = result.substring(0, lastSpace);
  }

  return result;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Clean up tokenizer resources.
 * Disposes the cached tokenizer (when it supports disposal) and clears the
 * singleton reference so a later call to getTokenizer() reloads it.
 */
export async function cleanupTokenizer() {
  if (!tokenizer) {
    return;
  }

  try {
    // Dispose is optional on tokenizer implementations — call it only if present.
    if (typeof tokenizer.dispose === 'function') {
      await tokenizer.dispose();
    }
    console.log(chalk.green('✓ MobileBERT tokenizer resources cleaned up'));
  } catch (error) {
    console.warn(chalk.yellow('⚠ Error cleaning up tokenizer:'), error.message);
  } finally {
    // The cached instance is dropped whether or not disposal succeeded.
    tokenizer = null;
  }
}
|