@appland/search 1.1.0 → 1.1.2

package/CHANGELOG.md CHANGED
@@ -1,3 +1,20 @@
+ # [@appland/search-v1.1.2](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.1...@appland/search-v1.1.2) (2025-01-23)
+
+
+ ### Bug Fixes
+
+ * Performance issue when chunking large documents ([cfff5a0](https://github.com/getappmap/appmap-js/commit/cfff5a0f9937f8fb57d3344812bc304e6292819e))
+ * Prevent re-tokenization of chunks ([2b75aaf](https://github.com/getappmap/appmap-js/commit/2b75aafe35f40abae21961acf4363edbae810aee))
+ * Tokenization no longer hangs the process ([a7df088](https://github.com/getappmap/appmap-js/commit/a7df088461add710b0f5e91aaec0ce92b2e1baed))
+ * Tokenization will consider the file type ([727c29b](https://github.com/getappmap/appmap-js/commit/727c29be5f31c09e736b9ab0554a8094b46a01a4))
+
+ # [@appland/search-v1.1.1](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.0...@appland/search-v1.1.1) (2024-12-18)
+
+
+ ### Bug Fixes
+
+ * Extract complete chunk when splitting text ([75d2f5d](https://github.com/getappmap/appmap-js/commit/75d2f5df06c9794b772116c2facde366d5e1cd7d))
+
  # [@appland/search-v1.1.0](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.0.1...@appland/search-v1.1.0) (2024-12-01)


package/built/build-file-index.d.ts CHANGED
@@ -2,8 +2,8 @@ import FileIndex from './file-index';
  import { ContentReader } from './ioutil';
  export type ListFn = (path: string) => Promise<string[]>;
  export type FilterFn = (path: string) => PromiseLike<boolean>;
- export type Tokenizer = (content: string, fileExtension: string) => {
+ export type Tokenizer = (content: string, fileExtension: string) => Promise<{
  symbols: string[];
  words: string[];
- };
+ }>;
  export default function buildFileIndex(fileIndex: FileIndex, directories: string[], listDirectory: ListFn, fileFilter: FilterFn, contentReader: ContentReader, tokenizer: Tokenizer): Promise<void>;
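
The `Tokenizer` contract is now asynchronous: implementations return a `Promise` of the token lists, which allows them to yield to the event loop partway through a large file. A minimal sketch of a conforming tokenizer; the deep import path and the regex-based word matching are illustrative assumptions, not the package's actual implementation:

```ts
// Hypothetical import path; Tokenizer is declared in built/build-file-index.d.ts.
import type { Tokenizer } from '@appland/search/built/build-file-index';

// Trivial tokenizer satisfying the new async signature.
const naiveTokenizer: Tokenizer = async (content, fileExtension) => ({
  // A real implementation would choose symbol parsing rules by fileExtension.
  symbols: [],
  words: content.match(/\b\w+\b/g) ?? [],
});
```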
package/built/build-file-index.js CHANGED
@@ -15,7 +15,8 @@ async function indexFile(context, filePath) {
  if (!fileContents)
  return;
  debug('Read file: %s, length: %d (%s...)', filePath, fileContents.length, fileContents.slice(0, 40));
- const tokens = context.tokenizer(fileContents, filePath);
+ const fileExtension = filePath.split('.').pop() ?? '';
+ const tokens = await context.tokenizer(fileContents, fileExtension);
  const symbols = tokens.symbols.join(' ');
  const words = tokens.words.join(' ');
  debug('Tokenized file: %s', filePath);
package/built/build-snippet-index.js CHANGED
@@ -10,11 +10,13 @@ async function indexFile(context, file) {
  return;
  const extension = file.filePath.split('.').pop() || '';
  const chunks = await context.splitter(fileContent, extension);
- chunks.forEach((chunk) => {
+ for (const chunk of chunks) {
  const { content, startLine } = chunk;
  const snippetId = (0, snippet_index_1.fileChunkSnippetId)(filePath, startLine);
- context.snippetIndex.indexSnippet(snippetId, file.directory, context.tokenizer(content, file.filePath).symbols.join(' '), context.tokenizer(content, file.filePath).words.join(' '), content);
- });
+ const fileExtension = file.filePath.split('.').pop() ?? '';
+ const { symbols, words } = await context.tokenizer(content, fileExtension);
+ context.snippetIndex.indexSnippet(snippetId, file.directory, symbols.join(' '), words.join(' '), content);
+ }
  }
  async function buildSnippetIndex(snippetIndex, files, contentReader, splitter, tokenizer) {
  const context = {
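
This hunk fixes two things at once: each chunk is now tokenized a single time instead of twice (once for symbols, once for words), and the `forEach` loop becomes `for...of` so the now-asynchronous tokenizer can be awaited. The loop change is essential, not stylistic: `await` inside a `forEach` callback never suspends the enclosing function, because `forEach` discards the promises its callbacks return. A standalone sketch of the pitfall, using a hypothetical `process` function:

```ts
async function process(item: number): Promise<void> {
  // stand-in for real async work
}

async function broken(items: number[]) {
  // Bug: forEach ignores the returned promises, so broken() resolves
  // before any call to process() has finished.
  items.forEach(async (item) => {
    await process(item);
  });
}

async function fixed(items: number[]) {
  // for...of lets await actually suspend this function between items.
  for (const item of items) {
    await process(item);
  }
}
```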
package/built/splitter.js CHANGED
@@ -36,15 +36,17 @@ async function langchainSplitter(content, fileExtension) {
  splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
  }
  const documents = await splitter.createDocuments([content]);
+ const contentLines = content.split('\n');
  // metadata includes:
  // { loc: { lines: { from: 1, to: 14 } } }
  return documents.map((doc) => {
  const loc = doc.metadata?.loc;
  const lines = loc?.lines;
  const result = {
- content: doc.pageContent,
+ content: '',
  };
  if (lines) {
+ result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
  result.startLine = lines.from;
  result.endLine = lines.to;
  }
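
This is the "Extract complete chunk when splitting text" fix: rather than trusting LangChain's `doc.pageContent`, which may not reproduce the original lines exactly (surrounding whitespace can be trimmed, for instance), the chunk is re-extracted verbatim from the source text using the `loc.lines` metadata. The slice arithmetic in isolation, with hypothetical values; `loc.lines` is 1-based and inclusive while `Array.prototype.slice` is 0-based with an exclusive end, hence `from - 1` and a plain `to`:

```ts
const content = 'a\nb\nc\nd\ne';
const contentLines = content.split('\n');
const lines = { from: 2, to: 4 }; // hypothetical chunk spanning lines 2-4

// slice(1, 4) selects indexes 1..3, i.e. lines 2..4.
const chunk = contentLines.slice(lines.from - 1, lines.to).join('\n');
console.log(chunk); // 'b\nc\nd'
```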
package/built/tokenize.d.ts CHANGED
@@ -1,9 +1,19 @@
  export declare const SymbolRegexes: Record<string, RegExp>;
  export declare function symbols(content: string, fileExtension: string, allowGeneric?: boolean): string[];
  export declare function words(content: string): string[];
+ /**
+ * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+ * no longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+ export declare function batch(content: string, batchSize?: number, maxLineLength?: number): string[];
  type FileTokens = {
  symbols: string[];
  words: string[];
  };
- export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): FileTokens;
+ export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): Promise<FileTokens>;
  export {};
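
Given the implementation in tokenize.js below, `batch` splits on newlines, silently drops any line longer than `maxLineLength` (it filters rather than wraps), and groups what remains into `batchSize`-line batches. A usage sketch; the deep import path is an assumption:

```ts
// Hypothetical deep import; batch is defined in built/tokenize.js.
import { batch } from '@appland/search/built/tokenize';

const content = Array.from({ length: 2500 }, (_, i) => `line ${i}`).join('\n');
console.log(batch(content).length); // 3 batches: 1000 + 1000 + 500 lines

// Over-long lines are dropped entirely, not truncated:
console.log(batch('short\n' + 'x'.repeat(2000))); // ['short']
```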
package/built/tokenize.js CHANGED
@@ -6,6 +6,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.SymbolRegexes = void 0;
  exports.symbols = symbols;
  exports.words = words;
+ exports.batch = batch;
  exports.fileTokens = fileTokens;
  const query_keywords_1 = __importDefault(require("./query-keywords"));
  const debug_1 = __importDefault(require("debug"));
@@ -54,11 +55,40 @@ function symbols(content, fileExtension, allowGeneric = true) {
  function words(content) {
  return content.match(/\b\w+\b/g) ?? [];
  }
- function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
+ /**
+ * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+ * no longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+ function batch(content, batchSize = 1000, maxLineLength = 1000) {
+ const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+ const result = [];
+ for (let i = 0; i < lines.length; i += batchSize) {
+ result.push(lines.slice(i, i + batchSize).join('\n'));
+ }
+ return result;
+ }
+ async function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
  if (enableGenericSymbolParsing)
  debug('Using generic symbol parsing for file extension: %s', fileExtension);
- const symbolList = (0, query_keywords_1.default)(symbols(content, fileExtension, enableGenericSymbolParsing)).sort();
- const wordList = (0, query_keywords_1.default)(words(content)).sort();
+ const batches = batch(content);
+ const symbolList = [];
+ const wordList = [];
+ for (let i = 0; i < batches.length; ++i) {
+ if (i && i % 5 === 0) {
+ // Every 5th batch, wait for the next tick to avoid blocking the event loop
+ await new Promise((resolve) => setImmediate(resolve));
+ }
+ const batch = batches[i];
+ symbolList.push(...(0, query_keywords_1.default)(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+ wordList.push(...(0, query_keywords_1.default)(words(batch)));
+ }
+ symbolList.sort();
+ wordList.sort();
  // Iterate through words, with a corresponding pointer to symbols.
  // If the word at the word index does not match the symbol at the symbol index,
  // add the word to the output. Otherwise, advance both pointers. Repeat
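
The `setImmediate` yield is the heart of the "tokenization no longer hangs the process" fix: a long synchronous loop starves Node.js's event loop, so `fileTokens` now pauses every fifth batch to let pending I/O callbacks and timers run. The same pattern extracted into a generic helper, hypothetical and for illustration only:

```ts
// Cooperative yielding for a CPU-bound loop in Node.js. Yielding on every
// iteration would add needless overhead; every Nth iteration is a compromise.
async function processAll<T>(items: T[], work: (item: T) => void, yieldEvery = 5): Promise<void> {
  for (let i = 0; i < items.length; i++) {
    if (i && i % yieldEvery === 0) {
      // Defer to the event loop before continuing.
      await new Promise<void>((resolve) => setImmediate(resolve));
    }
    work(items[i]);
  }
}
```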
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@appland/search",
- "version": "1.1.0",
+ "version": "1.1.2",
  "description": "",
  "bin": "built/cli.js",
  "publishConfig": {