@appland/search 1.1.0 → 1.1.2
- package/CHANGELOG.md +17 -0
- package/built/build-file-index.d.ts +2 -2
- package/built/build-file-index.js +2 -1
- package/built/build-snippet-index.js +5 -3
- package/built/splitter.js +3 -1
- package/built/tokenize.d.ts +11 -1
- package/built/tokenize.js +33 -3
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
@@ -1,3 +1,20 @@
+# [@appland/search-v1.1.2](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.1...@appland/search-v1.1.2) (2025-01-23)
+
+
+### Bug Fixes
+
+* Performance issue when chunking large documents ([cfff5a0](https://github.com/getappmap/appmap-js/commit/cfff5a0f9937f8fb57d3344812bc304e6292819e))
+* Prevent re-tokenization of chunks ([2b75aaf](https://github.com/getappmap/appmap-js/commit/2b75aafe35f40abae21961acf4363edbae810aee))
+* Tokenization no longer hangs the process ([a7df088](https://github.com/getappmap/appmap-js/commit/a7df088461add710b0f5e91aaec0ce92b2e1baed))
+* Tokenization will consider the file type ([727c29b](https://github.com/getappmap/appmap-js/commit/727c29be5f31c09e736b9ab0554a8094b46a01a4))
+
+# [@appland/search-v1.1.1](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.0...@appland/search-v1.1.1) (2024-12-18)
+
+
+### Bug Fixes
+
+* Extract complete chunk when splitting text ([75d2f5d](https://github.com/getappmap/appmap-js/commit/75d2f5df06c9794b772116c2facde366d5e1cd7d))
+
 # [@appland/search-v1.1.0](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.0.1...@appland/search-v1.1.0) (2024-12-01)
package/built/build-file-index.d.ts
CHANGED
@@ -2,8 +2,8 @@ import FileIndex from './file-index';
 import { ContentReader } from './ioutil';
 export type ListFn = (path: string) => Promise<string[]>;
 export type FilterFn = (path: string) => PromiseLike<boolean>;
-export type Tokenizer = (content: string, fileExtension: string) => {
+export type Tokenizer = (content: string, fileExtension: string) => Promise<{
     symbols: string[];
     words: string[];
-}
+}>;
 export default function buildFileIndex(fileIndex: FileIndex, directories: string[], listDirectory: ListFn, fileFilter: FilterFn, contentReader: ContentReader, tokenizer: Tokenizer): Promise<void>;
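For context on the type change above: `Tokenizer` is now asynchronous and receives the file extension, which matches the new `fileTokens` signature in `tokenize.d.ts` further down. A minimal conforming tokenizer might look like the following sketch; the relative import path is an assumption, not something this diff confirms.

// Sketch only: assumes fileTokens is importable from the package's tokenize module.
import { fileTokens } from './tokenize';

// Satisfies the updated Tokenizer type:
// (content: string, fileExtension: string) => Promise<{ symbols: string[]; words: string[] }>
const tokenizer = async (content: string, fileExtension: string) =>
  fileTokens(content, fileExtension);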
package/built/build-file-index.js
CHANGED
@@ -15,7 +15,8 @@ async function indexFile(context, filePath) {
     if (!fileContents)
         return;
     debug('Read file: %s, length: %d (%s...)', filePath, fileContents.length, fileContents.slice(0, 40));
-    const
+    const fileExtension = filePath.split('.').pop() ?? '';
+    const tokens = await context.tokenizer(fileContents, fileExtension);
     const symbols = tokens.symbols.join(' ');
     const words = tokens.words.join(' ');
     debug('Tokenized file: %s', filePath);
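A quick note on the extension derivation added here: `filePath.split('.').pop()` yields the text after the last dot, or the whole name when there is no dot, while `?? ''` only guards the typed-as-optional result of `pop()`. Illustrative values:

// Plain JavaScript string semantics, shown for reference.
'src/tokenize.ts'.split('.').pop() ?? '';  // => 'ts'
'CHANGELOG.md'.split('.').pop() ?? '';     // => 'md'
'Makefile'.split('.').pop() ?? '';         // => 'Makefile' (no dot, the whole name comes back)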
package/built/build-snippet-index.js
CHANGED
@@ -10,11 +10,13 @@ async function indexFile(context, file) {
         return;
     const extension = file.filePath.split('.').pop() || '';
     const chunks = await context.splitter(fileContent, extension);
-
+    for (const chunk of chunks) {
         const { content, startLine } = chunk;
         const snippetId = (0, snippet_index_1.fileChunkSnippetId)(filePath, startLine);
-
-
+        const fileExtension = file.filePath.split('.').pop() ?? '';
+        const { symbols, words } = await context.tokenizer(content, fileExtension);
+        context.snippetIndex.indexSnippet(snippetId, file.directory, symbols.join(' '), words.join(' '), content);
+    }
 }
 async function buildSnippetIndex(snippetIndex, files, contentReader, splitter, tokenizer) {
     const context = {
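The loop above expects each chunk from `context.splitter` to carry `content` and `startLine`, the shape produced by `splitter.js` below. As a stand-in for illustration only (not the package's langchain-based splitter), a compatible splitter could look like this:

// Hypothetical splitter: one chunk per 50 lines, tracking 1-based start lines.
type Chunk = { content: string; startLine?: number; endLine?: number };

const simpleSplitter = async (content: string, _fileExtension: string): Promise<Chunk[]> => {
  const lines = content.split('\n');
  const chunks: Chunk[] = [];
  for (let i = 0; i < lines.length; i += 50) {
    const slice = lines.slice(i, i + 50);
    chunks.push({ content: slice.join('\n'), startLine: i + 1, endLine: i + slice.length });
  }
  return chunks;
};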
package/built/splitter.js
CHANGED
@@ -36,15 +36,17 @@ async function langchainSplitter(content, fileExtension) {
         splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
     }
     const documents = await splitter.createDocuments([content]);
+    const contentLines = content.split('\n');
     // metadata includes:
     // { loc: { lines: { from: 1, to: 14 } } }
     return documents.map((doc) => {
         const loc = doc.metadata?.loc;
         const lines = loc?.lines;
         const result = {
-            content:
+            content: '',
         };
         if (lines) {
+            result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
             result.startLine = lines.from;
             result.endLine = lines.to;
         }
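The heart of the "Extract complete chunk when splitting text" fix is the index arithmetic: the `loc.lines` values start at 1 and are inclusive (as the comment in the hunk shows), while `Array.prototype.slice` takes a 0-based start and an exclusive end, so `slice(from - 1, to)` selects exactly lines `from` through `to`. A small worked example with illustrative metadata values:

// Worked example of the 1-based, inclusive line range mapping to slice().
const content = 'line 1\nline 2\nline 3\nline 4';
const contentLines = content.split('\n');

const from = 2; // illustrative loc.lines.from
const to = 3;   // illustrative loc.lines.to

const chunk = contentLines.slice(from - 1, to).join('\n');
// => 'line 2\nline 3'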
package/built/tokenize.d.ts
CHANGED
@@ -1,9 +1,19 @@
 export declare const SymbolRegexes: Record<string, RegExp>;
 export declare function symbols(content: string, fileExtension: string, allowGeneric?: boolean): string[];
 export declare function words(content: string): string[];
+/**
+ * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+ * no longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of characters per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+export declare function batch(content: string, batchSize?: number, maxLineLength?: number): string[];
 type FileTokens = {
     symbols: string[];
     words: string[];
 };
-export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): FileTokens
+export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): Promise<FileTokens>;
 export {};
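Based on the implementation in `tokenize.js` below, `batch` first drops any line longer than `maxLineLength` and then groups the remaining lines, advancing `batchSize` lines at a time. A usage sketch with small, illustrative limits (the import path is assumed):

// Sketch only: assumes batch is importable from the package's tokenize module.
import { batch } from './tokenize';

const content = ['short line', 'x'.repeat(2000), 'another line', 'last line'].join('\n');

// With these illustrative limits, lines over 20 characters are dropped,
// and the remaining lines are grouped two at a time.
const batches = batch(content, 2, 20);
// => ['short line\nanother line', 'last line']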
package/built/tokenize.js
CHANGED
@@ -6,6 +6,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.SymbolRegexes = void 0;
 exports.symbols = symbols;
 exports.words = words;
+exports.batch = batch;
 exports.fileTokens = fileTokens;
 const query_keywords_1 = __importDefault(require("./query-keywords"));
 const debug_1 = __importDefault(require("debug"));
@@ -54,11 +55,40 @@ function symbols(content, fileExtension, allowGeneric = true) {
 function words(content) {
     return content.match(/\b\w+\b/g) ?? [];
 }
-
+/**
+ * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+ * no longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of characters per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+function batch(content, batchSize = 1000, maxLineLength = 1000) {
+    const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+    const result = [];
+    for (let i = 0; i < lines.length; i += batchSize) {
+        result.push(lines.slice(i, i + batchSize).join('\n'));
+    }
+    return result;
+}
+async function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
     if (enableGenericSymbolParsing)
         debug('Using generic symbol parsing for file extension: %s', fileExtension);
-    const
-    const
+    const batches = batch(content);
+    const symbolList = [];
+    const wordList = [];
+    for (let i = 0; i < batches.length; ++i) {
+        if (i && i % 5 === 0) {
+            // Every 5th batch, wait for the next tick to avoid blocking the event loop
+            await new Promise((resolve) => setImmediate(resolve));
+        }
+        const batch = batches[i];
+        symbolList.push(...(0, query_keywords_1.default)(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+        wordList.push(...(0, query_keywords_1.default)(words(batch)));
+    }
+    symbolList.sort();
+    wordList.sort();
     // Iterate through words, with a corresponding pointer to symbols.
     // If the word at the word index does not match the symbol at the symbol index,
     // add the word to the output. Otherwise, advance both pointers. Repeat
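The awaited `setImmediate` above lines up with the "Tokenization no longer hangs the process" fix: the long synchronous loop is broken into slices so other work queued on the Node.js event loop can run between batches. The same pattern in isolation, as a generic sketch (the names here are illustrative, not part of the package):

// Generic sketch: process items in a CPU-heavy loop without starving the event loop
// by yielding every few iterations until the next check of the event loop.
async function processAll<T>(items: T[], work: (item: T) => void): Promise<void> {
  for (let i = 0; i < items.length; ++i) {
    if (i && i % 5 === 0) {
      await new Promise<void>((resolve) => setImmediate(resolve));
    }
    work(items[i]);
  }
}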
|