@appland/search 1.1.0 → 1.1.2

package/CHANGELOG.md CHANGED
@@ -1,3 +1,20 @@
+ # [@appland/search-v1.1.2](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.1...@appland/search-v1.1.2) (2025-01-23)
+
+
+ ### Bug Fixes
+
+ * Performance issue when chunking large documents ([cfff5a0](https://github.com/getappmap/appmap-js/commit/cfff5a0f9937f8fb57d3344812bc304e6292819e))
+ * Prevent re-tokenization of chunks ([2b75aaf](https://github.com/getappmap/appmap-js/commit/2b75aafe35f40abae21961acf4363edbae810aee))
+ * Tokenization no longer hangs the process ([a7df088](https://github.com/getappmap/appmap-js/commit/a7df088461add710b0f5e91aaec0ce92b2e1baed))
+ * Tokenization will consider the file type ([727c29b](https://github.com/getappmap/appmap-js/commit/727c29be5f31c09e736b9ab0554a8094b46a01a4))
+
+ # [@appland/search-v1.1.1](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.0...@appland/search-v1.1.1) (2024-12-18)
+
+
+ ### Bug Fixes
+
+ * Extract complete chunk when splitting text ([75d2f5d](https://github.com/getappmap/appmap-js/commit/75d2f5df06c9794b772116c2facde366d5e1cd7d))
+
  # [@appland/search-v1.1.0](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.0.1...@appland/search-v1.1.0) (2024-12-01)


package/built/build-file-index.d.ts CHANGED
@@ -2,8 +2,8 @@ import FileIndex from './file-index';
  import { ContentReader } from './ioutil';
  export type ListFn = (path: string) => Promise<string[]>;
  export type FilterFn = (path: string) => PromiseLike<boolean>;
- export type Tokenizer = (content: string, fileExtension: string) => {
+ export type Tokenizer = (content: string, fileExtension: string) => Promise<{
  symbols: string[];
  words: string[];
- };
+ }>;
  export default function buildFileIndex(fileIndex: FileIndex, directories: string[], listDirectory: ListFn, fileFilter: FilterFn, contentReader: ContentReader, tokenizer: Tokenizer): Promise<void>;
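
The `Tokenizer` contract is now asynchronous: implementations return a `Promise` of the token lists, which allows them to yield to the event loop partway through a large file. A minimal sketch of a conforming tokenizer; the deep import path and the regex-based word matching are illustrative assumptions, not the package's actual implementation:

```ts
// Hypothetical import path; Tokenizer is declared in built/build-file-index.d.ts.
import type { Tokenizer } from '@appland/search/built/build-file-index';

// Trivial tokenizer satisfying the new async signature.
const naiveTokenizer: Tokenizer = async (content, fileExtension) => ({
  // A real implementation would choose symbol parsing rules by fileExtension.
  symbols: [],
  words: content.match(/\b\w+\b/g) ?? [],
});
```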
package/built/build-file-index.js CHANGED
@@ -15,7 +15,8 @@ async function indexFile(context, filePath) {
  if (!fileContents)
  return;
  debug('Read file: %s, length: %d (%s...)', filePath, fileContents.length, fileContents.slice(0, 40));
- const tokens = context.tokenizer(fileContents, filePath);
+ const fileExtension = filePath.split('.').pop() ?? '';
+ const tokens = await context.tokenizer(fileContents, fileExtension);
  const symbols = tokens.symbols.join(' ');
  const words = tokens.words.join(' ');
  debug('Tokenized file: %s', filePath);
package/built/build-snippet-index.js CHANGED
@@ -10,11 +10,13 @@ async function indexFile(context, file) {
  return;
  const extension = file.filePath.split('.').pop() || '';
  const chunks = await context.splitter(fileContent, extension);
- chunks.forEach((chunk) => {
+ for (const chunk of chunks) {
  const { content, startLine } = chunk;
  const snippetId = (0, snippet_index_1.fileChunkSnippetId)(filePath, startLine);
- context.snippetIndex.indexSnippet(snippetId, file.directory, context.tokenizer(content, file.filePath).symbols.join(' '), context.tokenizer(content, file.filePath).words.join(' '), content);
- });
+ const fileExtension = file.filePath.split('.').pop() ?? '';
+ const { symbols, words } = await context.tokenizer(content, fileExtension);
+ context.snippetIndex.indexSnippet(snippetId, file.directory, symbols.join(' '), words.join(' '), content);
+ }
  }
  async function buildSnippetIndex(snippetIndex, files, contentReader, splitter, tokenizer) {
  const context = {
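
This hunk fixes two things at once: each chunk is now tokenized a single time instead of twice (once for symbols, once for words), and the `forEach` loop becomes `for...of` so the now-asynchronous tokenizer can be awaited. The loop change is essential, not stylistic: `await` inside a `forEach` callback never suspends the enclosing function, because `forEach` discards the promises its callbacks return. A standalone sketch of the pitfall, using a hypothetical `process` function:

```ts
async function process(item: number): Promise<void> {
  // stand-in for real async work
}

async function broken(items: number[]) {
  // Bug: forEach ignores the returned promises, so broken() resolves
  // before any call to process() has finished.
  items.forEach(async (item) => {
    await process(item);
  });
}

async function fixed(items: number[]) {
  // for...of lets await actually suspend this function between items.
  for (const item of items) {
    await process(item);
  }
}
```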
package/built/splitter.js CHANGED
@@ -36,15 +36,17 @@ async function langchainSplitter(content, fileExtension) {
  splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
  }
  const documents = await splitter.createDocuments([content]);
+ const contentLines = content.split('\n');
  // metadata includes:
  // { loc: { lines: { from: 1, to: 14 } } }
  return documents.map((doc) => {
  const loc = doc.metadata?.loc;
  const lines = loc?.lines;
  const result = {
- content: doc.pageContent,
+ content: '',
  };
  if (lines) {
+ result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
  result.startLine = lines.from;
  result.endLine = lines.to;
  }
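
This is the "Extract complete chunk when splitting text" fix: rather than trusting LangChain's `doc.pageContent`, which may not reproduce the original lines exactly (surrounding whitespace can be trimmed, for instance), the chunk is re-extracted verbatim from the source text using the `loc.lines` metadata. The slice arithmetic in isolation, with hypothetical values; `loc.lines` is 1-based and inclusive while `Array.prototype.slice` is 0-based with an exclusive end, hence `from - 1` and a plain `to`:

```ts
const content = 'a\nb\nc\nd\ne';
const contentLines = content.split('\n');
const lines = { from: 2, to: 4 }; // hypothetical chunk spanning lines 2-4

// slice(1, 4) selects indexes 1..3, i.e. lines 2..4.
const chunk = contentLines.slice(lines.from - 1, lines.to).join('\n');
console.log(chunk); // 'b\nc\nd'
```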
package/built/tokenize.d.ts CHANGED
@@ -1,9 +1,19 @@
  export declare const SymbolRegexes: Record<string, RegExp>;
  export declare function symbols(content: string, fileExtension: string, allowGeneric?: boolean): string[];
  export declare function words(content: string): string[];
+ /**
+ * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+ * no longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+ export declare function batch(content: string, batchSize?: number, maxLineLength?: number): string[];
  type FileTokens = {
  symbols: string[];
  words: string[];
  };
- export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): FileTokens;
+ export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): Promise<FileTokens>;
  export {};
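
Given the implementation in tokenize.js below, `batch` splits on newlines, silently drops any line longer than `maxLineLength` (it filters rather than wraps), and groups what remains into `batchSize`-line batches. A usage sketch; the deep import path is an assumption:

```ts
// Hypothetical deep import; batch is defined in built/tokenize.js.
import { batch } from '@appland/search/built/tokenize';

const content = Array.from({ length: 2500 }, (_, i) => `line ${i}`).join('\n');
console.log(batch(content).length); // 3 batches: 1000 + 1000 + 500 lines

// Over-long lines are dropped entirely, not truncated:
console.log(batch('short\n' + 'x'.repeat(2000))); // ['short']
```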
package/built/tokenize.js CHANGED
@@ -6,6 +6,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.SymbolRegexes = void 0;
  exports.symbols = symbols;
  exports.words = words;
+ exports.batch = batch;
  exports.fileTokens = fileTokens;
  const query_keywords_1 = __importDefault(require("./query-keywords"));
  const debug_1 = __importDefault(require("debug"));
@@ -54,11 +55,40 @@ function symbols(content, fileExtension, allowGeneric = true) {
  function words(content) {
  return content.match(/\b\w+\b/g) ?? [];
  }
- function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
+ /**
+ * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+ * no longer than the specified maximum length.
+ *
+ * @param content The content to split into batches
+ * @param batchSize The maximum number of lines per batch
+ * @param maxLineLength The maximum length of a line
+ * @returns an array of batches of content
+ */
+ function batch(content, batchSize = 1000, maxLineLength = 1000) {
+ const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+ const result = [];
+ for (let i = 0; i < lines.length; i += batchSize) {
+ result.push(lines.slice(i, i + batchSize).join('\n'));
+ }
+ return result;
+ }
+ async function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
  if (enableGenericSymbolParsing)
  debug('Using generic symbol parsing for file extension: %s', fileExtension);
- const symbolList = (0, query_keywords_1.default)(symbols(content, fileExtension, enableGenericSymbolParsing)).sort();
- const wordList = (0, query_keywords_1.default)(words(content)).sort();
+ const batches = batch(content);
+ const symbolList = [];
+ const wordList = [];
+ for (let i = 0; i < batches.length; ++i) {
+ if (i && i % 5 === 0) {
+ // Every 5th batch, wait for the next tick to avoid blocking the event loop
+ await new Promise((resolve) => setImmediate(resolve));
+ }
+ const batch = batches[i];
+ symbolList.push(...(0, query_keywords_1.default)(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+ wordList.push(...(0, query_keywords_1.default)(words(batch)));
+ }
+ symbolList.sort();
+ wordList.sort();
  // Iterate through words, with a corresponding pointer to symbols.
  // If the word at the word index does not match the symbol at the symbol index,
  // add the word to the output. Otherwise, advance both pointers. Repeat
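
The `setImmediate` yield is the heart of the "tokenization no longer hangs the process" fix: a long synchronous loop starves Node.js's event loop, so `fileTokens` now pauses every fifth batch to let pending I/O callbacks and timers run. The same pattern extracted into a generic helper, hypothetical and for illustration only:

```ts
// Cooperative yielding for a CPU-bound loop in Node.js. Yielding on every
// iteration would add needless overhead; every Nth iteration is a compromise.
async function processAll<T>(items: T[], work: (item: T) => void, yieldEvery = 5): Promise<void> {
  for (let i = 0; i < items.length; i++) {
    if (i && i % yieldEvery === 0) {
      // Defer to the event loop before continuing.
      await new Promise<void>((resolve) => setImmediate(resolve));
    }
    work(items[i]);
  }
}
```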
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@appland/search",
- "version": "1.1.0",
+ "version": "1.1.2",
  "description": "",
  "bin": "built/cli.js",
  "publishConfig": {