@appland/search 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,20 @@
+ # [@appland/search-v1.1.2](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.1...@appland/search-v1.1.2) (2025-01-23)
+
+
+ ### Bug Fixes
+
+ * Performance issue when chunking large documents ([cfff5a0](https://github.com/getappmap/appmap-js/commit/cfff5a0f9937f8fb57d3344812bc304e6292819e))
+ * Prevent re-tokenization of chunks ([2b75aaf](https://github.com/getappmap/appmap-js/commit/2b75aafe35f40abae21961acf4363edbae810aee))
+ * Tokenization no longer hangs the process ([a7df088](https://github.com/getappmap/appmap-js/commit/a7df088461add710b0f5e91aaec0ce92b2e1baed))
+ * Tokenization will consider the file type ([727c29b](https://github.com/getappmap/appmap-js/commit/727c29be5f31c09e736b9ab0554a8094b46a01a4))
+
+ # [@appland/search-v1.1.1](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.1.0...@appland/search-v1.1.1) (2024-12-18)
+
+
+ ### Bug Fixes
+
+ * Extract complete chunk when splitting text ([75d2f5d](https://github.com/getappmap/appmap-js/commit/75d2f5df06c9794b772116c2facde366d5e1cd7d))
+
  # [@appland/search-v1.1.0](https://github.com/getappmap/appmap-js/compare/@appland/search-v1.0.1...@appland/search-v1.1.0) (2024-12-01)


package/built/build-file-index.d.ts CHANGED
@@ -2,8 +2,8 @@ import FileIndex from './file-index';
  import { ContentReader } from './ioutil';
  export type ListFn = (path: string) => Promise<string[]>;
  export type FilterFn = (path: string) => PromiseLike<boolean>;
- export type Tokenizer = (content: string, fileExtension: string) => {
+ export type Tokenizer = (content: string, fileExtension: string) => Promise<{
      symbols: string[];
      words: string[];
- };
+ }>;
  export default function buildFileIndex(fileIndex: FileIndex, directories: string[], listDirectory: ListFn, fileFilter: FilterFn, contentReader: ContentReader, tokenizer: Tokenizer): Promise<void>;
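The `Tokenizer` contract is now asynchronous: implementations return a `Promise` of the token lists, and `buildFileIndex` awaits it. A minimal sketch of a conforming implementation (the tokenizer body below is illustrative, not the package's own):

```ts
// The new contract, copied from the declaration above.
type Tokenizer = (content: string, fileExtension: string) => Promise<{
  symbols: string[];
  words: string[];
}>;

// A hypothetical conforming implementation: no symbol parsing, plain word tokens.
const tokenizer: Tokenizer = async (content, _fileExtension) => ({
  symbols: [],
  words: content.match(/\b\w+\b/g) ?? [],
});
```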
package/built/build-file-index.js CHANGED
@@ -15,7 +15,8 @@ async function indexFile(context, filePath) {
      if (!fileContents)
          return;
      debug('Read file: %s, length: %d (%s...)', filePath, fileContents.length, fileContents.slice(0, 40));
-     const tokens = context.tokenizer(fileContents, filePath);
+     const fileExtension = filePath.split('.').pop() ?? '';
+     const tokens = await context.tokenizer(fileContents, fileExtension);
      const symbols = tokens.symbols.join(' ');
      const words = tokens.words.join(' ');
      debug('Tokenized file: %s', filePath);
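Two fixes land here: the tokenizer call is now awaited, and the tokenizer receives the file extension rather than the full path ("Tokenization will consider the file type" in the changelog). The derivation is a plain `split`/`pop`, so its edge cases are worth noting; a small illustration (not package code, `extensionOf` is a hypothetical name):

```ts
// Sketch of the extension derivation used above.
const extensionOf = (filePath: string): string => filePath.split('.').pop() ?? '';

extensionOf('src/app.py');          // 'py'
extensionOf('built/tokenize.d.ts'); // 'ts' (only the last dot-separated segment)
extensionOf('Makefile');            // 'Makefile' (no dot, so the whole name comes back)
```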
package/built/build-snippet-index.js CHANGED
@@ -10,11 +10,13 @@ async function indexFile(context, file) {
          return;
      const extension = file.filePath.split('.').pop() || '';
      const chunks = await context.splitter(fileContent, extension);
-     chunks.forEach((chunk) => {
+     for (const chunk of chunks) {
          const { content, startLine } = chunk;
          const snippetId = (0, snippet_index_1.fileChunkSnippetId)(filePath, startLine);
-         context.snippetIndex.indexSnippet(snippetId, file.directory, context.tokenizer(content, file.filePath).symbols.join(' '), context.tokenizer(content, file.filePath).words.join(' '), content);
-     });
+         const fileExtension = file.filePath.split('.').pop() ?? '';
+         const { symbols, words } = await context.tokenizer(content, fileExtension);
+         context.snippetIndex.indexSnippet(snippetId, file.directory, symbols.join(' '), words.join(' '), content);
+     }
  }
  async function buildSnippetIndex(snippetIndex, files, contentReader, splitter, tokenizer) {
      const context = {
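The switch from `Array.prototype.forEach` to `for...of` is what makes the `await` possible: `forEach` discards the promises an async callback returns, so execution would race ahead of the tokenizer. The rewrite also tokenizes each chunk once instead of twice ("Prevent re-tokenization of chunks"). A generic illustration of the `forEach` pitfall (not package code):

```ts
async function demo(items: number[]) {
  // Broken: forEach ignores the promises its async callback returns,
  // so demo() would resolve before any of these bodies finish.
  items.forEach(async (n) => {
    await new Promise((resolve) => setTimeout(resolve, 10));
    console.log('forEach', n);
  });

  // Correct: for...of awaits each iteration in order.
  for (const n of items) {
    await new Promise((resolve) => setTimeout(resolve, 10));
    console.log('for...of', n);
  }
}
```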
package/built/splitter.js CHANGED
@@ -36,15 +36,17 @@ async function langchainSplitter(content, fileExtension) {
          splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
      }
      const documents = await splitter.createDocuments([content]);
+     const contentLines = content.split('\n');
      // metadata includes:
      // { loc: { lines: { from: 1, to: 14 } } }
      return documents.map((doc) => {
          const loc = doc.metadata?.loc;
          const lines = loc?.lines;
          const result = {
-             content: doc.pageContent,
+             content: '',
          };
          if (lines) {
+             result.content = contentLines.slice(lines.from - 1, lines.to).join('\n');
              result.startLine = lines.from;
              result.endLine = lines.to;
          }
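This is the v1.1.1 fix "Extract complete chunk when splitting text": instead of trusting `doc.pageContent` (which may not reproduce the original lines verbatim), each chunk is rebuilt from the original text using the splitter's 1-based, inclusive `loc.lines` metadata. The index arithmetic in miniature:

```ts
// loc.lines is 1-based and inclusive; Array.prototype.slice is 0-based and
// end-exclusive, so { from: 2, to: 3 } maps to slice(1, 3).
const contentLines = ['line 1', 'line 2', 'line 3', 'line 4'];
const lines = { from: 2, to: 3 };
const chunk = contentLines.slice(lines.from - 1, lines.to).join('\n');
// chunk === 'line 2\nline 3'
```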
package/built/tokenize.d.ts CHANGED
@@ -1,9 +1,19 @@
  export declare const SymbolRegexes: Record<string, RegExp>;
  export declare function symbols(content: string, fileExtension: string, allowGeneric?: boolean): string[];
  export declare function words(content: string): string[];
+ /**
+  * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+  * no longer than the specified maximum length.
+  *
+  * @param content The content to split into batches
+  * @param batchSize The maximum number of lines per batch
+  * @param maxLineLength The maximum length of a line; longer lines are dropped
+  * @returns an array of batches of content
+  */
+ export declare function batch(content: string, batchSize?: number, maxLineLength?: number): string[];
  type FileTokens = {
      symbols: string[];
      words: string[];
  };
- export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): FileTokens;
+ export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): Promise<FileTokens>;
  export {};
package/built/tokenize.js CHANGED
@@ -6,6 +6,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.SymbolRegexes = void 0;
  exports.symbols = symbols;
  exports.words = words;
+ exports.batch = batch;
  exports.fileTokens = fileTokens;
  const query_keywords_1 = __importDefault(require("./query-keywords"));
  const debug_1 = __importDefault(require("debug"));
@@ -54,11 +55,40 @@ function symbols(content, fileExtension, allowGeneric = true) {
  function words(content) {
      return content.match(/\b\w+\b/g) ?? [];
  }
- function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
+ /**
+  * Prepares a string for tokenization by splitting it into batches of lines, each of which is
+  * no longer than the specified maximum length.
+  *
+  * @param content The content to split into batches
+  * @param batchSize The maximum number of lines per batch
+  * @param maxLineLength The maximum length of a line; longer lines are dropped
+  * @returns an array of batches of content
+  */
+ function batch(content, batchSize = 1000, maxLineLength = 1000) {
+     const lines = content.split('\n').filter(({ length }) => length <= maxLineLength);
+     const result = [];
+     for (let i = 0; i < lines.length; i += batchSize) {
+         result.push(lines.slice(i, i + batchSize).join('\n'));
+     }
+     return result;
+ }
+ async function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
      if (enableGenericSymbolParsing)
          debug('Using generic symbol parsing for file extension: %s', fileExtension);
-     const symbolList = (0, query_keywords_1.default)(symbols(content, fileExtension, enableGenericSymbolParsing)).sort();
-     const wordList = (0, query_keywords_1.default)(words(content)).sort();
+     const batches = batch(content);
+     const symbolList = [];
+     const wordList = [];
+     for (let i = 0; i < batches.length; ++i) {
+         if (i && i % 5 === 0) {
+             // Every 5th batch, yield to the event loop to avoid blocking it
+             await new Promise((resolve) => setImmediate(resolve));
+         }
+         const batch = batches[i];
+         symbolList.push(...(0, query_keywords_1.default)(symbols(batch, fileExtension, enableGenericSymbolParsing)));
+         wordList.push(...(0, query_keywords_1.default)(words(batch)));
+     }
+     symbolList.sort();
+     wordList.sort();
      // Iterate through words, with a corresponding pointer to symbols.
      // If the word at the word index does not match the symbol at the symbol index,
      // add the word to the output. Otherwise, advance both pointers. Repeat
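Together, `batch` and the `setImmediate` yield every fifth batch are the fixes for the chunking performance issue and for tokenization hanging the process: large files are tokenized in 1000-line slices, and the event loop gets a chance to run between groups of slices. Callers now receive a promise. A hedged usage sketch, assuming the package entry point re-exports `batch` and `fileTokens` as declared above:

```ts
import { batch, fileTokens } from '@appland/search'; // assumed re-export

const content = 'const x = 1;\n'.repeat(5000);
console.log(batch(content).length); // 6 batches of at most 1000 lines

// fileTokens is now async, so it must be awaited (top-level await in an ES module).
const { symbols, words } = await fileTokens(content, 'js');
console.log(symbols.length, words.length);
```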
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
      "name": "@appland/search",
-     "version": "1.1.0",
+     "version": "1.1.2",
      "description": "",
      "bin": "built/cli.js",
      "publishConfig": {