@appland/search 1.0.0

package/CHANGELOG.md ADDED
@@ -0,0 +1,6 @@
+ # @appland/search-v1.0.0 (2024-11-06)
+
+
+ ### Features
+
+ * @appland/search package ([dbf7d9a](https://github.com/getappmap/appmap-js/commit/dbf7d9a32593e19df9a8732f18e32227dcb53aca))
package/built/build-file-index.d.ts ADDED
@@ -0,0 +1,9 @@
+ import FileIndex from './file-index';
+ import { ContentReader } from './ioutil';
+ export type ListFn = (path: string) => Promise<string[]>;
+ export type FilterFn = (path: string) => PromiseLike<boolean>;
+ export type Tokenizer = (content: string, fileExtension: string) => {
+ symbols: string[];
+ words: string[];
+ };
+ export default function buildFileIndex(fileIndex: FileIndex, directories: string[], listDirectory: ListFn, fileFilter: FilterFn, contentReader: ContentReader, tokenizer: Tokenizer): Promise<void>;
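
A minimal sketch (not part of the package) of implementing the three callback types declared above; the helper names here are illustrative:

// Illustrative implementations of ListFn, FilterFn, and Tokenizer.
import { readdir } from 'fs/promises';
import type { ListFn, FilterFn, Tokenizer } from '@appland/search';

// List the entries of a single directory level.
const listDirectory: ListFn = (path) => readdir(path);

// Skip anything under node_modules.
const fileFilter: FilterFn = (path) => Promise.resolve(!path.includes('node_modules'));

// A trivial tokenizer: no symbol extraction, plain word matching.
const tokenizer: Tokenizer = (content, _fileExtension) => ({
  symbols: [],
  words: content.match(/\b\w+\b/g) ?? [],
});

In practice the package's own listProjectFiles, readFileSafe, and fileTokens (exported from the index) fill these roles, which is how built/cli.js wires them up.
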
package/built/build-file-index.js ADDED
@@ -0,0 +1,47 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = buildFileIndex;
+ const debug_1 = __importDefault(require("debug"));
+ const path_1 = require("path");
+ const console_1 = require("console");
+ const debug = (0, debug_1.default)('appmap:search:build-index');
+ async function indexFile(context, filePath) {
+ const fileContents = await context.contentReader(filePath);
+ if (!fileContents)
+ return;
+ const tokens = context.tokenizer(fileContents, filePath);
+ const symbols = tokens.symbols.join(' ');
+ const words = tokens.words.join(' ');
+ context.fileIndex.indexFile(context.baseDirectory, filePath, symbols, words);
+ }
+ async function indexDirectory(context, directory) {
+ const dirContents = await context.listDirectory(directory);
+ if (!dirContents)
+ return;
+ for (const dirContentItem of dirContents) {
+ const filePath = (0, path_1.join)(directory, dirContentItem);
+ debug('Indexing: %s', filePath);
+ if (await context.fileFilter(filePath)) {
+ indexFile(context, filePath).catch((e) => {
+ (0, console_1.warn)(`Error indexing file: ${filePath}`);
+ (0, console_1.warn)(e);
+ });
+ }
+ }
+ }
+ async function buildFileIndex(fileIndex, directories, listDirectory, fileFilter, contentReader, tokenizer) {
+ for (const directory of directories) {
+ const context = {
+ fileIndex,
+ baseDirectory: directory,
+ listDirectory,
+ fileFilter,
+ contentReader,
+ tokenizer,
+ };
+ await indexDirectory(context, directory);
+ }
+ }
package/built/build-snippet-index.d.ts ADDED
@@ -0,0 +1,9 @@
+ import { Tokenizer } from './build-file-index';
+ import { ContentReader } from './ioutil';
+ import SnippetIndex from './snippet-index';
+ import { Splitter } from './splitter';
+ export type File = {
+ directory: string;
+ filePath: string;
+ };
+ export default function buildSnippetIndex(snippetIndex: SnippetIndex, files: File[], contentReader: ContentReader, splitter: Splitter, tokenizer: Tokenizer): Promise<void>;
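
The File[] argument has the same shape as FileIndex search results, so file-search output can be fed directly to the snippet indexer (as built/cli.js does). A minimal sketch, assuming the package is consumed as @appland/search:

import Database from 'better-sqlite3';
import { SnippetIndex, buildSnippetIndex, langchainSplitter, readFileSafe, fileTokens } from '@appland/search';
import type { FileSearchResult } from '@appland/search';

async function indexSnippets(fileSearchResults: FileSearchResult[]): Promise<SnippetIndex> {
  // Chunk each file with the langchain splitter and tokenize each chunk.
  const snippetIndex = new SnippetIndex(new Database(':memory:'));
  await buildSnippetIndex(snippetIndex, fileSearchResults, readFileSafe, langchainSplitter, fileTokens);
  return snippetIndex;
}
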
package/built/build-snippet-index.js ADDED
@@ -0,0 +1,26 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = buildSnippetIndex;
+ async function indexFile(context, file) {
+ const fileContent = await context.contentReader(file.filePath);
+ if (!fileContent)
+ return;
+ const extension = file.filePath.split('.').pop() || '';
+ const chunks = await context.splitter(fileContent, extension);
+ chunks.forEach((chunk, index) => {
+ const snippetId = `${file.filePath}:${index}`;
+ const { content, startLine, endLine } = chunk;
+ context.snippetIndex.indexSnippet(snippetId, file.directory, file.filePath, startLine, endLine, context.tokenizer(content, file.filePath).symbols.join(' '), context.tokenizer(content, file.filePath).words.join(' '), content);
+ });
+ }
+ async function buildSnippetIndex(snippetIndex, files, contentReader, splitter, tokenizer) {
+ const context = {
+ snippetIndex,
+ contentReader,
+ splitter,
+ tokenizer,
+ };
+ for (const file of files) {
+ await indexFile(context, file);
+ }
+ }
package/built/cli.d.ts ADDED
@@ -0,0 +1 @@
+ export {};
package/built/cli.js ADDED
@@ -0,0 +1,105 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const yargs_1 = __importDefault(require("yargs"));
+ const helpers_1 = require("yargs/helpers");
+ const better_sqlite3_1 = __importDefault(require("better-sqlite3"));
+ const debug_1 = __importDefault(require("debug"));
+ const tokenize_1 = require("./tokenize");
+ const file_index_1 = __importDefault(require("./file-index"));
+ const build_file_index_1 = __importDefault(require("./build-file-index"));
+ const project_files_1 = __importDefault(require("./project-files"));
+ const file_type_1 = require("./file-type");
+ const snippet_index_1 = __importDefault(require("./snippet-index"));
+ const build_snippet_index_1 = __importDefault(require("./build-snippet-index"));
+ const ioutil_1 = require("./ioutil");
+ const splitter_1 = require("./splitter");
+ const assert_1 = __importDefault(require("assert"));
+ const debug = (0, debug_1.default)('appmap:search:cli');
+ const cli = (0, yargs_1.default)((0, helpers_1.hideBin)(process.argv))
+ .command('* <query>', 'Index directories and perform a search', (yargs) => {
+ return yargs
+ .option('directories', {
+ alias: 'd',
+ type: 'array',
+ description: 'List of directories to index',
+ default: ['.'],
+ })
+ .option('file-filter', {
+ type: 'string',
+ description: 'Regex pattern to filter files',
+ })
+ .positional('query', {
+ describe: 'Search query',
+ type: 'string',
+ })
+ .strict();
+ }, async (argv) => {
+ const { directories, query } = argv;
+ let filterRE;
+ if (argv.fileFilter)
+ filterRE = new RegExp(argv.fileFilter);
+ const fileFilter = async (path) => {
+ debug('Filtering: %s', path);
+ if ((0, file_type_1.isBinaryFile)(path)) {
+ debug('Skipping binary file: %s', path);
+ return false;
+ }
+ const isData = (0, file_type_1.isDataFile)(path);
+ if (isData && (await (0, file_type_1.isLargeFile)(path))) {
+ debug('Skipping large data file: %s', path);
+ return false;
+ }
+ if (!filterRE)
+ return true;
+ return !filterRE.test(path);
+ };
+ const db = new better_sqlite3_1.default(':memory:');
+ const fileIndex = new file_index_1.default(db);
+ await (0, build_file_index_1.default)(fileIndex, directories, project_files_1.default, fileFilter, ioutil_1.readFileSafe, tokenize_1.fileTokens);
+ const filePathAtMostThreeEntries = (filePath) => {
+ const parts = filePath.split('/');
+ if (parts.length <= 3)
+ return filePath;
+ return `.../${parts.slice(-3).join('/')}`;
+ };
+ const printResult = (filePath, score) => console.log('%s %s', filePathAtMostThreeEntries(filePath), score.toPrecision(3));
+ console.log('File search results');
+ console.log('-------------------');
+ const fileSearchResults = fileIndex.search(query);
+ for (const result of fileSearchResults) {
+ const { filePath, score } = result;
+ printResult(filePath, score);
+ }
+ const splitter = splitter_1.langchainSplitter;
+ const snippetIndex = new snippet_index_1.default(db);
+ await (0, build_snippet_index_1.default)(snippetIndex, fileSearchResults, ioutil_1.readFileSafe, splitter, tokenize_1.fileTokens);
+ console.log('');
+ console.log('Snippet search results');
+ console.log('----------------------');
+ const isNullOrUndefined = (value) => value === null || value === undefined;
+ const snippetSearchResults = snippetIndex.searchSnippets(query);
+ for (const result of snippetSearchResults) {
+ const { snippetId, filePath, startLine, endLine, score } = result;
+ printResult(snippetId, score);
+ if (isNullOrUndefined(startLine) || isNullOrUndefined(endLine))
+ continue;
+ const content = await (0, ioutil_1.readFileSafe)(filePath);
+ if (!content)
+ continue;
+ (0, assert_1.default)(startLine !== undefined);
+ (0, assert_1.default)(endLine !== undefined);
+ const lines = content.split('\n').slice(startLine - 1, endLine);
+ console.log(lines.map((l) => ` > ${l}`).join('\n'));
+ }
+ db.close();
+ })
+ .help().argv;
+ if (cli instanceof Promise) {
+ cli.catch((e) => {
+ console.error(e);
+ process.exit(1);
+ });
+ }
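
For reference, a local invocation of this CLI would look something like: node built/cli.js "file tokenizer" -d src lib --file-filter "\.spec\." — it indexes the given directories into an in-memory database, prints file search results, then splits the top-ranked files into snippets and prints snippet results. Note that paths matching --file-filter are excluded, since the filter callback returns false on a match.
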
package/built/file-index.d.ts ADDED
@@ -0,0 +1,29 @@
+ import sqlite3 from 'better-sqlite3';
+ export type FileSearchResult = {
+ directory: string;
+ filePath: string;
+ score: number;
+ };
+ /**
+ * The FileIndex class provides an interface to interact with the SQLite search index.
+ *
+ * The primary responsibilities of this class include:
+ * 1. Indexing files by storing their directory paths, file paths, symbols (e.g., class names, method names), and
+ * general words in the database. Symbols are given more weight in the search results.
+ * 2. Boosting the relevance score of specific files based on external factors, such as AppMap trace data or error logs.
+ * 3. Performing search queries on the indexed files using full-text search with BM25 ranking. The search results are
+ * influenced by both the indexed content and any associated boost factors.
+ *
+ * The class uses two SQLite tables:
+ * - `file_content`: A virtual table that holds the file content and allows for full-text search using BM25 ranking.
+ * - `file_boost`: A table that stores boost factors for specific files to enhance their search relevance.
+ */
+ export default class FileIndex {
+ #private;
+ database: sqlite3.Database;
+ constructor(database: sqlite3.Database);
+ indexFile(directory: string, filePath: string, symbols: string, words: string): void;
+ boostFile(filePath: string, boostFactor: number): void;
+ search(query: string, limit?: number): FileSearchResult[];
+ close(): void;
+ }
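
A minimal sketch (not part of the package) of using FileIndex directly; the symbols and words arguments are space-separated token lists such as fileTokens produces:

import Database from 'better-sqlite3';
import { FileIndex } from '@appland/search';

const index = new FileIndex(new Database(':memory:'));
index.indexFile('.', 'src/user.ts', 'User findUser', 'user record lookup');
index.boostFile('src/user.ts', 2.0); // double this file's relevance
for (const { filePath, score } of index.search('user', 5)) console.log(filePath, score);
index.close();
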
package/built/file-index.js ADDED
@@ -0,0 +1,96 @@
+ "use strict";
+ var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
+ if (kind === "m") throw new TypeError("Private method is not writable");
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a setter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot write private member to an object whose class did not declare it");
+ return (kind === "a" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;
+ };
+ var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a getter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot read private member from an object whose class did not declare it");
+ return kind === "m" ? f : kind === "a" ? f.call(receiver) : f ? f.value : state.get(receiver);
+ };
+ var _FileIndex_insert, _FileIndex_updateBoost, _FileIndex_search;
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const CREATE_TABLE_SQL = `CREATE VIRTUAL TABLE file_content USING fts5(
+ directory UNINDEXED,
+ file_path,
+ file_symbols,
+ file_words,
+ tokenize = 'porter unicode61'
+ )`;
+ const CREATE_BOOST_TABLE_SQL = `CREATE TABLE file_boost (
+ file_path TEXT PRIMARY KEY,
+ boost_factor REAL
+ )`;
+ const INSERT_SQL = `INSERT INTO file_content (directory, file_path, file_symbols, file_words)
+ VALUES (?, ?, ?, ?)`;
+ const UPDATE_BOOST_SQL = `INSERT OR REPLACE INTO file_boost (file_path, boost_factor)
+ VALUES (?, ?)`;
+ const SEARCH_SQL = `SELECT
+ file_content.directory,
+ file_content.file_path,
+ (bm25(file_content, 1)*3.0 + bm25(file_content, 2)*2.0 + bm25(file_content, 3)*1.0)
+ * COALESCE(file_boost.boost_factor, 1.0) * -1
+ AS score
+ FROM
+ file_content
+ LEFT JOIN
+ file_boost
+ ON
+ file_content.file_path = file_boost.file_path
+ WHERE
+ file_content MATCH ?
+ ORDER BY
+ score DESC
+ LIMIT
+ ?
+ `;
+ /**
+ * The FileIndex class provides an interface to interact with the SQLite search index.
+ *
+ * The primary responsibilities of this class include:
+ * 1. Indexing files by storing their directory paths, file paths, symbols (e.g., class names, method names), and
+ * general words in the database. Symbols are given more weight in the search results.
+ * 2. Boosting the relevance score of specific files based on external factors, such as AppMap trace data or error logs.
+ * 3. Performing search queries on the indexed files using full-text search with BM25 ranking. The search results are
+ * influenced by both the indexed content and any associated boost factors.
+ *
+ * The class uses two SQLite tables:
+ * - `file_content`: A virtual table that holds the file content and allows for full-text search using BM25 ranking.
+ * - `file_boost`: A table that stores boost factors for specific files to enhance their search relevance.
+ */
+ class FileIndex {
+ constructor(database) {
+ this.database = database;
+ _FileIndex_insert.set(this, void 0);
+ _FileIndex_updateBoost.set(this, void 0);
+ _FileIndex_search.set(this, void 0);
+ this.database.exec(CREATE_TABLE_SQL);
+ this.database.exec(CREATE_BOOST_TABLE_SQL);
+ this.database.pragma('journal_mode = OFF');
+ this.database.pragma('synchronous = OFF');
+ __classPrivateFieldSet(this, _FileIndex_insert, this.database.prepare(INSERT_SQL), "f");
+ __classPrivateFieldSet(this, _FileIndex_updateBoost, this.database.prepare(UPDATE_BOOST_SQL), "f");
+ __classPrivateFieldSet(this, _FileIndex_search, this.database.prepare(SEARCH_SQL), "f");
+ }
+ indexFile(directory, filePath, symbols, words) {
+ __classPrivateFieldGet(this, _FileIndex_insert, "f").run(directory, filePath, symbols, words);
+ }
+ boostFile(filePath, boostFactor) {
+ __classPrivateFieldGet(this, _FileIndex_updateBoost, "f").run(filePath, boostFactor);
+ }
+ search(query, limit = 10) {
+ const rows = __classPrivateFieldGet(this, _FileIndex_search, "f").all(query, limit);
+ return rows.map((row) => ({
+ directory: row.directory,
+ filePath: row.file_path,
+ score: row.score,
+ }));
+ }
+ close() {
+ this.database.close();
+ }
+ }
+ _FileIndex_insert = new WeakMap(), _FileIndex_updateBoost = new WeakMap(), _FileIndex_search = new WeakMap();
+ exports.default = FileIndex;
package/built/file-type.d.ts ADDED
@@ -0,0 +1,3 @@
+ export declare const isLargeFile: (fileName: string) => Promise<boolean>;
+ export declare const isBinaryFile: (fileName: string) => boolean;
+ export declare const isDataFile: (fileName: string) => boolean;
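
These predicates combine into the filtering rule built/cli.js uses: skip binary files outright, and skip data files only when they exceed the large-file threshold (50,000 bytes by default, overridable via APPMAP_LARGE_FILE). A sketch:

import { isBinaryFile, isDataFile, isLargeFile } from '@appland/search';

async function shouldIndex(path: string): Promise<boolean> {
  if (isBinaryFile(path)) return false;
  if (isDataFile(path) && (await isLargeFile(path))) return false;
  return true;
}
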
package/built/file-type.js ADDED
@@ -0,0 +1,117 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.isDataFile = exports.isBinaryFile = exports.isLargeFile = void 0;
+ const promises_1 = require("fs/promises");
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:file-type');
+ const BINARY_FILE_EXTENSIONS = [
+ '7z',
+ 'aac',
+ 'avi',
+ 'bmp',
+ 'bz2',
+ 'class',
+ 'dll',
+ 'doc',
+ 'docx',
+ 'dylib',
+ 'ear',
+ 'exe',
+ 'eot',
+ 'flac',
+ 'flv',
+ 'gif',
+ 'gz',
+ 'ico',
+ 'jar',
+ 'jpeg',
+ 'jpg',
+ 'js.map',
+ 'min.js',
+ 'min.css',
+ 'mkv',
+ 'mo',
+ 'mov',
+ 'mp3',
+ 'mp4',
+ 'mpg',
+ 'odt',
+ 'odp',
+ 'ods',
+ 'ogg',
+ 'otf',
+ 'pdf',
+ 'po',
+ 'png',
+ 'ppt',
+ 'pptx',
+ 'pyc',
+ 'rar',
+ 'rtf',
+ 'so',
+ 'svg',
+ 'tar',
+ 'tiff',
+ 'ttf',
+ 'wav',
+ 'webm',
+ 'webp',
+ 'woff',
+ 'woff2',
+ 'wmv',
+ 'xls',
+ 'xlsx',
+ 'xz',
+ 'yarn.lock',
+ 'zip',
+ ].map((ext) => '.' + ext);
+ const DATA_FILE_EXTENSIONS = [
+ 'cjs',
+ 'csv',
+ 'dat',
+ 'log',
+ 'json',
+ 'tsv',
+ 'yaml',
+ 'yml',
+ 'xml',
+ ].map((ext) => '.' + ext);
+ const DEFAULT_LARGE_FILE_THRESHOLD = 50_000;
+ const largeFileThreshold = () => {
+ const value = process.env.APPMAP_LARGE_FILE;
+ if (value === undefined)
+ return DEFAULT_LARGE_FILE_THRESHOLD;
+ return parseInt(value);
+ };
+ const statFileSafe = async (filePath) => {
+ try {
+ const stats = await (0, promises_1.stat)(filePath);
+ return stats.size;
+ }
+ catch (error) {
+ debug(`Error reading file: %s`, filePath);
+ debug(error);
+ return undefined;
+ }
+ };
+ const isLargeFile = async (fileName) => {
+ const fileSize = await statFileSafe(fileName);
+ if (fileSize === undefined)
+ return false;
+ const isLarge = fileSize > largeFileThreshold();
+ if (isLarge)
+ debug('File %s is considered large due to size %d', fileName, fileSize);
+ return fileSize > largeFileThreshold();
+ };
+ exports.isLargeFile = isLargeFile;
+ const isBinaryFile = (fileName) => {
+ return BINARY_FILE_EXTENSIONS.some((ext) => fileName.endsWith(ext));
+ };
+ exports.isBinaryFile = isBinaryFile;
+ const isDataFile = (fileName) => {
+ return DATA_FILE_EXTENSIONS.some((ext) => fileName.endsWith(ext));
+ };
+ exports.isDataFile = isDataFile;
package/built/git.d.ts ADDED
@@ -0,0 +1,19 @@
+ import { PathLike } from 'fs';
+ export declare enum GitState {
+ NotInstalled = 0,// The git cli was not found.
+ NoRepository = 1,// Git is installed but no repository was found.
+ Ok = 2
+ }
+ export declare const GitRepositoryEnvKeys: readonly ["GITHUB_REPOSITORY", "CIRCLE_REPOSITORY_URL", "GIT_URL", "CI_REPOSITORY_URL"];
+ export declare const GitBranchEnvKeys: readonly ["GITHUB_REF_NAME", "CIRCLE_BRANCH", "GIT_BRANCH", "TRAVIS_BRANCH", "CI_COMMIT_REF_NAME"];
+ export declare const GitCommitEnvKeys: readonly ["GITHUB_SHA", "CIRCLE_SHA1", "GIT_COMMIT", "TRAVIS_COMMIT", "CI_COMMIT_SHA"];
+ declare class GitProperties {
+ static contributors(sinceDaysAgo: number, cwd?: PathLike): Promise<Array<string>>;
+ static repository(cwd?: PathLike): Promise<string | undefined>;
+ static branch(cwd?: PathLike): Promise<string | undefined>;
+ static commit(cwd?: PathLike): Promise<string | undefined>;
+ static state(cwd?: PathLike): Promise<GitState>;
+ static clearCache(): void;
+ }
+ export declare const Git: typeof GitProperties;
+ export {};
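
Git is not re-exported from the package index, so the sketch below assumes a deep import of the built module (package.json declares no exports map that would forbid it). Results are cached per method and argument list until clearCache() is called:

import { Git, GitState } from '@appland/search/built/git';

async function describeRepo(dir: string) {
  if ((await Git.state(dir)) !== GitState.Ok) return undefined;
  return {
    repository: await Git.repository(dir),
    branch: await Git.branch(dir),
    commit: await Git.commit(dir),
    contributors: await Git.contributors(30, dir), // unique author emails from the last 30 days
  };
}
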
package/built/git.js ADDED
@@ -0,0 +1,156 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.Git = exports.GitCommitEnvKeys = exports.GitBranchEnvKeys = exports.GitRepositoryEnvKeys = exports.GitState = void 0;
+ const child_process_1 = require("child_process");
+ const util_1 = require("util");
+ const exec = (0, util_1.promisify)(child_process_1.exec);
+ var GitState;
+ (function (GitState) {
+ GitState[GitState["NotInstalled"] = 0] = "NotInstalled";
+ GitState[GitState["NoRepository"] = 1] = "NoRepository";
+ GitState[GitState["Ok"] = 2] = "Ok";
+ })(GitState || (exports.GitState = GitState = {}));
+ exports.GitRepositoryEnvKeys = [
+ 'GITHUB_REPOSITORY', // GitHub
+ 'CIRCLE_REPOSITORY_URL', // CircleCI
+ 'GIT_URL', // Jenkins
+ 'CI_REPOSITORY_URL', // GitLab
+ ];
+ exports.GitBranchEnvKeys = [
+ 'GITHUB_REF_NAME', // GitHub
+ 'CIRCLE_BRANCH', // CircleCI
+ 'GIT_BRANCH', // Jenkins
+ 'TRAVIS_BRANCH', // TravisCI
+ 'CI_COMMIT_REF_NAME', // GitLab
+ ];
+ exports.GitCommitEnvKeys = [
+ 'GITHUB_SHA', // GitHub
+ 'CIRCLE_SHA1', // CircleCI
+ 'GIT_COMMIT', // Jenkins
+ 'TRAVIS_COMMIT', // TravisCI
+ 'CI_COMMIT_SHA', // GitLab
+ ];
+ class GitProperties {
+ static async contributors(sinceDaysAgo, cwd) {
+ const unixTimeNow = Math.floor(Number(new Date()) / 1000);
+ const unixTimeAgo = unixTimeNow - sinceDaysAgo * 24 * 60 * 60;
+ try {
+ const { stdout } = await exec([
+ 'git',
+ cwd && `-C ${cwd.toString()}`,
+ '--no-pager',
+ 'log',
+ `--since=${unixTimeAgo}`,
+ '--format="%ae"',
+ ].join(' '));
+ return [
+ ...stdout
+ .trim()
+ .split('\n')
+ .reduce((acc, email) => {
+ acc.add(email);
+ return acc;
+ }, new Set()),
+ ];
+ }
+ catch {
+ return [];
+ }
+ }
+ // Returns the repository URL, first by checking the environment, then by
+ // shelling out to git.
+ static async repository(cwd) {
+ const envKey = exports.GitRepositoryEnvKeys.find((key) => process.env[key]);
+ if (envKey)
+ return process.env[envKey];
+ try {
+ const { stdout } = await exec(['git', cwd && `-C ${cwd.toString()}`, 'config', '--get', 'remote.origin.url'].join(' '));
+ return stdout.trim();
+ }
+ catch {
+ return undefined;
+ }
+ }
+ // Returns the branch, first by checking the environment, then by
+ // shelling out to git.
+ static async branch(cwd) {
+ const envKey = exports.GitBranchEnvKeys.find((key) => process.env[key]);
+ if (envKey)
+ return process.env[envKey];
+ try {
+ const { stdout } = await exec(['git', cwd && `-C ${cwd.toString()}`, 'rev-parse', '--abbrev-ref', 'HEAD'].join(' '));
+ return stdout.trim();
+ }
+ catch {
+ return undefined;
+ }
+ }
+ // Returns the commit SHA, first by checking the environment, then by
+ // shelling out to git.
+ static async commit(cwd) {
+ const envKey = exports.GitCommitEnvKeys.find((key) => process.env[key]);
+ if (envKey)
+ return process.env[envKey];
+ try {
+ const { stdout } = await exec(['git', cwd && `-C ${cwd.toString()}`, 'rev-parse', 'HEAD'].join(' '));
+ return stdout.trim();
+ }
+ catch {
+ return undefined;
+ }
+ }
+ static async state(cwd) {
+ return new Promise((resolve) => {
+ try {
+ const commandProcess = (0, child_process_1.spawn)('git', ['status', '--porcelain'], {
+ shell: true,
+ cwd: cwd?.toString(),
+ stdio: 'ignore',
+ timeout: 2000,
+ });
+ commandProcess.on('exit', (code) => {
+ switch (code) {
+ case 127:
+ return resolve(GitState.NotInstalled);
+ case 128:
+ return resolve(GitState.NoRepository);
+ default:
+ return resolve(GitState.Ok);
+ }
+ });
+ commandProcess.on('error', () => resolve(GitState.NotInstalled));
+ }
+ catch {
+ resolve(GitState.NotInstalled);
+ }
+ });
+ }
+ static clearCache() {
+ gitCache.clear();
+ }
+ }
+ const gitCache = new Map();
+ const noCacheList = ['clearCache'];
+ // GitProperties is available externally as Git.
+ // This export provides a simple caching layer around GitProperties to avoid
+ // excessive shelling out to git.
+ exports.Git = new Proxy(GitProperties, {
+ get(target, prop) {
+ if (!noCacheList.includes(prop.toString()) &&
+ typeof target[prop] === 'function') {
+ return new Proxy(target[prop], {
+ apply(target, thisArg, argArray) {
+ const cacheKey = `${prop.toString()}(${JSON.stringify(argArray)})`;
+ if (gitCache.has(cacheKey)) {
+ return gitCache.get(cacheKey);
+ }
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-function-type
+ const result = Reflect.apply(target, thisArg, argArray);
+ gitCache.set(cacheKey, result);
+ return result;
+ },
+ });
+ }
+ return Reflect.get(target, prop);
+ },
+ });
package/built/index.d.ts ADDED
@@ -0,0 +1,9 @@
+ export { ContentReader, readFileSafe } from './ioutil';
+ export { Splitter, langchainSplitter } from './splitter';
+ export { ListFn, FilterFn, Tokenizer, default as buildFileIndex } from './build-file-index';
+ export { File, default as buildSnippetIndex } from './build-snippet-index';
+ export { default as SnippetIndex, SnippetSearchResult } from './snippet-index';
+ export { default as FileIndex, FileSearchResult } from './file-index';
+ export { default as listProjectFiles } from './project-files';
+ export { isBinaryFile, isDataFile, isLargeFile } from './file-type';
+ export { fileTokens } from './tokenize';
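
A sketch of the public API end to end, mirroring built/cli.js: build a file index, feed its top results into a snippet index, then query the snippets. The import specifier assumes the package is installed as @appland/search:

import Database from 'better-sqlite3';
import {
  FileIndex,
  SnippetIndex,
  buildFileIndex,
  buildSnippetIndex,
  listProjectFiles,
  readFileSafe,
  fileTokens,
  langchainSplitter,
  isBinaryFile,
} from '@appland/search';

async function search(query: string, directories: string[]) {
  const db = new Database(':memory:');
  // Both indexes can share one SQLite database; they use separate tables.
  const fileIndex = new FileIndex(db);
  await buildFileIndex(fileIndex, directories, listProjectFiles, async (path) => !isBinaryFile(path), readFileSafe, fileTokens);
  const files = fileIndex.search(query);

  const snippetIndex = new SnippetIndex(db);
  await buildSnippetIndex(snippetIndex, files, readFileSafe, langchainSplitter, fileTokens);
  const snippets = snippetIndex.searchSnippets(query);
  db.close();
  return snippets;
}
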
package/built/index.js ADDED
@@ -0,0 +1,26 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.fileTokens = exports.isLargeFile = exports.isDataFile = exports.isBinaryFile = exports.listProjectFiles = exports.FileIndex = exports.SnippetIndex = exports.buildSnippetIndex = exports.buildFileIndex = exports.langchainSplitter = exports.readFileSafe = void 0;
+ var ioutil_1 = require("./ioutil");
+ Object.defineProperty(exports, "readFileSafe", { enumerable: true, get: function () { return ioutil_1.readFileSafe; } });
+ var splitter_1 = require("./splitter");
+ Object.defineProperty(exports, "langchainSplitter", { enumerable: true, get: function () { return splitter_1.langchainSplitter; } });
+ var build_file_index_1 = require("./build-file-index");
+ Object.defineProperty(exports, "buildFileIndex", { enumerable: true, get: function () { return __importDefault(build_file_index_1).default; } });
+ var build_snippet_index_1 = require("./build-snippet-index");
+ Object.defineProperty(exports, "buildSnippetIndex", { enumerable: true, get: function () { return __importDefault(build_snippet_index_1).default; } });
+ var snippet_index_1 = require("./snippet-index");
+ Object.defineProperty(exports, "SnippetIndex", { enumerable: true, get: function () { return __importDefault(snippet_index_1).default; } });
+ var file_index_1 = require("./file-index");
+ Object.defineProperty(exports, "FileIndex", { enumerable: true, get: function () { return __importDefault(file_index_1).default; } });
+ var project_files_1 = require("./project-files");
+ Object.defineProperty(exports, "listProjectFiles", { enumerable: true, get: function () { return __importDefault(project_files_1).default; } });
+ var file_type_1 = require("./file-type");
+ Object.defineProperty(exports, "isBinaryFile", { enumerable: true, get: function () { return file_type_1.isBinaryFile; } });
+ Object.defineProperty(exports, "isDataFile", { enumerable: true, get: function () { return file_type_1.isDataFile; } });
+ Object.defineProperty(exports, "isLargeFile", { enumerable: true, get: function () { return file_type_1.isLargeFile; } });
+ var tokenize_1 = require("./tokenize");
+ Object.defineProperty(exports, "fileTokens", { enumerable: true, get: function () { return tokenize_1.fileTokens; } });
package/built/ioutil.d.ts ADDED
@@ -0,0 +1,2 @@
+ export type ContentReader = (filePath: string) => PromiseLike<string | undefined>;
+ export declare function readFileSafe(filePath: string): PromiseLike<string | undefined>;
package/built/ioutil.js ADDED
@@ -0,0 +1,19 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.readFileSafe = readFileSafe;
+ const fs_1 = require("fs");
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:ioutil');
+ function readFileSafe(filePath) {
+ try {
+ return Promise.resolve((0, fs_1.readFileSync)(filePath, 'utf8'));
+ }
+ catch (error) {
+ debug(`Error reading file: %s`, filePath);
+ debug(error);
+ return Promise.resolve(undefined);
+ }
+ }
package/built/project-files.d.ts ADDED
@@ -0,0 +1,3 @@
+ export default function listProjectFiles(directory: string): Promise<string[]>;
+ export declare function listGitProjectFiles(directory: string): Promise<string[]>;
+ export declare function listLikelyProjectFiles(directory: string): Promise<string[]>;
package/built/project-files.js ADDED
@@ -0,0 +1,106 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = listProjectFiles;
+ exports.listGitProjectFiles = listGitProjectFiles;
+ exports.listLikelyProjectFiles = listLikelyProjectFiles;
+ const util_1 = require("util");
+ const debug_1 = __importDefault(require("debug"));
+ const node_child_process_1 = require("node:child_process");
+ const git_1 = require("./git");
+ const assert_1 = __importDefault(require("assert"));
+ const promises_1 = require("fs/promises");
+ const path_1 = require("path");
+ const debug = (0, debug_1.default)('appmap:search:project-files');
+ const exec = (0, util_1.promisify)(node_child_process_1.exec);
+ async function listProjectFiles(directory) {
+ const gitState = await git_1.Git.state(directory);
+ debug(`Git state: %s`, gitState);
+ return gitState === git_1.GitState.Ok
+ ? await listGitProjectFiles(directory)
+ : await listLikelyProjectFiles(directory);
+ }
+ // Run git ls-files and git status to get a list of all git-managed files. By doing it this way,
+ // we automatically apply any .gitignore rules.
+ async function listGitProjectFiles(directory) {
+ const lsFiles = async () => {
+ try {
+ const { stdout } = await exec('git ls-files', {
+ cwd: directory,
+ maxBuffer: 1024 ** 2 * 20, // 20 MB
+ });
+ debug(stdout);
+ return stdout.split('\n').filter(Boolean);
+ }
+ catch (e) {
+ debug('`git ls-files` failed: %s', e);
+ return [];
+ }
+ };
+ const statusFiles = async () => {
+ try {
+ const { stdout } = await exec('git status --porcelain', {
+ cwd: directory,
+ maxBuffer: 1024 ** 2 * 20, // 20 MB
+ });
+ debug(stdout);
+ return stdout
+ .split('\n')
+ .map((line) => {
+ // git status --porcelain output starts with 3 characters: staged status, unstaged status,
+ // and a space.
+ return line.slice(3);
+ })
+ .filter(Boolean);
+ }
+ catch (e) {
+ debug('`git status --porcelain` failed: %s', e);
+ return [];
+ }
+ };
+ return Array.from(new Set([...(await lsFiles()), ...(await statusFiles())]));
+ }
+ const IGNORE_DIRECTORIES = new Set([
+ '.git',
+ '.venv',
+ '.yarn',
+ 'node_modules',
+ 'vendor',
+ 'build',
+ 'built',
+ 'dist',
+ 'out',
+ 'target',
+ 'tmp',
+ 'venv',
+ ]);
+ // Produce a modest-sized listing of files in the project.
+ // Ignore a standard list of binary file extensions and directories that tend to be full of
+ // non-source files.
+ async function listLikelyProjectFiles(directory) {
+ const files = new Array();
+ const ignoreDirectory = (dir) => IGNORE_DIRECTORIES.has(dir);
+ // Perform a breadth-first traversal of a directory, collecting all non-binary files and
+ // applying the directory ignore list.
+ const processDir = async (dir) => {
+ const queue = [dir];
+ while (queue.length > 0) {
+ const currentDir = queue.shift();
+ (0, assert_1.default)(currentDir, 'queue should not be empty');
+ const entries = await (0, promises_1.readdir)(currentDir, { withFileTypes: true });
+ for (const entry of entries) {
+ const path = (0, path_1.join)(currentDir, entry.name);
+ if (entry.isDirectory()) {
+ if (!ignoreDirectory(entry.name))
+ queue.push(path);
+ }
+ else
+ files.push((0, path_1.relative)(dir, path));
+ }
+ }
+ };
+ await processDir(directory);
+ return files;
+ }
package/built/query-keywords.d.ts ADDED
@@ -0,0 +1,8 @@
+ /**
+ * Extract keywords from a string or an array of strings. The extraction process includes the following steps:
+ *
+ * - Remove non-alphanumeric characters and split the keyword on spaces.
+ * - Split camelized words.
+ * - Remove stop words.
+ */
+ export default function queryKeywords(words: undefined | string | string[]): string[];
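
queryKeywords is not re-exported from the package index, so the sketch below assumes a deep import of the built module. Camelized words are split and lowercased, adjacent pairs are also emitted as joined terms, and stop words are dropped:

import queryKeywords from '@appland/search/built/query-keywords';

// Expected to yield something like ['user', 'login', 'userlogin', 'error'].
console.log(queryKeywords('the userLogin error'));
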
package/built/query-keywords.js ADDED
@@ -0,0 +1,76 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = queryKeywords;
+ const split_camelized_1 = require("./split-camelized");
+ const STOP_WORDS = new Set([
+ 'a',
+ 'an',
+ 'and',
+ 'are',
+ 'as',
+ 'at',
+ 'be',
+ 'by',
+ 'code',
+ 'for',
+ 'from',
+ 'has',
+ 'he',
+ 'in',
+ 'is',
+ 'it',
+ 'its',
+ 'of',
+ 'on',
+ 'over',
+ 'that',
+ 'the',
+ 'to',
+ 'was',
+ 'were',
+ 'will',
+ 'with',
+ 'without',
+ ]);
+ /**
+ * Replace non-alphanumeric characters with spaces, then split the keyword on spaces.
+ * So in effect, words with non-alphanumeric characters become multiple words.
+ * Allow dash and underscore as delimiters.
+ */
+ const sanitizeKeyword = (keyword) => keyword.replace(/[^\p{L}\p{N}\-_]/gu, ' ').split(' ');
+ /**
+ * Extract keywords from a string or an array of strings. The extraction process includes the following steps:
+ *
+ * - Remove non-alphanumeric characters and split the keyword on spaces.
+ * - Split camelized words.
+ * - Remove stop words.
+ */
+ function queryKeywords(words) {
+ if (!words)
+ return [];
+ const wordsArray = Array.isArray(words) ? words : [words];
+ if (wordsArray.length === 0)
+ return [];
+ return wordsArray
+ .map((word) => sanitizeKeyword(word || ''))
+ .flat()
+ .filter(Boolean)
+ .map((word) => {
+ const camelized = (0, split_camelized_1.splitCamelized)(word)
+ .split(/[\s\-_]/)
+ .map((word) => word.toLowerCase());
+ // Return each of the component words, and also return each pair of adjacent words as a single word.
+ const result = new Array();
+ for (let i = 0; i < camelized.length; i++) {
+ result.push(camelized[i]);
+ if (i > 0)
+ result.push([camelized[i - 1] + camelized[i]].join(''));
+ }
+ return result;
+ })
+ .flat()
+ .map((str) => str.trim())
+ .filter(Boolean)
+ .filter((str) => str.length >= 2)
+ .filter((str) => !STOP_WORDS.has(str));
+ }
package/built/snippet-index.d.ts ADDED
@@ -0,0 +1,19 @@
+ import sqlite3 from 'better-sqlite3';
+ export type SnippetSearchResult = {
+ snippetId: string;
+ directory: string;
+ filePath: string;
+ startLine: number | undefined;
+ endLine: number | undefined;
+ score: number;
+ content: string;
+ };
+ export default class SnippetIndex {
+ #private;
+ database: sqlite3.Database;
+ constructor(database: sqlite3.Database);
+ indexSnippet(snippetId: string, directory: string, filePath: string, startLine: number | undefined, endLine: number | undefined, symbols: string, words: string, content: string): void;
+ boostSnippet(snippetId: string, boostFactor: number): void;
+ searchSnippets(query: string, limit?: number): SnippetSearchResult[];
+ close(): void;
+ }
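
A minimal sketch (not part of the package) of indexing and searching snippets directly; the snippet id format is up to the caller, and buildSnippetIndex uses `${filePath}:${chunkIndex}`:

import Database from 'better-sqlite3';
import { SnippetIndex } from '@appland/search';

const snippets = new SnippetIndex(new Database(':memory:'));
snippets.indexSnippet('src/auth.ts:0', '.', 'src/auth.ts', 1, 20, 'login authenticate', 'session token', 'function login() { /* ... */ }');
snippets.boostSnippet('src/auth.ts:0', 1.5); // raise this snippet's relevance
console.log(snippets.searchSnippets('login', 5));
snippets.close();
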
package/built/snippet-index.js ADDED
@@ -0,0 +1,94 @@
+ "use strict";
+ var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
+ if (kind === "m") throw new TypeError("Private method is not writable");
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a setter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot write private member to an object whose class did not declare it");
+ return (kind === "a" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;
+ };
+ var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a getter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot read private member from an object whose class did not declare it");
+ return kind === "m" ? f : kind === "a" ? f.call(receiver) : f ? f.value : state.get(receiver);
+ };
+ var _SnippetIndex_insertSnippet, _SnippetIndex_updateSnippetBoost, _SnippetIndex_searchSnippet;
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const CREATE_SNIPPET_CONTENT_TABLE_SQL = `CREATE VIRTUAL TABLE snippet_content USING fts5(
+ snippet_id UNINDEXED,
+ directory UNINDEXED,
+ file_path,
+ start_line UNINDEXED,
+ end_line UNINDEXED,
+ file_symbols,
+ file_words,
+ content UNINDEXED,
+ tokenize = 'porter unicode61'
+ )`;
+ const CREATE_SNIPPET_BOOST_TABLE_SQL = `CREATE TABLE snippet_boost (
+ snippet_id TEXT PRIMARY KEY,
+ boost_factor REAL
+ )`;
+ const INSERT_SNIPPET_SQL = `INSERT INTO snippet_content
+ (snippet_id, directory, file_path, start_line, end_line, file_symbols, file_words, content)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`;
+ const UPDATE_SNIPPET_BOOST_SQL = `INSERT OR REPLACE INTO snippet_boost
+ (snippet_id, boost_factor)
+ VALUES (?, ?)`;
+ const SEARCH_SNIPPET_SQL = `SELECT
+ snippet_content.directory,
+ snippet_content.file_path,
+ snippet_content.start_line,
+ snippet_content.end_line,
+ snippet_content.snippet_id,
+ snippet_content.content,
+ (bm25(snippet_content, 1)*3.0 + bm25(snippet_content, 2)*2.0 + bm25(snippet_content, 3)*1.0)
+ * COALESCE(snippet_boost.boost_factor, 1.0) * -1
+ AS score
+ FROM
+ snippet_content
+ LEFT JOIN
+ snippet_boost
+ ON
+ snippet_content.snippet_id = snippet_boost.snippet_id
+ WHERE
+ snippet_content MATCH ?
+ ORDER BY
+ score DESC
+ LIMIT ?`;
+ class SnippetIndex {
+ constructor(database) {
+ this.database = database;
+ _SnippetIndex_insertSnippet.set(this, void 0);
+ _SnippetIndex_updateSnippetBoost.set(this, void 0);
+ _SnippetIndex_searchSnippet.set(this, void 0);
+ this.database.exec(CREATE_SNIPPET_CONTENT_TABLE_SQL);
+ this.database.exec(CREATE_SNIPPET_BOOST_TABLE_SQL);
+ this.database.pragma('journal_mode = OFF');
+ this.database.pragma('synchronous = OFF');
+ __classPrivateFieldSet(this, _SnippetIndex_insertSnippet, this.database.prepare(INSERT_SNIPPET_SQL), "f");
+ __classPrivateFieldSet(this, _SnippetIndex_updateSnippetBoost, this.database.prepare(UPDATE_SNIPPET_BOOST_SQL), "f");
+ __classPrivateFieldSet(this, _SnippetIndex_searchSnippet, this.database.prepare(SEARCH_SNIPPET_SQL), "f");
+ }
+ indexSnippet(snippetId, directory, filePath, startLine, endLine, symbols, words, content) {
+ __classPrivateFieldGet(this, _SnippetIndex_insertSnippet, "f").run(snippetId, directory, filePath, startLine, endLine, symbols, words, content);
+ }
+ boostSnippet(snippetId, boostFactor) {
+ __classPrivateFieldGet(this, _SnippetIndex_updateSnippetBoost, "f").run(snippetId, boostFactor);
+ }
+ searchSnippets(query, limit = 10) {
+ const rows = __classPrivateFieldGet(this, _SnippetIndex_searchSnippet, "f").all(query, limit);
+ return rows.map((row) => ({
+ directory: row.directory,
+ snippetId: row.snippet_id,
+ filePath: row.file_path,
+ startLine: row.start_line,
+ endLine: row.end_line,
+ score: row.score,
+ content: row.content,
+ }));
+ }
+ close() {
+ this.database.close();
+ }
+ }
+ _SnippetIndex_insertSnippet = new WeakMap(), _SnippetIndex_updateSnippetBoost = new WeakMap(), _SnippetIndex_searchSnippet = new WeakMap();
+ exports.default = SnippetIndex;
package/built/split-camelized.d.ts ADDED
@@ -0,0 +1,9 @@
+ export declare const LOG_CAMELIZED_TO_RAW: boolean;
+ export declare const CAMELIZED_TO_RAW: Map<string, string>;
+ /**
+ * Split a camelized word into a new word that is separated by a given separator.
+ */
+ export declare function splitCamelized(text: string, { separator, preserveConsecutiveUppercase }?: {
+ separator?: string | undefined;
+ preserveConsecutiveUppercase?: boolean | undefined;
+ }): string;
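
splitCamelized is an internal helper (not re-exported from the index); the sketch below assumes a deep import of the built module. The expected outputs follow the examples in the implementation's own comments:

import { splitCamelized } from '@appland/search/built/split-camelized';

splitCamelized('dataForUSACounties');                      // 'data for USA counties'
splitCamelized('dataForUSACounties', { separator: '_' }); // 'data_for_USA_counties'
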
package/built/split-camelized.js ADDED
@@ -0,0 +1,55 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.CAMELIZED_TO_RAW = exports.LOG_CAMELIZED_TO_RAW = void 0;
+ exports.splitCamelized = splitCamelized;
+ const console_1 = require("console");
+ exports.LOG_CAMELIZED_TO_RAW = process.env.APPMAP_LOG_CAMELIZED_TO_RAW === 'true';
+ exports.CAMELIZED_TO_RAW = new Map();
+ /**
+ * Split a camelized word into a new word that is separated by a given separator.
+ */
+ // Derived from https://raw.githubusercontent.com/sindresorhus/decamelize/main/index.js
+ // MIT License
+ // Copyright (c) Sindre Sorhus sindresorhus@gmail.com (https://sindresorhus.com)
+ // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ function splitCamelized(text, { separator = ' ', preserveConsecutiveUppercase = true } = {}) {
+ const handlePreserveConsecutiveUppercase = (decamelized, separator) => {
+ // Lowercase all single uppercase characters. As we
+ // want to preserve uppercase sequences, we cannot
+ // simply lowercase the separated string at the end.
+ // `data_For_USACounties` → `data_for_USACounties`
+ const result = decamelized.replace(/((?<![\p{Uppercase_Letter}\d])[\p{Uppercase_Letter}\d](?![\p{Uppercase_Letter}\d]))/gu, ($0) => $0.toLowerCase());
+ // Remaining uppercase sequences will be separated from lowercase sequences.
+ // `data_For_USACounties` → `data_for_USA_counties`
+ return result.replace(/(\p{Uppercase_Letter}+)(\p{Uppercase_Letter}\p{Lowercase_Letter}+)/gu, (_, $1, $2) => $1 + separator + $2.toLowerCase());
+ };
+ // Checking the second character is done later on. Therefore process shorter strings here.
+ if (text.length < 2) {
+ return preserveConsecutiveUppercase ? text : text.toLowerCase();
+ }
+ const replacement = `$1${separator}$2`;
+ // Split lowercase sequences followed by uppercase character.
+ // `dataForUSACounties` → `data_For_USACounties`
+ // `myURLstring → `my_URLstring`
+ const decamelized = text.replace(/([\p{Lowercase_Letter}\d])(\p{Uppercase_Letter})/gu, replacement);
+ let result;
+ if (preserveConsecutiveUppercase) {
+ result = handlePreserveConsecutiveUppercase(decamelized, separator);
+ }
+ else {
+ // Split multiple uppercase characters followed by one or more lowercase characters.
+ // `my_URLstring` → `my_ur_lstring`
+ result = decamelized
+ .replace(/(\p{Uppercase_Letter})(\p{Uppercase_Letter}\p{Lowercase_Letter}+)/gu, replacement)
+ .toLowerCase();
+ }
+ if (exports.LOG_CAMELIZED_TO_RAW) {
+ if (!exports.CAMELIZED_TO_RAW.has(result)) {
+ (0, console_1.log)(`[splitCamelized] ${text} → ${result}`);
+ exports.CAMELIZED_TO_RAW.set(result, text);
+ }
+ }
+ return result;
+ }
package/built/splitter.d.ts ADDED
@@ -0,0 +1,7 @@
+ export type Chunk = {
+ content: string;
+ startLine?: number;
+ endLine?: number;
+ };
+ export type Splitter = (content: string, fileExtension: string) => PromiseLike<Chunk[]>;
+ export declare function langchainSplitter(content: string, fileExtension: string): Promise<Chunk[]>;
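
A minimal sketch (not part of the package) of splitting a file into line-annotated chunks; the language-specific splitter is chosen by file extension, with a generic recursive character splitter as the fallback:

import { langchainSplitter, readFileSafe } from '@appland/search';

async function chunksOf(filePath: string) {
  const content = await readFileSafe(filePath);
  if (!content) return [];
  const extension = filePath.split('.').pop() ?? '';
  return langchainSplitter(content, extension); // [{ content, startLine?, endLine? }, ...]
}
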
package/built/splitter.js ADDED
@@ -0,0 +1,53 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.langchainSplitter = langchainSplitter;
+ const text_splitter_1 = require("langchain/text_splitter");
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:splitter');
+ const TEXT_SPLITTER_LANGUAGE_EXTENSIONS = {
+ cpp: ['cpp', 'h', 'hpp', 'c', 'cc', 'cxx', 'hxx'],
+ go: ['go'],
+ java: ['java', 'jsp', 'jspx'],
+ js: ['js', 'ts', 'mjs', 'jsx', 'tsx', 'vue', 'svelte'],
+ php: ['php'],
+ proto: ['proto'],
+ python: ['py'],
+ rst: ['rst'],
+ ruby: ['rb', 'haml', 'erb'],
+ rust: ['rs'],
+ scala: ['scala'],
+ swift: ['swift'],
+ markdown: ['md'],
+ latex: ['tex'],
+ html: ['html'],
+ sol: ['sol'],
+ };
+ async function langchainSplitter(content, fileExtension) {
+ const language = Object.keys(TEXT_SPLITTER_LANGUAGE_EXTENSIONS).find((language) => TEXT_SPLITTER_LANGUAGE_EXTENSIONS[language].includes(fileExtension));
+ let splitter;
+ if (language) {
+ splitter = text_splitter_1.RecursiveCharacterTextSplitter.fromLanguage(language);
+ }
+ else {
+ debug('No language found for extension: %s', fileExtension);
+ splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
+ }
+ const documents = await splitter.createDocuments([content]);
+ // metadata includes:
+ // { loc: { lines: { from: 1, to: 14 } } }
+ return documents.map((doc) => {
+ const loc = doc.metadata?.loc;
+ const lines = loc?.lines;
+ const result = {
+ content: doc.pageContent,
+ };
+ if (lines) {
+ result.startLine = lines.from;
+ result.endLine = lines.to;
+ }
+ return result;
+ });
+ }
package/built/tokenize.d.ts ADDED
@@ -0,0 +1,9 @@
+ export declare const SymbolRegexes: Record<string, RegExp>;
+ export declare function symbols(content: string, fileExtension: string, allowGeneric?: boolean): string[];
+ export declare function words(content: string): string[];
+ type FileTokens = {
+ symbols: string[];
+ words: string[];
+ };
+ export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): FileTokens;
+ export {};
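
A minimal sketch (not part of the package) of tokenizing a source string: symbols holds declared names (classes, functions, and so on), while words holds the remaining keywords with the symbol tokens filtered out:

import { fileTokens } from '@appland/search';

const source = 'export function parseConfig(path: string) { return path; }';
const { symbols, words } = fileTokens(source, 'ts');
console.log(symbols); // expected to include 'parseconfig' plus the split terms 'config' and 'parse'
console.log(words);   // remaining keywords, e.g. 'export', 'function', 'path', ...
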
package/built/tokenize.js ADDED
@@ -0,0 +1,86 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.SymbolRegexes = void 0;
+ exports.symbols = symbols;
+ exports.words = words;
+ exports.fileTokens = fileTokens;
+ const query_keywords_1 = __importDefault(require("./query-keywords"));
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:tokenize');
+ exports.SymbolRegexes = {
+ cs: /(((interface|class|enum|struct)\s+(?<symbol1>\w+))|((\s|^)(?!using|try|catch|if|while|do|for|switch)(?<!#define\s+?)(?<symbol2>[\w~$]+)\s*?\([^;)]*?\)[\w\s\d<>[\].:\n]*?{))/g,
+ cpp: /(((struct|enum|union|class)\s+(?<symbol1>\w+)\s*?\{)|(}\s*?(?<symbol2>\w+)\s*?;)|((\s|^)(?!try|catch|if|while|do|for|switch)(?<!#define\s+?)(?<symbol3>[\w~$]+)\s*?\([^;)]*?\)[\w\s\d<>[\].:\n]*?{))/g,
+ rs: /(struct|enum|union|trait|type|fn)\s+(?<symbol1>[\w\p{L}]+)/gu,
+ go: /((type\s+(?<symbol1>[\w\p{L}]+))|(func\s+?(\(.*?\)\s*?)?(?<symbol2>[\w\p{L}]+)\s*?\())/gu,
+ rb: /(((class|module)\s+(?<symbol1>\w+))|(def\s+?(?<symbol2>\w+)))/g,
+ py: /(class|def)\s+(?<symbol1>\w+)/g,
+ java: /(((class|@?interface|enum)\s+(?<symbol1>[\w$]+))|((\s|^)(?!try|catch|if|while|do|for|switch)(?<symbol2>[\w$]+)\s*?\([^;)]*?\)[\w\s\d<>[\].:\n]*?{))/g,
+ ts: /(((class|interface|enum|type|function)\s+(?<symbol1>[#$\w\p{L}]+))|((\s|^)(?!using|try|catch|if|while|do|for|switch)(?<symbol2>[#$\w\p{L}]+)\s*?\([^;)]*?\)[\w\s<>[\].:\n]*?\{)|((?<symbol3>[#$\w\p{L}]+)\s*?(=|:)\s*?\(.*?\)\s*?=>))/gu,
+ kt: /(((class|typealias)\s+(?<symbol1>[\w_]+))|(fun\s+?(<.+?>\s+)?(.*?\.)?(?<symbol2>\w+)))/g,
+ php: /(class|trait|function)\s+(?<symbol1>[\w_$]+)/g,
+ };
+ const genericSymbolRegex = /(((^|\s)(?!using|try|catch|if|while|do|for|switch)(?<symbol1>[#$\w\p{L}~]+)\s*?\(([^;)])*?\)[\w\s<>[\].:\n]*?\{)|(^(?!.*?(?:#|\/\/|"|')).*?(interface|class|enum|struct|union|trait|type(alias|def)?|fu?nc?(tion)?|module|def)\s+?(?<symbol2>[#$\w\p{L}]+))|((?<symbol3>[#$\w\p{L}~]+)\s*?=\s*?[\w\s<>[\].:\n]*?\{))/gmu;
+ // Define aliases for common file extensions
+ ['js', 'jsx', 'ts', 'tsx', 'vue', 'svelte'].forEach((ext) => {
+ exports.SymbolRegexes[ext] = exports.SymbolRegexes.ts;
+ });
+ ['c', 'cc', 'cxx', 'h', 'hpp', 'cpp', 'hxx'].forEach((ext) => {
+ exports.SymbolRegexes[ext] = exports.SymbolRegexes.cpp;
+ });
+ function getMatches(source, regex) {
+ const results = [];
+ const matches = source.matchAll(regex);
+ for (const match of matches) {
+ const { groups } = match;
+ const symbol = groups?.symbol1 ?? groups?.symbol2 ?? groups?.symbol3;
+ if (symbol)
+ results.push(symbol);
+ }
+ return results;
+ }
+ function symbols(content, fileExtension, allowGeneric = true) {
+ let regex = allowGeneric ? genericSymbolRegex : undefined;
+ if (fileExtension && fileExtension in exports.SymbolRegexes) {
+ regex = exports.SymbolRegexes[fileExtension];
+ }
+ if (regex) {
+ return getMatches(content, regex);
+ }
+ return [];
+ }
+ function words(content) {
+ return content.match(/\b\w+\b/g) ?? [];
+ }
+ function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
+ if (enableGenericSymbolParsing)
+ debug('Using generic symbol parsing for file extension: %s', fileExtension);
+ const symbolList = (0, query_keywords_1.default)(symbols(content, fileExtension, enableGenericSymbolParsing)).sort();
+ const wordList = (0, query_keywords_1.default)(words(content)).sort();
+ // Iterate through words, with a corresponding pointer to symbols.
+ // If the word at the word index does not match the symbol at the symbol index,
+ // add the word to the output. Otherwise, advance both pointers. Repeat
+ // until all words have been traversed.
+ const filteredWordList = new Array();
+ let symbolIndex = 0;
+ let wordIndex = 0;
+ const collectWord = () => {
+ const word = wordList[wordIndex];
+ const symbol = symbolList[symbolIndex];
+ if (word === symbol) {
+ symbolIndex += 1;
+ }
+ else {
+ filteredWordList.push(word);
+ }
+ wordIndex += 1;
+ };
+ while (wordIndex < wordList.length)
+ collectWord();
+ return {
+ symbols: symbolList,
+ words: filteredWordList,
+ };
+ }
package/package.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "name": "@appland/search",
+ "version": "1.0.0",
+ "description": "",
+ "bin": "built/cli.js",
+ "publishConfig": {
+ "access": "public"
+ },
+ "main": "built/index.js",
+ "types": "built/index.d.ts",
+ "files": [
+ "built"
+ ],
+ "scripts": {
+ "lint": "eslint",
+ "lint:fix": "eslint --fix",
+ "test": "jest",
+ "build": "tsc",
+ "watch": "tsc --watch"
+ },
+ "author": "AppLand, Inc",
+ "license": "Commons Clause + MIT",
+ "devDependencies": {
+ "@types/better-sqlite3": "^7.6.11",
+ "@types/jest": "^29.5.4",
+ "@types/node": "^16",
+ "eslint": "^9",
+ "eslint-config-prettier": "^9",
+ "eslint-plugin-eslint-comments": "^3.2.0",
+ "eslint-plugin-import": "^2.31.0",
+ "eslint-plugin-jest": "^28.8.3",
+ "eslint-plugin-prettier": "^5.2.1",
+ "eslint-plugin-promise": "^7.1.0",
+ "jest": "^29.7.0",
+ "prettier": "^3.3.3",
+ "ts-jest": "^29.2.5",
+ "tsc": "^2.0.4",
+ "typescript": "^5",
+ "typescript-eslint": "^8.11.0"
+ },
+ "dependencies": {
+ "better-sqlite3": "^11.5.0",
+ "yargs": "^17.7.2"
+ }
+ }