@appland/search 1.0.0

package/CHANGELOG.md ADDED
@@ -0,0 +1,6 @@
+ # @appland/search-v1.0.0 (2024-11-06)
+
+
+ ### Features
+
+ * @appland/search package ([dbf7d9a](https://github.com/getappmap/appmap-js/commit/dbf7d9a32593e19df9a8732f18e32227dcb53aca))
package/built/build-file-index.d.ts ADDED
@@ -0,0 +1,9 @@
+ import FileIndex from './file-index';
+ import { ContentReader } from './ioutil';
+ export type ListFn = (path: string) => Promise<string[]>;
+ export type FilterFn = (path: string) => PromiseLike<boolean>;
+ export type Tokenizer = (content: string, fileExtension: string) => {
+ symbols: string[];
+ words: string[];
+ };
+ export default function buildFileIndex(fileIndex: FileIndex, directories: string[], listDirectory: ListFn, fileFilter: FilterFn, contentReader: ContentReader, tokenizer: Tokenizer): Promise<void>;
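
A minimal sketch (not part of the package) of implementing the three callback types declared above; the helper names here are illustrative:

// Illustrative implementations of ListFn, FilterFn, and Tokenizer.
import { readdir } from 'fs/promises';
import type { ListFn, FilterFn, Tokenizer } from '@appland/search';

// List the entries of a single directory level.
const listDirectory: ListFn = (path) => readdir(path);

// Skip anything under node_modules.
const fileFilter: FilterFn = (path) => Promise.resolve(!path.includes('node_modules'));

// A trivial tokenizer: no symbol extraction, plain word matching.
const tokenizer: Tokenizer = (content, _fileExtension) => ({
  symbols: [],
  words: content.match(/\b\w+\b/g) ?? [],
});

In practice the package's own listProjectFiles, readFileSafe, and fileTokens (exported from the index) fill these roles, which is how built/cli.js wires them up.
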
package/built/build-file-index.js ADDED
@@ -0,0 +1,47 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = buildFileIndex;
+ const debug_1 = __importDefault(require("debug"));
+ const path_1 = require("path");
+ const console_1 = require("console");
+ const debug = (0, debug_1.default)('appmap:search:build-index');
+ async function indexFile(context, filePath) {
+ const fileContents = await context.contentReader(filePath);
+ if (!fileContents)
+ return;
+ const tokens = context.tokenizer(fileContents, filePath);
+ const symbols = tokens.symbols.join(' ');
+ const words = tokens.words.join(' ');
+ context.fileIndex.indexFile(context.baseDirectory, filePath, symbols, words);
+ }
+ async function indexDirectory(context, directory) {
+ const dirContents = await context.listDirectory(directory);
+ if (!dirContents)
+ return;
+ for (const dirContentItem of dirContents) {
+ const filePath = (0, path_1.join)(directory, dirContentItem);
+ debug('Indexing: %s', filePath);
+ if (await context.fileFilter(filePath)) {
+ indexFile(context, filePath).catch((e) => {
+ (0, console_1.warn)(`Error indexing file: ${filePath}`);
+ (0, console_1.warn)(e);
+ });
+ }
+ }
+ }
+ async function buildFileIndex(fileIndex, directories, listDirectory, fileFilter, contentReader, tokenizer) {
+ for (const directory of directories) {
+ const context = {
+ fileIndex,
+ baseDirectory: directory,
+ listDirectory,
+ fileFilter,
+ contentReader,
+ tokenizer,
+ };
+ await indexDirectory(context, directory);
+ }
+ }
package/built/build-snippet-index.d.ts ADDED
@@ -0,0 +1,9 @@
+ import { Tokenizer } from './build-file-index';
+ import { ContentReader } from './ioutil';
+ import SnippetIndex from './snippet-index';
+ import { Splitter } from './splitter';
+ export type File = {
+ directory: string;
+ filePath: string;
+ };
+ export default function buildSnippetIndex(snippetIndex: SnippetIndex, files: File[], contentReader: ContentReader, splitter: Splitter, tokenizer: Tokenizer): Promise<void>;
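
The File[] argument has the same shape as FileIndex search results, so file-search output can be fed directly to the snippet indexer (as built/cli.js does). A minimal sketch, assuming the package is consumed as @appland/search:

import Database from 'better-sqlite3';
import { SnippetIndex, buildSnippetIndex, langchainSplitter, readFileSafe, fileTokens } from '@appland/search';
import type { FileSearchResult } from '@appland/search';

async function indexSnippets(fileSearchResults: FileSearchResult[]): Promise<SnippetIndex> {
  // Chunk each file with the langchain splitter and tokenize each chunk.
  const snippetIndex = new SnippetIndex(new Database(':memory:'));
  await buildSnippetIndex(snippetIndex, fileSearchResults, readFileSafe, langchainSplitter, fileTokens);
  return snippetIndex;
}
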
package/built/build-snippet-index.js ADDED
@@ -0,0 +1,26 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = buildSnippetIndex;
+ async function indexFile(context, file) {
+ const fileContent = await context.contentReader(file.filePath);
+ if (!fileContent)
+ return;
+ const extension = file.filePath.split('.').pop() || '';
+ const chunks = await context.splitter(fileContent, extension);
+ chunks.forEach((chunk, index) => {
+ const snippetId = `${file.filePath}:${index}`;
+ const { content, startLine, endLine } = chunk;
+ context.snippetIndex.indexSnippet(snippetId, file.directory, file.filePath, startLine, endLine, context.tokenizer(content, file.filePath).symbols.join(' '), context.tokenizer(content, file.filePath).words.join(' '), content);
+ });
+ }
+ async function buildSnippetIndex(snippetIndex, files, contentReader, splitter, tokenizer) {
+ const context = {
+ snippetIndex,
+ contentReader,
+ splitter,
+ tokenizer,
+ };
+ for (const file of files) {
+ await indexFile(context, file);
+ }
+ }
package/built/cli.d.ts ADDED
@@ -0,0 +1 @@
+ export {};
package/built/cli.js ADDED
@@ -0,0 +1,105 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const yargs_1 = __importDefault(require("yargs"));
+ const helpers_1 = require("yargs/helpers");
+ const better_sqlite3_1 = __importDefault(require("better-sqlite3"));
+ const debug_1 = __importDefault(require("debug"));
+ const tokenize_1 = require("./tokenize");
+ const file_index_1 = __importDefault(require("./file-index"));
+ const build_file_index_1 = __importDefault(require("./build-file-index"));
+ const project_files_1 = __importDefault(require("./project-files"));
+ const file_type_1 = require("./file-type");
+ const snippet_index_1 = __importDefault(require("./snippet-index"));
+ const build_snippet_index_1 = __importDefault(require("./build-snippet-index"));
+ const ioutil_1 = require("./ioutil");
+ const splitter_1 = require("./splitter");
+ const assert_1 = __importDefault(require("assert"));
+ const debug = (0, debug_1.default)('appmap:search:cli');
+ const cli = (0, yargs_1.default)((0, helpers_1.hideBin)(process.argv))
+ .command('* <query>', 'Index directories and perform a search', (yargs) => {
+ return yargs
+ .option('directories', {
+ alias: 'd',
+ type: 'array',
+ description: 'List of directories to index',
+ default: ['.'],
+ })
+ .option('file-filter', {
+ type: 'string',
+ description: 'Regex pattern to filter files',
+ })
+ .positional('query', {
+ describe: 'Search query',
+ type: 'string',
+ })
+ .strict();
+ }, async (argv) => {
+ const { directories, query } = argv;
+ let filterRE;
+ if (argv.fileFilter)
+ filterRE = new RegExp(argv.fileFilter);
+ const fileFilter = async (path) => {
+ debug('Filtering: %s', path);
+ if ((0, file_type_1.isBinaryFile)(path)) {
+ debug('Skipping binary file: %s', path);
+ return false;
+ }
+ const isData = (0, file_type_1.isDataFile)(path);
+ if (isData && (await (0, file_type_1.isLargeFile)(path))) {
+ debug('Skipping large data file: %s', path);
+ return false;
+ }
+ if (!filterRE)
+ return true;
+ return !filterRE.test(path);
+ };
+ const db = new better_sqlite3_1.default(':memory:');
+ const fileIndex = new file_index_1.default(db);
+ await (0, build_file_index_1.default)(fileIndex, directories, project_files_1.default, fileFilter, ioutil_1.readFileSafe, tokenize_1.fileTokens);
+ const filePathAtMostThreeEntries = (filePath) => {
+ const parts = filePath.split('/');
+ if (parts.length <= 3)
+ return filePath;
+ return `.../${parts.slice(-3).join('/')}`;
+ };
+ const printResult = (filePath, score) => console.log('%s %s', filePathAtMostThreeEntries(filePath), score.toPrecision(3));
+ console.log('File search results');
+ console.log('-------------------');
+ const fileSearchResults = fileIndex.search(query);
+ for (const result of fileSearchResults) {
+ const { filePath, score } = result;
+ printResult(filePath, score);
+ }
+ const splitter = splitter_1.langchainSplitter;
+ const snippetIndex = new snippet_index_1.default(db);
+ await (0, build_snippet_index_1.default)(snippetIndex, fileSearchResults, ioutil_1.readFileSafe, splitter, tokenize_1.fileTokens);
+ console.log('');
+ console.log('Snippet search results');
+ console.log('----------------------');
+ const isNullOrUndefined = (value) => value === null || value === undefined;
+ const snippetSearchResults = snippetIndex.searchSnippets(query);
+ for (const result of snippetSearchResults) {
+ const { snippetId, filePath, startLine, endLine, score } = result;
+ printResult(snippetId, score);
+ if (isNullOrUndefined(startLine) || isNullOrUndefined(endLine))
+ continue;
+ const content = await (0, ioutil_1.readFileSafe)(filePath);
+ if (!content)
+ continue;
+ (0, assert_1.default)(startLine !== undefined);
+ (0, assert_1.default)(endLine !== undefined);
+ const lines = content.split('\n').slice(startLine - 1, endLine);
+ console.log(lines.map((l) => ` > ${l}`).join('\n'));
+ }
+ db.close();
+ })
+ .help().argv;
+ if (cli instanceof Promise) {
+ cli.catch((e) => {
+ console.error(e);
+ process.exit(1);
+ });
+ }
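
For reference, a local invocation of this CLI would look something like: node built/cli.js "file tokenizer" -d src lib --file-filter "\.spec\." — it indexes the given directories into an in-memory database, prints file search results, then splits the top-ranked files into snippets and prints snippet results. Note that paths matching --file-filter are excluded, since the filter callback returns false on a match.
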
package/built/file-index.d.ts ADDED
@@ -0,0 +1,29 @@
+ import sqlite3 from 'better-sqlite3';
+ export type FileSearchResult = {
+ directory: string;
+ filePath: string;
+ score: number;
+ };
+ /**
+ * The FileIndex class provides an interface to interact with the SQLite search index.
+ *
+ * The primary responsibilities of this class include:
+ * 1. Indexing files by storing their directory paths, file paths, symbols (e.g., class names, method names), and
+ * general words in the database. Symbols are given more weight in the search results.
+ * 2. Boosting the relevance score of specific files based on external factors, such as AppMap trace data or error logs.
+ * 3. Performing search queries on the indexed files using full-text search with BM25 ranking. The search results are
+ * influenced by both the indexed content and any associated boost factors.
+ *
+ * The class uses two SQLite tables:
+ * - `file_content`: A virtual table that holds the file content and allows for full-text search using BM25 ranking.
+ * - `file_boost`: A table that stores boost factors for specific files to enhance their search relevance.
+ */
+ export default class FileIndex {
+ #private;
+ database: sqlite3.Database;
+ constructor(database: sqlite3.Database);
+ indexFile(directory: string, filePath: string, symbols: string, words: string): void;
+ boostFile(filePath: string, boostFactor: number): void;
+ search(query: string, limit?: number): FileSearchResult[];
+ close(): void;
+ }
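
A minimal sketch (not part of the package) of using FileIndex directly; the symbols and words arguments are space-separated token lists such as fileTokens produces:

import Database from 'better-sqlite3';
import { FileIndex } from '@appland/search';

const index = new FileIndex(new Database(':memory:'));
index.indexFile('.', 'src/user.ts', 'User findUser', 'user record lookup');
index.boostFile('src/user.ts', 2.0); // double this file's relevance
for (const { filePath, score } of index.search('user', 5)) console.log(filePath, score);
index.close();
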
package/built/file-index.js ADDED
@@ -0,0 +1,96 @@
+ "use strict";
+ var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
+ if (kind === "m") throw new TypeError("Private method is not writable");
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a setter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot write private member to an object whose class did not declare it");
+ return (kind === "a" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;
+ };
+ var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a getter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot read private member from an object whose class did not declare it");
+ return kind === "m" ? f : kind === "a" ? f.call(receiver) : f ? f.value : state.get(receiver);
+ };
+ var _FileIndex_insert, _FileIndex_updateBoost, _FileIndex_search;
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const CREATE_TABLE_SQL = `CREATE VIRTUAL TABLE file_content USING fts5(
+ directory UNINDEXED,
+ file_path,
+ file_symbols,
+ file_words,
+ tokenize = 'porter unicode61'
+ )`;
+ const CREATE_BOOST_TABLE_SQL = `CREATE TABLE file_boost (
+ file_path TEXT PRIMARY KEY,
+ boost_factor REAL
+ )`;
+ const INSERT_SQL = `INSERT INTO file_content (directory, file_path, file_symbols, file_words)
+ VALUES (?, ?, ?, ?)`;
+ const UPDATE_BOOST_SQL = `INSERT OR REPLACE INTO file_boost (file_path, boost_factor)
+ VALUES (?, ?)`;
+ const SEARCH_SQL = `SELECT
+ file_content.directory,
+ file_content.file_path,
+ (bm25(file_content, 1)*3.0 + bm25(file_content, 2)*2.0 + bm25(file_content, 3)*1.0)
+ * COALESCE(file_boost.boost_factor, 1.0) * -1
+ AS score
+ FROM
+ file_content
+ LEFT JOIN
+ file_boost
+ ON
+ file_content.file_path = file_boost.file_path
+ WHERE
+ file_content MATCH ?
+ ORDER BY
+ score DESC
+ LIMIT
+ ?
+ `;
+ /**
+ * The FileIndex class provides an interface to interact with the SQLite search index.
+ *
+ * The primary responsibilities of this class include:
+ * 1. Indexing files by storing their directory paths, file paths, symbols (e.g., class names, method names), and
+ * general words in the database. Symbols are given more weight in the search results.
+ * 2. Boosting the relevance score of specific files based on external factors, such as AppMap trace data or error logs.
+ * 3. Performing search queries on the indexed files using full-text search with BM25 ranking. The search results are
+ * influenced by both the indexed content and any associated boost factors.
+ *
+ * The class uses two SQLite tables:
+ * - `file_content`: A virtual table that holds the file content and allows for full-text search using BM25 ranking.
+ * - `file_boost`: A table that stores boost factors for specific files to enhance their search relevance.
+ */
+ class FileIndex {
+ constructor(database) {
+ this.database = database;
+ _FileIndex_insert.set(this, void 0);
+ _FileIndex_updateBoost.set(this, void 0);
+ _FileIndex_search.set(this, void 0);
+ this.database.exec(CREATE_TABLE_SQL);
+ this.database.exec(CREATE_BOOST_TABLE_SQL);
+ this.database.pragma('journal_mode = OFF');
+ this.database.pragma('synchronous = OFF');
+ __classPrivateFieldSet(this, _FileIndex_insert, this.database.prepare(INSERT_SQL), "f");
+ __classPrivateFieldSet(this, _FileIndex_updateBoost, this.database.prepare(UPDATE_BOOST_SQL), "f");
+ __classPrivateFieldSet(this, _FileIndex_search, this.database.prepare(SEARCH_SQL), "f");
+ }
+ indexFile(directory, filePath, symbols, words) {
+ __classPrivateFieldGet(this, _FileIndex_insert, "f").run(directory, filePath, symbols, words);
+ }
+ boostFile(filePath, boostFactor) {
+ __classPrivateFieldGet(this, _FileIndex_updateBoost, "f").run(filePath, boostFactor);
+ }
+ search(query, limit = 10) {
+ const rows = __classPrivateFieldGet(this, _FileIndex_search, "f").all(query, limit);
+ return rows.map((row) => ({
+ directory: row.directory,
+ filePath: row.file_path,
+ score: row.score,
+ }));
+ }
+ close() {
+ this.database.close();
+ }
+ }
+ _FileIndex_insert = new WeakMap(), _FileIndex_updateBoost = new WeakMap(), _FileIndex_search = new WeakMap();
+ exports.default = FileIndex;
package/built/file-type.d.ts ADDED
@@ -0,0 +1,3 @@
+ export declare const isLargeFile: (fileName: string) => Promise<boolean>;
+ export declare const isBinaryFile: (fileName: string) => boolean;
+ export declare const isDataFile: (fileName: string) => boolean;
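
These predicates combine into the filtering rule built/cli.js uses: skip binary files outright, and skip data files only when they exceed the large-file threshold (50,000 bytes by default, overridable via APPMAP_LARGE_FILE). A sketch:

import { isBinaryFile, isDataFile, isLargeFile } from '@appland/search';

async function shouldIndex(path: string): Promise<boolean> {
  if (isBinaryFile(path)) return false;
  if (isDataFile(path) && (await isLargeFile(path))) return false;
  return true;
}
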
package/built/file-type.js ADDED
@@ -0,0 +1,117 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.isDataFile = exports.isBinaryFile = exports.isLargeFile = void 0;
+ const promises_1 = require("fs/promises");
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:file-type');
+ const BINARY_FILE_EXTENSIONS = [
+ '7z',
+ 'aac',
+ 'avi',
+ 'bmp',
+ 'bz2',
+ 'class',
+ 'dll',
+ 'doc',
+ 'docx',
+ 'dylib',
+ 'ear',
+ 'exe',
+ 'eot',
+ 'flac',
+ 'flv',
+ 'gif',
+ 'gz',
+ 'ico',
+ 'jar',
+ 'jpeg',
+ 'jpg',
+ 'js.map',
+ 'min.js',
+ 'min.css',
+ 'mkv',
+ 'mo',
+ 'mov',
+ 'mp3',
+ 'mp4',
+ 'mpg',
+ 'odt',
+ 'odp',
+ 'ods',
+ 'ogg',
+ 'otf',
+ 'pdf',
+ 'po',
+ 'png',
+ 'ppt',
+ 'pptx',
+ 'pyc',
+ 'rar',
+ 'rtf',
+ 'so',
+ 'svg',
+ 'tar',
+ 'tiff',
+ 'ttf',
+ 'wav',
+ 'webm',
+ 'webp',
+ 'woff',
+ 'woff2',
+ 'wmv',
+ 'xls',
+ 'xlsx',
+ 'xz',
+ 'yarn.lock',
+ 'zip',
+ ].map((ext) => '.' + ext);
+ const DATA_FILE_EXTENSIONS = [
+ 'cjs',
+ 'csv',
+ 'dat',
+ 'log',
+ 'json',
+ 'tsv',
+ 'yaml',
+ 'yml',
+ 'xml',
+ ].map((ext) => '.' + ext);
+ const DEFAULT_LARGE_FILE_THRESHOLD = 50_000;
+ const largeFileThreshold = () => {
+ const value = process.env.APPMAP_LARGE_FILE;
+ if (value === undefined)
+ return DEFAULT_LARGE_FILE_THRESHOLD;
+ return parseInt(value);
+ };
+ const statFileSafe = async (filePath) => {
+ try {
+ const stats = await (0, promises_1.stat)(filePath);
+ return stats.size;
+ }
+ catch (error) {
+ debug(`Error reading file: %s`, filePath);
+ debug(error);
+ return undefined;
+ }
+ };
+ const isLargeFile = async (fileName) => {
+ const fileSize = await statFileSafe(fileName);
+ if (fileSize === undefined)
+ return false;
+ const isLarge = fileSize > largeFileThreshold();
+ if (isLarge)
+ debug('File %s is considered large due to size %d', fileName, fileSize);
+ return fileSize > largeFileThreshold();
+ };
+ exports.isLargeFile = isLargeFile;
+ const isBinaryFile = (fileName) => {
+ return BINARY_FILE_EXTENSIONS.some((ext) => fileName.endsWith(ext));
+ };
+ exports.isBinaryFile = isBinaryFile;
+ const isDataFile = (fileName) => {
+ return DATA_FILE_EXTENSIONS.some((ext) => fileName.endsWith(ext));
+ };
+ exports.isDataFile = isDataFile;
package/built/git.d.ts ADDED
@@ -0,0 +1,19 @@
+ import { PathLike } from 'fs';
+ export declare enum GitState {
+ NotInstalled = 0,// The git cli was not found.
+ NoRepository = 1,// Git is installed but no repository was found.
+ Ok = 2
+ }
+ export declare const GitRepositoryEnvKeys: readonly ["GITHUB_REPOSITORY", "CIRCLE_REPOSITORY_URL", "GIT_URL", "CI_REPOSITORY_URL"];
+ export declare const GitBranchEnvKeys: readonly ["GITHUB_REF_NAME", "CIRCLE_BRANCH", "GIT_BRANCH", "TRAVIS_BRANCH", "CI_COMMIT_REF_NAME"];
+ export declare const GitCommitEnvKeys: readonly ["GITHUB_SHA", "CIRCLE_SHA1", "GIT_COMMIT", "TRAVIS_COMMIT", "CI_COMMIT_SHA"];
+ declare class GitProperties {
+ static contributors(sinceDaysAgo: number, cwd?: PathLike): Promise<Array<string>>;
+ static repository(cwd?: PathLike): Promise<string | undefined>;
+ static branch(cwd?: PathLike): Promise<string | undefined>;
+ static commit(cwd?: PathLike): Promise<string | undefined>;
+ static state(cwd?: PathLike): Promise<GitState>;
+ static clearCache(): void;
+ }
+ export declare const Git: typeof GitProperties;
+ export {};
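
Git is not re-exported from the package index, so the sketch below assumes a deep import of the built module (package.json declares no exports map that would forbid it). Results are cached per method and argument list until clearCache() is called:

import { Git, GitState } from '@appland/search/built/git';

async function describeRepo(dir: string) {
  if ((await Git.state(dir)) !== GitState.Ok) return undefined;
  return {
    repository: await Git.repository(dir),
    branch: await Git.branch(dir),
    commit: await Git.commit(dir),
    contributors: await Git.contributors(30, dir), // unique author emails from the last 30 days
  };
}
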
package/built/git.js ADDED
@@ -0,0 +1,156 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.Git = exports.GitCommitEnvKeys = exports.GitBranchEnvKeys = exports.GitRepositoryEnvKeys = exports.GitState = void 0;
+ const child_process_1 = require("child_process");
+ const util_1 = require("util");
+ const exec = (0, util_1.promisify)(child_process_1.exec);
+ var GitState;
+ (function (GitState) {
+ GitState[GitState["NotInstalled"] = 0] = "NotInstalled";
+ GitState[GitState["NoRepository"] = 1] = "NoRepository";
+ GitState[GitState["Ok"] = 2] = "Ok";
+ })(GitState || (exports.GitState = GitState = {}));
+ exports.GitRepositoryEnvKeys = [
+ 'GITHUB_REPOSITORY', // GitHub
+ 'CIRCLE_REPOSITORY_URL', // CircleCI
+ 'GIT_URL', // Jenkins
+ 'CI_REPOSITORY_URL', // GitLab
+ ];
+ exports.GitBranchEnvKeys = [
+ 'GITHUB_REF_NAME', // GitHub
+ 'CIRCLE_BRANCH', // CircleCI
+ 'GIT_BRANCH', // Jenkins
+ 'TRAVIS_BRANCH', // TravisCI
+ 'CI_COMMIT_REF_NAME', // GitLab
+ ];
+ exports.GitCommitEnvKeys = [
+ 'GITHUB_SHA', // GitHub
+ 'CIRCLE_SHA1', // CircleCI
+ 'GIT_COMMIT', // Jenkins
+ 'TRAVIS_COMMIT', // TravisCI
+ 'CI_COMMIT_SHA', // GitLab
+ ];
+ class GitProperties {
+ static async contributors(sinceDaysAgo, cwd) {
+ const unixTimeNow = Math.floor(Number(new Date()) / 1000);
+ const unixTimeAgo = unixTimeNow - sinceDaysAgo * 24 * 60 * 60;
+ try {
+ const { stdout } = await exec([
+ 'git',
+ cwd && `-C ${cwd.toString()}`,
+ '--no-pager',
+ 'log',
+ `--since=${unixTimeAgo}`,
+ '--format="%ae"',
+ ].join(' '));
+ return [
+ ...stdout
+ .trim()
+ .split('\n')
+ .reduce((acc, email) => {
+ acc.add(email);
+ return acc;
+ }, new Set()),
+ ];
+ }
+ catch {
+ return [];
+ }
+ }
+ // Returns the repository URL, first by checking the environment, then by
+ // shelling out to git.
+ static async repository(cwd) {
+ const envKey = exports.GitRepositoryEnvKeys.find((key) => process.env[key]);
+ if (envKey)
+ return process.env[envKey];
+ try {
+ const { stdout } = await exec(['git', cwd && `-C ${cwd.toString()}`, 'config', '--get', 'remote.origin.url'].join(' '));
+ return stdout.trim();
+ }
+ catch {
+ return undefined;
+ }
+ }
+ // Returns the branch, first by checking the environment, then by
+ // shelling out to git.
+ static async branch(cwd) {
+ const envKey = exports.GitBranchEnvKeys.find((key) => process.env[key]);
+ if (envKey)
+ return process.env[envKey];
+ try {
+ const { stdout } = await exec(['git', cwd && `-C ${cwd.toString()}`, 'rev-parse', '--abbrev-ref', 'HEAD'].join(' '));
+ return stdout.trim();
+ }
+ catch {
+ return undefined;
+ }
+ }
+ // Returns the commit SHA, first by checking the environment, then by
+ // shelling out to git.
+ static async commit(cwd) {
+ const envKey = exports.GitCommitEnvKeys.find((key) => process.env[key]);
+ if (envKey)
+ return process.env[envKey];
+ try {
+ const { stdout } = await exec(['git', cwd && `-C ${cwd.toString()}`, 'rev-parse', 'HEAD'].join(' '));
+ return stdout.trim();
+ }
+ catch {
+ return undefined;
+ }
+ }
+ static async state(cwd) {
+ return new Promise((resolve) => {
+ try {
+ const commandProcess = (0, child_process_1.spawn)('git', ['status', '--porcelain'], {
+ shell: true,
+ cwd: cwd?.toString(),
+ stdio: 'ignore',
+ timeout: 2000,
+ });
+ commandProcess.on('exit', (code) => {
+ switch (code) {
+ case 127:
+ return resolve(GitState.NotInstalled);
+ case 128:
+ return resolve(GitState.NoRepository);
+ default:
+ return resolve(GitState.Ok);
+ }
+ });
+ commandProcess.on('error', () => resolve(GitState.NotInstalled));
+ }
+ catch {
+ resolve(GitState.NotInstalled);
+ }
+ });
+ }
+ static clearCache() {
+ gitCache.clear();
+ }
+ }
+ const gitCache = new Map();
+ const noCacheList = ['clearCache'];
+ // GitProperties is available externally as Git.
+ // This export provides a simple caching layer around GitProperties to avoid
+ // excessive shelling out to git.
+ exports.Git = new Proxy(GitProperties, {
+ get(target, prop) {
+ if (!noCacheList.includes(prop.toString()) &&
+ typeof target[prop] === 'function') {
+ return new Proxy(target[prop], {
+ apply(target, thisArg, argArray) {
+ const cacheKey = `${prop.toString()}(${JSON.stringify(argArray)})`;
+ if (gitCache.has(cacheKey)) {
+ return gitCache.get(cacheKey);
+ }
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-function-type
+ const result = Reflect.apply(target, thisArg, argArray);
+ gitCache.set(cacheKey, result);
+ return result;
+ },
+ });
+ }
+ return Reflect.get(target, prop);
+ },
+ });
package/built/index.d.ts ADDED
@@ -0,0 +1,9 @@
+ export { ContentReader, readFileSafe } from './ioutil';
+ export { Splitter, langchainSplitter } from './splitter';
+ export { ListFn, FilterFn, Tokenizer, default as buildFileIndex } from './build-file-index';
+ export { File, default as buildSnippetIndex } from './build-snippet-index';
+ export { default as SnippetIndex, SnippetSearchResult } from './snippet-index';
+ export { default as FileIndex, FileSearchResult } from './file-index';
+ export { default as listProjectFiles } from './project-files';
+ export { isBinaryFile, isDataFile, isLargeFile } from './file-type';
+ export { fileTokens } from './tokenize';
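
A sketch of the public API end to end, mirroring built/cli.js: build a file index, feed its top results into a snippet index, then query the snippets. The import specifier assumes the package is installed as @appland/search:

import Database from 'better-sqlite3';
import {
  FileIndex,
  SnippetIndex,
  buildFileIndex,
  buildSnippetIndex,
  listProjectFiles,
  readFileSafe,
  fileTokens,
  langchainSplitter,
  isBinaryFile,
} from '@appland/search';

async function search(query: string, directories: string[]) {
  const db = new Database(':memory:');
  // Both indexes can share one SQLite database; they use separate tables.
  const fileIndex = new FileIndex(db);
  await buildFileIndex(fileIndex, directories, listProjectFiles, async (path) => !isBinaryFile(path), readFileSafe, fileTokens);
  const files = fileIndex.search(query);

  const snippetIndex = new SnippetIndex(db);
  await buildSnippetIndex(snippetIndex, files, readFileSafe, langchainSplitter, fileTokens);
  const snippets = snippetIndex.searchSnippets(query);
  db.close();
  return snippets;
}
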
package/built/index.js ADDED
@@ -0,0 +1,26 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.fileTokens = exports.isLargeFile = exports.isDataFile = exports.isBinaryFile = exports.listProjectFiles = exports.FileIndex = exports.SnippetIndex = exports.buildSnippetIndex = exports.buildFileIndex = exports.langchainSplitter = exports.readFileSafe = void 0;
+ var ioutil_1 = require("./ioutil");
+ Object.defineProperty(exports, "readFileSafe", { enumerable: true, get: function () { return ioutil_1.readFileSafe; } });
+ var splitter_1 = require("./splitter");
+ Object.defineProperty(exports, "langchainSplitter", { enumerable: true, get: function () { return splitter_1.langchainSplitter; } });
+ var build_file_index_1 = require("./build-file-index");
+ Object.defineProperty(exports, "buildFileIndex", { enumerable: true, get: function () { return __importDefault(build_file_index_1).default; } });
+ var build_snippet_index_1 = require("./build-snippet-index");
+ Object.defineProperty(exports, "buildSnippetIndex", { enumerable: true, get: function () { return __importDefault(build_snippet_index_1).default; } });
+ var snippet_index_1 = require("./snippet-index");
+ Object.defineProperty(exports, "SnippetIndex", { enumerable: true, get: function () { return __importDefault(snippet_index_1).default; } });
+ var file_index_1 = require("./file-index");
+ Object.defineProperty(exports, "FileIndex", { enumerable: true, get: function () { return __importDefault(file_index_1).default; } });
+ var project_files_1 = require("./project-files");
+ Object.defineProperty(exports, "listProjectFiles", { enumerable: true, get: function () { return __importDefault(project_files_1).default; } });
+ var file_type_1 = require("./file-type");
+ Object.defineProperty(exports, "isBinaryFile", { enumerable: true, get: function () { return file_type_1.isBinaryFile; } });
+ Object.defineProperty(exports, "isDataFile", { enumerable: true, get: function () { return file_type_1.isDataFile; } });
+ Object.defineProperty(exports, "isLargeFile", { enumerable: true, get: function () { return file_type_1.isLargeFile; } });
+ var tokenize_1 = require("./tokenize");
+ Object.defineProperty(exports, "fileTokens", { enumerable: true, get: function () { return tokenize_1.fileTokens; } });
package/built/ioutil.d.ts ADDED
@@ -0,0 +1,2 @@
+ export type ContentReader = (filePath: string) => PromiseLike<string | undefined>;
+ export declare function readFileSafe(filePath: string): PromiseLike<string | undefined>;
package/built/ioutil.js ADDED
@@ -0,0 +1,19 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.readFileSafe = readFileSafe;
+ const fs_1 = require("fs");
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:ioutil');
+ function readFileSafe(filePath) {
+ try {
+ return Promise.resolve((0, fs_1.readFileSync)(filePath, 'utf8'));
+ }
+ catch (error) {
+ debug(`Error reading file: %s`, filePath);
+ debug(error);
+ return Promise.resolve(undefined);
+ }
+ }
package/built/project-files.d.ts ADDED
@@ -0,0 +1,3 @@
+ export default function listProjectFiles(directory: string): Promise<string[]>;
+ export declare function listGitProjectFiles(directory: string): Promise<string[]>;
+ export declare function listLikelyProjectFiles(directory: string): Promise<string[]>;
package/built/project-files.js ADDED
@@ -0,0 +1,106 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = listProjectFiles;
+ exports.listGitProjectFiles = listGitProjectFiles;
+ exports.listLikelyProjectFiles = listLikelyProjectFiles;
+ const util_1 = require("util");
+ const debug_1 = __importDefault(require("debug"));
+ const node_child_process_1 = require("node:child_process");
+ const git_1 = require("./git");
+ const assert_1 = __importDefault(require("assert"));
+ const promises_1 = require("fs/promises");
+ const path_1 = require("path");
+ const debug = (0, debug_1.default)('appmap:search:project-files');
+ const exec = (0, util_1.promisify)(node_child_process_1.exec);
+ async function listProjectFiles(directory) {
+ const gitState = await git_1.Git.state(directory);
+ debug(`Git state: %s`, gitState);
+ return gitState === git_1.GitState.Ok
+ ? await listGitProjectFiles(directory)
+ : await listLikelyProjectFiles(directory);
+ }
+ // Run git ls-files and git status to get a list of all git-managed files. By doing it this way,
+ // we automatically apply any .gitignore rules.
+ async function listGitProjectFiles(directory) {
+ const lsFiles = async () => {
+ try {
+ const { stdout } = await exec('git ls-files', {
+ cwd: directory,
+ maxBuffer: 1024 ** 2 * 20, // 20 MB
+ });
+ debug(stdout);
+ return stdout.split('\n').filter(Boolean);
+ }
+ catch (e) {
+ debug('`git ls-files` failed: %s', e);
+ return [];
+ }
+ };
+ const statusFiles = async () => {
+ try {
+ const { stdout } = await exec('git status --porcelain', {
+ cwd: directory,
+ maxBuffer: 1024 ** 2 * 20, // 20 MB
+ });
+ debug(stdout);
+ return stdout
+ .split('\n')
+ .map((line) => {
+ // git status --porcelain output starts with 3 characters: staged status, unstaged status,
+ // and a space.
+ return line.slice(3);
+ })
+ .filter(Boolean);
+ }
+ catch (e) {
+ debug('`git status --porcelain` failed: %s', e);
+ return [];
+ }
+ };
+ return Array.from(new Set([...(await lsFiles()), ...(await statusFiles())]));
+ }
+ const IGNORE_DIRECTORIES = new Set([
+ '.git',
+ '.venv',
+ '.yarn',
+ 'node_modules',
+ 'vendor',
+ 'build',
+ 'built',
+ 'dist',
+ 'out',
+ 'target',
+ 'tmp',
+ 'venv',
+ ]);
+ // Produce a modest-sized listing of files in the project.
+ // Ignore a standard list of binary file extensions and directories that tend to be full of
+ // non-source files.
+ async function listLikelyProjectFiles(directory) {
+ const files = new Array();
+ const ignoreDirectory = (dir) => IGNORE_DIRECTORIES.has(dir);
+ // Perform a breadth-first traversal of a directory, collecting all non-binary files and
+ // applying the directory ignore list.
+ const processDir = async (dir) => {
+ const queue = [dir];
+ while (queue.length > 0) {
+ const currentDir = queue.shift();
+ (0, assert_1.default)(currentDir, 'queue should not be empty');
+ const entries = await (0, promises_1.readdir)(currentDir, { withFileTypes: true });
+ for (const entry of entries) {
+ const path = (0, path_1.join)(currentDir, entry.name);
+ if (entry.isDirectory()) {
+ if (!ignoreDirectory(entry.name))
+ queue.push(path);
+ }
+ else
+ files.push((0, path_1.relative)(dir, path));
+ }
+ }
+ };
+ await processDir(directory);
+ return files;
+ }
package/built/query-keywords.d.ts ADDED
@@ -0,0 +1,8 @@
+ /**
+ * Extract keywords from a string or an array of strings. The extraction process includes the following steps:
+ *
+ * - Remove non-alphanumeric characters and split the keyword on spaces.
+ * - Split camelized words.
+ * - Remove stop words.
+ */
+ export default function queryKeywords(words: undefined | string | string[]): string[];
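
queryKeywords is not re-exported from the package index, so the sketch below assumes a deep import of the built module. Camelized words are split and lowercased, adjacent pairs are also emitted as joined terms, and stop words are dropped:

import queryKeywords from '@appland/search/built/query-keywords';

// Expected to yield something like ['user', 'login', 'userlogin', 'error'].
console.log(queryKeywords('the userLogin error'));
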
package/built/query-keywords.js ADDED
@@ -0,0 +1,76 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.default = queryKeywords;
+ const split_camelized_1 = require("./split-camelized");
+ const STOP_WORDS = new Set([
+ 'a',
+ 'an',
+ 'and',
+ 'are',
+ 'as',
+ 'at',
+ 'be',
+ 'by',
+ 'code',
+ 'for',
+ 'from',
+ 'has',
+ 'he',
+ 'in',
+ 'is',
+ 'it',
+ 'its',
+ 'of',
+ 'on',
+ 'over',
+ 'that',
+ 'the',
+ 'to',
+ 'was',
+ 'were',
+ 'will',
+ 'with',
+ 'without',
+ ]);
+ /**
+ * Replace non-alphanumeric characters with spaces, then split the keyword on spaces.
+ * So in effect, words with non-alphanumeric characters become multiple words.
+ * Allow dash and underscore as delimiters.
+ */
+ const sanitizeKeyword = (keyword) => keyword.replace(/[^\p{L}\p{N}\-_]/gu, ' ').split(' ');
+ /**
+ * Extract keywords from a string or an array of strings. The extraction process includes the following steps:
+ *
+ * - Remove non-alphanumeric characters and split the keyword on spaces.
+ * - Split camelized words.
+ * - Remove stop words.
+ */
+ function queryKeywords(words) {
+ if (!words)
+ return [];
+ const wordsArray = Array.isArray(words) ? words : [words];
+ if (wordsArray.length === 0)
+ return [];
+ return wordsArray
+ .map((word) => sanitizeKeyword(word || ''))
+ .flat()
+ .filter(Boolean)
+ .map((word) => {
+ const camelized = (0, split_camelized_1.splitCamelized)(word)
+ .split(/[\s\-_]/)
+ .map((word) => word.toLowerCase());
+ // Return each of the component words, and also return each pair of adjacent words as a single word.
+ const result = new Array();
+ for (let i = 0; i < camelized.length; i++) {
+ result.push(camelized[i]);
+ if (i > 0)
+ result.push([camelized[i - 1] + camelized[i]].join(''));
+ }
+ return result;
+ })
+ .flat()
+ .map((str) => str.trim())
+ .filter(Boolean)
+ .filter((str) => str.length >= 2)
+ .filter((str) => !STOP_WORDS.has(str));
+ }
package/built/snippet-index.d.ts ADDED
@@ -0,0 +1,19 @@
+ import sqlite3 from 'better-sqlite3';
+ export type SnippetSearchResult = {
+ snippetId: string;
+ directory: string;
+ filePath: string;
+ startLine: number | undefined;
+ endLine: number | undefined;
+ score: number;
+ content: string;
+ };
+ export default class SnippetIndex {
+ #private;
+ database: sqlite3.Database;
+ constructor(database: sqlite3.Database);
+ indexSnippet(snippetId: string, directory: string, filePath: string, startLine: number | undefined, endLine: number | undefined, symbols: string, words: string, content: string): void;
+ boostSnippet(snippetId: string, boostFactor: number): void;
+ searchSnippets(query: string, limit?: number): SnippetSearchResult[];
+ close(): void;
+ }
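
A minimal sketch (not part of the package) of indexing and searching snippets directly; the snippet id format is up to the caller, and buildSnippetIndex uses `${filePath}:${chunkIndex}`:

import Database from 'better-sqlite3';
import { SnippetIndex } from '@appland/search';

const snippets = new SnippetIndex(new Database(':memory:'));
snippets.indexSnippet('src/auth.ts:0', '.', 'src/auth.ts', 1, 20, 'login authenticate', 'session token', 'function login() { /* ... */ }');
snippets.boostSnippet('src/auth.ts:0', 1.5); // raise this snippet's relevance
console.log(snippets.searchSnippets('login', 5));
snippets.close();
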
package/built/snippet-index.js ADDED
@@ -0,0 +1,94 @@
+ "use strict";
+ var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
+ if (kind === "m") throw new TypeError("Private method is not writable");
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a setter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot write private member to an object whose class did not declare it");
+ return (kind === "a" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;
+ };
+ var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
+ if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a getter");
+ if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot read private member from an object whose class did not declare it");
+ return kind === "m" ? f : kind === "a" ? f.call(receiver) : f ? f.value : state.get(receiver);
+ };
+ var _SnippetIndex_insertSnippet, _SnippetIndex_updateSnippetBoost, _SnippetIndex_searchSnippet;
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const CREATE_SNIPPET_CONTENT_TABLE_SQL = `CREATE VIRTUAL TABLE snippet_content USING fts5(
+ snippet_id UNINDEXED,
+ directory UNINDEXED,
+ file_path,
+ start_line UNINDEXED,
+ end_line UNINDEXED,
+ file_symbols,
+ file_words,
+ content UNINDEXED,
+ tokenize = 'porter unicode61'
+ )`;
+ const CREATE_SNIPPET_BOOST_TABLE_SQL = `CREATE TABLE snippet_boost (
+ snippet_id TEXT PRIMARY KEY,
+ boost_factor REAL
+ )`;
+ const INSERT_SNIPPET_SQL = `INSERT INTO snippet_content
+ (snippet_id, directory, file_path, start_line, end_line, file_symbols, file_words, content)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`;
+ const UPDATE_SNIPPET_BOOST_SQL = `INSERT OR REPLACE INTO snippet_boost
+ (snippet_id, boost_factor)
+ VALUES (?, ?)`;
+ const SEARCH_SNIPPET_SQL = `SELECT
+ snippet_content.directory,
+ snippet_content.file_path,
+ snippet_content.start_line,
+ snippet_content.end_line,
+ snippet_content.snippet_id,
+ snippet_content.content,
+ (bm25(snippet_content, 1)*3.0 + bm25(snippet_content, 2)*2.0 + bm25(snippet_content, 3)*1.0)
+ * COALESCE(snippet_boost.boost_factor, 1.0) * -1
+ AS score
+ FROM
+ snippet_content
+ LEFT JOIN
+ snippet_boost
+ ON
+ snippet_content.snippet_id = snippet_boost.snippet_id
+ WHERE
+ snippet_content MATCH ?
+ ORDER BY
+ score DESC
+ LIMIT ?`;
+ class SnippetIndex {
+ constructor(database) {
+ this.database = database;
+ _SnippetIndex_insertSnippet.set(this, void 0);
+ _SnippetIndex_updateSnippetBoost.set(this, void 0);
+ _SnippetIndex_searchSnippet.set(this, void 0);
+ this.database.exec(CREATE_SNIPPET_CONTENT_TABLE_SQL);
+ this.database.exec(CREATE_SNIPPET_BOOST_TABLE_SQL);
+ this.database.pragma('journal_mode = OFF');
+ this.database.pragma('synchronous = OFF');
+ __classPrivateFieldSet(this, _SnippetIndex_insertSnippet, this.database.prepare(INSERT_SNIPPET_SQL), "f");
+ __classPrivateFieldSet(this, _SnippetIndex_updateSnippetBoost, this.database.prepare(UPDATE_SNIPPET_BOOST_SQL), "f");
+ __classPrivateFieldSet(this, _SnippetIndex_searchSnippet, this.database.prepare(SEARCH_SNIPPET_SQL), "f");
+ }
+ indexSnippet(snippetId, directory, filePath, startLine, endLine, symbols, words, content) {
+ __classPrivateFieldGet(this, _SnippetIndex_insertSnippet, "f").run(snippetId, directory, filePath, startLine, endLine, symbols, words, content);
+ }
+ boostSnippet(snippetId, boostFactor) {
+ __classPrivateFieldGet(this, _SnippetIndex_updateSnippetBoost, "f").run(snippetId, boostFactor);
+ }
+ searchSnippets(query, limit = 10) {
+ const rows = __classPrivateFieldGet(this, _SnippetIndex_searchSnippet, "f").all(query, limit);
+ return rows.map((row) => ({
+ directory: row.directory,
+ snippetId: row.snippet_id,
+ filePath: row.file_path,
+ startLine: row.start_line,
+ endLine: row.end_line,
+ score: row.score,
+ content: row.content,
+ }));
+ }
+ close() {
+ this.database.close();
+ }
+ }
+ _SnippetIndex_insertSnippet = new WeakMap(), _SnippetIndex_updateSnippetBoost = new WeakMap(), _SnippetIndex_searchSnippet = new WeakMap();
+ exports.default = SnippetIndex;
package/built/split-camelized.d.ts ADDED
@@ -0,0 +1,9 @@
+ export declare const LOG_CAMELIZED_TO_RAW: boolean;
+ export declare const CAMELIZED_TO_RAW: Map<string, string>;
+ /**
+ * Split a camelized word into a new word that is separated by a given separator.
+ */
+ export declare function splitCamelized(text: string, { separator, preserveConsecutiveUppercase }?: {
+ separator?: string | undefined;
+ preserveConsecutiveUppercase?: boolean | undefined;
+ }): string;
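
splitCamelized is an internal helper (not re-exported from the index); the sketch below assumes a deep import of the built module. The expected outputs follow the examples in the implementation's own comments:

import { splitCamelized } from '@appland/search/built/split-camelized';

splitCamelized('dataForUSACounties');                      // 'data for USA counties'
splitCamelized('dataForUSACounties', { separator: '_' }); // 'data_for_USA_counties'
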
package/built/split-camelized.js ADDED
@@ -0,0 +1,55 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.CAMELIZED_TO_RAW = exports.LOG_CAMELIZED_TO_RAW = void 0;
+ exports.splitCamelized = splitCamelized;
+ const console_1 = require("console");
+ exports.LOG_CAMELIZED_TO_RAW = process.env.APPMAP_LOG_CAMELIZED_TO_RAW === 'true';
+ exports.CAMELIZED_TO_RAW = new Map();
+ /**
+ * Split a camelized word into a new word that is separated by a given separator.
+ */
+ // Derived from https://raw.githubusercontent.com/sindresorhus/decamelize/main/index.js
+ // MIT License
+ // Copyright (c) Sindre Sorhus sindresorhus@gmail.com (https://sindresorhus.com)
+ // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ function splitCamelized(text, { separator = ' ', preserveConsecutiveUppercase = true } = {}) {
+ const handlePreserveConsecutiveUppercase = (decamelized, separator) => {
+ // Lowercase all single uppercase characters. As we
+ // want to preserve uppercase sequences, we cannot
+ // simply lowercase the separated string at the end.
+ // `data_For_USACounties` → `data_for_USACounties`
+ const result = decamelized.replace(/((?<![\p{Uppercase_Letter}\d])[\p{Uppercase_Letter}\d](?![\p{Uppercase_Letter}\d]))/gu, ($0) => $0.toLowerCase());
+ // Remaining uppercase sequences will be separated from lowercase sequences.
+ // `data_For_USACounties` → `data_for_USA_counties`
+ return result.replace(/(\p{Uppercase_Letter}+)(\p{Uppercase_Letter}\p{Lowercase_Letter}+)/gu, (_, $1, $2) => $1 + separator + $2.toLowerCase());
+ };
+ // Checking the second character is done later on. Therefore process shorter strings here.
+ if (text.length < 2) {
+ return preserveConsecutiveUppercase ? text : text.toLowerCase();
+ }
+ const replacement = `$1${separator}$2`;
+ // Split lowercase sequences followed by uppercase character.
+ // `dataForUSACounties` → `data_For_USACounties`
+ // `myURLstring → `my_URLstring`
+ const decamelized = text.replace(/([\p{Lowercase_Letter}\d])(\p{Uppercase_Letter})/gu, replacement);
+ let result;
+ if (preserveConsecutiveUppercase) {
+ result = handlePreserveConsecutiveUppercase(decamelized, separator);
+ }
+ else {
+ // Split multiple uppercase characters followed by one or more lowercase characters.
+ // `my_URLstring` → `my_ur_lstring`
+ result = decamelized
+ .replace(/(\p{Uppercase_Letter})(\p{Uppercase_Letter}\p{Lowercase_Letter}+)/gu, replacement)
+ .toLowerCase();
+ }
+ if (exports.LOG_CAMELIZED_TO_RAW) {
+ if (!exports.CAMELIZED_TO_RAW.has(result)) {
+ (0, console_1.log)(`[splitCamelized] ${text} → ${result}`);
+ exports.CAMELIZED_TO_RAW.set(result, text);
+ }
+ }
+ return result;
+ }
package/built/splitter.d.ts ADDED
@@ -0,0 +1,7 @@
+ export type Chunk = {
+ content: string;
+ startLine?: number;
+ endLine?: number;
+ };
+ export type Splitter = (content: string, fileExtension: string) => PromiseLike<Chunk[]>;
+ export declare function langchainSplitter(content: string, fileExtension: string): Promise<Chunk[]>;
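
A minimal sketch (not part of the package) of splitting a file into line-annotated chunks; the language-specific splitter is chosen by file extension, with a generic recursive character splitter as the fallback:

import { langchainSplitter, readFileSafe } from '@appland/search';

async function chunksOf(filePath: string) {
  const content = await readFileSafe(filePath);
  if (!content) return [];
  const extension = filePath.split('.').pop() ?? '';
  return langchainSplitter(content, extension); // [{ content, startLine?, endLine? }, ...]
}
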
package/built/splitter.js ADDED
@@ -0,0 +1,53 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.langchainSplitter = langchainSplitter;
+ const text_splitter_1 = require("langchain/text_splitter");
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:splitter');
+ const TEXT_SPLITTER_LANGUAGE_EXTENSIONS = {
+ cpp: ['cpp', 'h', 'hpp', 'c', 'cc', 'cxx', 'hxx'],
+ go: ['go'],
+ java: ['java', 'jsp', 'jspx'],
+ js: ['js', 'ts', 'mjs', 'jsx', 'tsx', 'vue', 'svelte'],
+ php: ['php'],
+ proto: ['proto'],
+ python: ['py'],
+ rst: ['rst'],
+ ruby: ['rb', 'haml', 'erb'],
+ rust: ['rs'],
+ scala: ['scala'],
+ swift: ['swift'],
+ markdown: ['md'],
+ latex: ['tex'],
+ html: ['html'],
+ sol: ['sol'],
+ };
+ async function langchainSplitter(content, fileExtension) {
+ const language = Object.keys(TEXT_SPLITTER_LANGUAGE_EXTENSIONS).find((language) => TEXT_SPLITTER_LANGUAGE_EXTENSIONS[language].includes(fileExtension));
+ let splitter;
+ if (language) {
+ splitter = text_splitter_1.RecursiveCharacterTextSplitter.fromLanguage(language);
+ }
+ else {
+ debug('No language found for extension: %s', fileExtension);
+ splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
+ }
+ const documents = await splitter.createDocuments([content]);
+ // metadata includes:
+ // { loc: { lines: { from: 1, to: 14 } } }
+ return documents.map((doc) => {
+ const loc = doc.metadata?.loc;
+ const lines = loc?.lines;
+ const result = {
+ content: doc.pageContent,
+ };
+ if (lines) {
+ result.startLine = lines.from;
+ result.endLine = lines.to;
+ }
+ return result;
+ });
+ }
package/built/tokenize.d.ts ADDED
@@ -0,0 +1,9 @@
+ export declare const SymbolRegexes: Record<string, RegExp>;
+ export declare function symbols(content: string, fileExtension: string, allowGeneric?: boolean): string[];
+ export declare function words(content: string): string[];
+ type FileTokens = {
+ symbols: string[];
+ words: string[];
+ };
+ export declare function fileTokens(content: string, fileExtension: string, enableGenericSymbolParsing?: boolean): FileTokens;
+ export {};
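
A minimal sketch (not part of the package) of tokenizing a source string: symbols holds declared names (classes, functions, and so on), while words holds the remaining keywords with the symbol tokens filtered out:

import { fileTokens } from '@appland/search';

const source = 'export function parseConfig(path: string) { return path; }';
const { symbols, words } = fileTokens(source, 'ts');
console.log(symbols); // expected to include 'parseconfig' plus the split terms 'config' and 'parse'
console.log(words);   // remaining keywords, e.g. 'export', 'function', 'path', ...
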
package/built/tokenize.js ADDED
@@ -0,0 +1,86 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.SymbolRegexes = void 0;
+ exports.symbols = symbols;
+ exports.words = words;
+ exports.fileTokens = fileTokens;
+ const query_keywords_1 = __importDefault(require("./query-keywords"));
+ const debug_1 = __importDefault(require("debug"));
+ const debug = (0, debug_1.default)('appmap:search:tokenize');
+ exports.SymbolRegexes = {
+ cs: /(((interface|class|enum|struct)\s+(?<symbol1>\w+))|((\s|^)(?!using|try|catch|if|while|do|for|switch)(?<!#define\s+?)(?<symbol2>[\w~$]+)\s*?\([^;)]*?\)[\w\s\d<>[\].:\n]*?{))/g,
+ cpp: /(((struct|enum|union|class)\s+(?<symbol1>\w+)\s*?\{)|(}\s*?(?<symbol2>\w+)\s*?;)|((\s|^)(?!try|catch|if|while|do|for|switch)(?<!#define\s+?)(?<symbol3>[\w~$]+)\s*?\([^;)]*?\)[\w\s\d<>[\].:\n]*?{))/g,
+ rs: /(struct|enum|union|trait|type|fn)\s+(?<symbol1>[\w\p{L}]+)/gu,
+ go: /((type\s+(?<symbol1>[\w\p{L}]+))|(func\s+?(\(.*?\)\s*?)?(?<symbol2>[\w\p{L}]+)\s*?\())/gu,
+ rb: /(((class|module)\s+(?<symbol1>\w+))|(def\s+?(?<symbol2>\w+)))/g,
+ py: /(class|def)\s+(?<symbol1>\w+)/g,
+ java: /(((class|@?interface|enum)\s+(?<symbol1>[\w$]+))|((\s|^)(?!try|catch|if|while|do|for|switch)(?<symbol2>[\w$]+)\s*?\([^;)]*?\)[\w\s\d<>[\].:\n]*?{))/g,
+ ts: /(((class|interface|enum|type|function)\s+(?<symbol1>[#$\w\p{L}]+))|((\s|^)(?!using|try|catch|if|while|do|for|switch)(?<symbol2>[#$\w\p{L}]+)\s*?\([^;)]*?\)[\w\s<>[\].:\n]*?\{)|((?<symbol3>[#$\w\p{L}]+)\s*?(=|:)\s*?\(.*?\)\s*?=>))/gu,
+ kt: /(((class|typealias)\s+(?<symbol1>[\w_]+))|(fun\s+?(<.+?>\s+)?(.*?\.)?(?<symbol2>\w+)))/g,
+ php: /(class|trait|function)\s+(?<symbol1>[\w_$]+)/g,
+ };
+ const genericSymbolRegex = /(((^|\s)(?!using|try|catch|if|while|do|for|switch)(?<symbol1>[#$\w\p{L}~]+)\s*?\(([^;)])*?\)[\w\s<>[\].:\n]*?\{)|(^(?!.*?(?:#|\/\/|"|')).*?(interface|class|enum|struct|union|trait|type(alias|def)?|fu?nc?(tion)?|module|def)\s+?(?<symbol2>[#$\w\p{L}]+))|((?<symbol3>[#$\w\p{L}~]+)\s*?=\s*?[\w\s<>[\].:\n]*?\{))/gmu;
+ // Define aliases for common file extensions
+ ['js', 'jsx', 'ts', 'tsx', 'vue', 'svelte'].forEach((ext) => {
+ exports.SymbolRegexes[ext] = exports.SymbolRegexes.ts;
+ });
+ ['c', 'cc', 'cxx', 'h', 'hpp', 'cpp', 'hxx'].forEach((ext) => {
+ exports.SymbolRegexes[ext] = exports.SymbolRegexes.cpp;
+ });
+ function getMatches(source, regex) {
+ const results = [];
+ const matches = source.matchAll(regex);
+ for (const match of matches) {
+ const { groups } = match;
+ const symbol = groups?.symbol1 ?? groups?.symbol2 ?? groups?.symbol3;
+ if (symbol)
+ results.push(symbol);
+ }
+ return results;
+ }
+ function symbols(content, fileExtension, allowGeneric = true) {
+ let regex = allowGeneric ? genericSymbolRegex : undefined;
+ if (fileExtension && fileExtension in exports.SymbolRegexes) {
+ regex = exports.SymbolRegexes[fileExtension];
+ }
+ if (regex) {
+ return getMatches(content, regex);
+ }
+ return [];
+ }
+ function words(content) {
+ return content.match(/\b\w+\b/g) ?? [];
+ }
+ function fileTokens(content, fileExtension, enableGenericSymbolParsing = true) {
+ if (enableGenericSymbolParsing)
+ debug('Using generic symbol parsing for file extension: %s', fileExtension);
+ const symbolList = (0, query_keywords_1.default)(symbols(content, fileExtension, enableGenericSymbolParsing)).sort();
+ const wordList = (0, query_keywords_1.default)(words(content)).sort();
+ // Iterate through words, with a corresponding pointer to symbols.
+ // If the word at the word index does not match the symbol at the symbol index,
+ // add the word to the output. Otherwise, advance both pointers. Repeat
+ // until all words have been traversed.
+ const filteredWordList = new Array();
+ let symbolIndex = 0;
+ let wordIndex = 0;
+ const collectWord = () => {
+ const word = wordList[wordIndex];
+ const symbol = symbolList[symbolIndex];
+ if (word === symbol) {
+ symbolIndex += 1;
+ }
+ else {
+ filteredWordList.push(word);
+ }
+ wordIndex += 1;
+ };
+ while (wordIndex < wordList.length)
+ collectWord();
+ return {
+ symbols: symbolList,
+ words: filteredWordList,
+ };
+ }
package/package.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "name": "@appland/search",
+ "version": "1.0.0",
+ "description": "",
+ "bin": "built/cli.js",
+ "publishConfig": {
+ "access": "public"
+ },
+ "main": "built/index.js",
+ "types": "built/index.d.ts",
+ "files": [
+ "built"
+ ],
+ "scripts": {
+ "lint": "eslint",
+ "lint:fix": "eslint --fix",
+ "test": "jest",
+ "build": "tsc",
+ "watch": "tsc --watch"
+ },
+ "author": "AppLand, Inc",
+ "license": "Commons Clause + MIT",
+ "devDependencies": {
+ "@types/better-sqlite3": "^7.6.11",
+ "@types/jest": "^29.5.4",
+ "@types/node": "^16",
+ "eslint": "^9",
+ "eslint-config-prettier": "^9",
+ "eslint-plugin-eslint-comments": "^3.2.0",
+ "eslint-plugin-import": "^2.31.0",
+ "eslint-plugin-jest": "^28.8.3",
+ "eslint-plugin-prettier": "^5.2.1",
+ "eslint-plugin-promise": "^7.1.0",
+ "jest": "^29.7.0",
+ "prettier": "^3.3.3",
+ "ts-jest": "^29.2.5",
+ "tsc": "^2.0.4",
+ "typescript": "^5",
+ "typescript-eslint": "^8.11.0"
+ },
+ "dependencies": {
+ "better-sqlite3": "^11.5.0",
+ "yargs": "^17.7.2"
+ }
+ }