npm - @myst-theme/search - Versions diffs - 0.0.0 - Mend

@myst-theme/search 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,3 @@
+# @myst-theme/search
+An implementation and spec for client-side searching in MyST sites.

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+export * from './types.js';
+export * from './rank.js';
+export * from './search.js';
+//# sourceMappingURL=index.d.ts.map

package/dist/index.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,cAAc,WAAW,CAAC;AAC1B,cAAc,aAAa,CAAC"}

package/dist/index.js ADDED Viewed

@@ -0,0 +1,3 @@
+export * from './types.js';
+export * from './rank.js';
+export * from './search.js';

package/dist/rank.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+import type { SearchResult, RankedSearchResult, AttributeType } from './types.js';
+export declare const POSITIONAL_SEARCH_ATTRIBUTES: AttributeType[]; // attributes for which ranking also records the offset of the earliest match
+/**
+ * Attach a `ranking` to every raw search result and return the results sorted best-first.
+ *
+ * @param results - raw search results
+ * @returns a new array of ranked results; one entry per input result
+ */
+export declare function rankResults(results: SearchResult[]): RankedSearchResult[];
+//# sourceMappingURL=rank.d.ts.map

package/dist/rank.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"rank.d.ts","sourceRoot":"","sources":["../src/rank.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAS,YAAY,EAAE,kBAAkB,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAIzF,eAAO,MAAM,4BAA4B,EAAE,aAAa,EAAyB,CAAC;AAmSlF;;GAEG;AACH,wBAAgB,WAAW,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,kBAAkB,EAAE,CAEzE"}

package/dist/rank.js ADDED Viewed

@@ -0,0 +1,251 @@
+import { SEARCH_ATTRIBUTES_ORDERED } from './types.js';
+import { extractField, SPACE_OR_PUNCTUATION } from './search.js';
+export const POSITIONAL_SEARCH_ATTRIBUTES = ['content']; // attributes for which matchedAttributePosition computes a match offset
+/* Ranking weight per record type, looked up with `result.type` in rankSearchResult.
+ * Heading levels carry larger weights than body content, and
+ * cmpRankedSearchResults places larger `level` values first.
+ */
+const TYPE_WEIGHTS = new Map([
+    ['lvl1', 90],
+    ['lvl2', 80],
+    ['lvl3', 70],
+    ['lvl4', 60],
+    ['lvl5', 50],
+    ['lvl6', 40],
+    ['content', 0],
+]);
+/**
+ * Generic three-way `cmp` helper built on the `<` and `>` operators
+ *
+ * @param left - left value
+ * @param right - right value
+ * @returns -1 when `left < right`, +1 when `left > right`, otherwise 0
+ *          (0 is also the result when neither comparison holds, e.g. an operand is `undefined`)
+ */
+function cmp(left, right) {
+    if (left < right) {
+        return -1;
+    }
+    else if (left > right) {
+        return +1;
+    }
+    else {
+        return 0;
+    }
+}
+/**
+ * Build a case-insensitive RegExp that matches a single TOKEN bounded on each side by
+ * SPACE_OR_PUNCTUATION or by the start/end of the string.
+ *
+ * The boundary groups are non-capturing but still part of the match, so `match.index`
+ * points at the leading separator (when there is one) rather than at the token itself.
+ * The flags are those of SPACE_OR_PUNCTUATION with `i` appended.
+ *
+ * NOTE(review): `token` is interpolated into the pattern without escaping, so regex
+ * metacharacters inside a term alter the pattern or make the constructor throw.
+ * Confirm that callers only ever pass plain word tokens.
+ *
+ * @param token - term to match, e.g. `foo`
+ * @returns a new RegExp for the bounded token
+ */
+function buildRegExpToken(token) {
+    return new RegExp(`(?:(?:${SPACE_OR_PUNCTUATION.source})|^)${token}(?:(?:${SPACE_OR_PUNCTUATION.source})|$)`, `${SPACE_OR_PUNCTUATION.flags}i`);
+}
+/**
+ * Compute the proximity between two queries, bounded by a limit
+ *
+ * For every (left term, right term) pair that matched the same field, the proximity is the
+ * number of SPACE_OR_PUNCTUATION runs between the start offsets of a left match and a right
+ * match in that field's content. The smallest count seen is kept, and the function returns
+ * immediately once a count of exactly 1 is found.
+ *
+ * NOTE(review): `rightMatches` is a single-use iterator. It is exhausted while handling the
+ * first left match, so later left matches are never paired with any right match and a
+ * closer pair can be missed. Materialising it once (e.g. with Array.from) would fix this.
+ *
+ * NOTE(review): `content` is used as a string without a check; presumably extractField
+ * always yields text for a field listed in `matches` — confirm.
+ *
+ * @param record - parent search record
+ * @param left - first query
+ * @param right - second query
+ * @param bound - upper limit on computed proximity
+ * @returns the smallest proximity found, or `bound` when no pair improves on it
+ */
+function queryPairProximity(record, left, right, bound) {
+    // TODO: this is highly-nested, and probably slow
+    //       it should be re-written for performance
+    let bestProximity = bound;
+    // For each term in the left query
+    for (const [leftTerm, leftFields] of Object.entries(left.matches)) {
+        const leftPattern = buildRegExpToken(leftTerm);
+        // For each field matched with this left term
+        for (const leftField of leftFields) {
+            // Pull out the (left) field content; it is also the right content, since only equal fields are compared below
+            const content = extractField(record, leftField);
+            // For each term in the right query
+            for (const [rightTerm, rightFields] of Object.entries(right.matches)) {
+                const rightPattern = buildRegExpToken(rightTerm);
+                // For each field matched with this right term
+                for (const rightField of rightFields) {
+                    // Terms matching different fields can never be better than the bound
+                    if (leftField !== rightField) {
+                        continue;
+                    }
+                    // Lazily find the matches in the content for each pattern (matchAll returns one-shot iterators)
+                    const leftMatches = content.matchAll(leftPattern);
+                    const rightMatches = content.matchAll(rightPattern);
+                    // Iterate over match pairs
+                    for (const leftMatch of leftMatches) {
+                        for (const rightMatch of rightMatches) {
+                            // Order the two match start offsets so that start <= stop
+                            const [start, stop] = leftMatch.index < rightMatch.index
+                                ? [leftMatch.index, rightMatch.index]
+                                : [rightMatch.index, leftMatch.index];
+                            // Count the separator runs between the two match start offsets
+                            const numSeparators = Array.from(content.slice(start, stop).matchAll(SPACE_OR_PUNCTUATION)).length;
+                            // Fast-path: a count of 1 is treated as the best achievable value
+                            if (numSeparators === 1) {
+                                return 1;
+                            }
+                            // Does this result improve our current proximity?
+                            if (numSeparators < bestProximity) {
+                                bestProximity = numSeparators;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return bestProximity;
+}
+/**
+ * Compute the associative pair-wise proximity of a search result
+ *
+ * Sums queryPairProximity over each pair of adjacent queries (query i and query i + 1)
+ * and caps the total at `bound`. With fewer than two queries the loop body never runs
+ * and the sum stays at 0.
+ *
+ * @param result - search result
+ * @param bound - upper bound on each pair's proximity and on the final proximity
+ * @returns the capped sum of adjacent-pair proximities
+ */
+function wordsProximity(result, bound) {
+    const { queries } = result;
+    let proximity = 0;
+    for (let i = 0; i < queries.length - 1; i++) {
+        const left = queries[i];
+        const right = queries[i + 1];
+        proximity += queryPairProximity(result, left, right, bound);
+    }
+    return Math.min(proximity, bound);
+}
+/**
+ * Identify the best-matched attribute and the match position
+ *
+ * The attribute is the first entry of SEARCH_ATTRIBUTES_ORDERED that any query term matched.
+ * When that attribute is listed in POSITIONAL_SEARCH_ATTRIBUTES, the position is the smallest
+ * `match.index` of any of its terms in the field value; otherwise the position is `undefined`.
+ *
+ * NOTE(review): if none of the terms is found by buildRegExpToken in the field value,
+ * `Math.min()` is called with no arguments and the position becomes `Infinity`.
+ * NOTE(review): `attribute` is `undefined` when no matched field appears in
+ * SEARCH_ATTRIBUTES_ORDERED — confirm whether that can happen for real results.
+ *
+ * @param result - search result
+ * @returns an object with the `attribute` and `position` described above
+ */
+function matchedAttributePosition(result) {
+    // Build mapping from fields to terms matching that field
+    // i.e. invert and flatten `result.queries[...].matches`
+    const fieldToTerms = new Map();
+    result.queries.forEach((query) => {
+        Object.entries(query.matches).forEach(([term, fields]) => {
+            fields.forEach((field) => {
+                let terms = fieldToTerms.get(field);
+                if (!terms) {
+                    terms = [];
+                    fieldToTerms.set(field, terms);
+                }
+                terms.push(term);
+            });
+        });
+    });
+    // Find the highest-priority field that we matched (earliest in SEARCH_ATTRIBUTES_ORDERED)
+    const attribute = SEARCH_ATTRIBUTES_ORDERED.find((field) => fieldToTerms.has(field));
+    let position;
+    // If this field is positional, find the start of the text match
+    if (POSITIONAL_SEARCH_ATTRIBUTES.includes(attribute)) {
+        // Find the terms that this field matches
+        const attributeTerms = fieldToTerms.get(attribute);
+        // Extract the field value
+        const value = extractField(result, attribute);
+        // Match each term against the field value, and extract the match position
+        const matchPositions = attributeTerms
+            .flatMap((term) => Array.from(value.matchAll(buildRegExpToken(term))))
+            .map((match) => match.index);
+        // Find the smallest (earliest) match position
+        position = Math.min(...matchPositions);
+    }
+    // Otherwise, we don't care about the position
+    else {
+        position = undefined;
+    }
+    return { attribute, position };
+}
+/**
+ * Determine how many query terms matched the corpus as whole tokens
+ *
+ * A query counts when at least one of its matched terms occurs, bounded by separators or
+ * string boundaries (see buildRegExpToken), in one of the fields that term matched.
+ * Each query is counted at most once, keyed by its `term`; falsy terms are dropped
+ * by the final filter.
+ *
+ * @param result - search result
+ * @returns the number of distinct `query.term` values with such a match
+ */
+function matchedExactWords(result) {
+    const allMatches = result.queries.flatMap(
+    // For each query (foo bar baz -> foo, then bar, then baz)
+    (query) => Object.entries(query.matches)
+        .flatMap(
+    // For each (match, matched fields) pair in the query matches
+    ([match, fields]) => {
+        const pattern = buildRegExpToken(match);
+        return fields.flatMap(
+        // For each matched field
+        (field) => {
+            // Retrieve corpus and emit the query term once per occurrence of the pattern
+            const value = extractField(result, field);
+            return Array.from(value.matchAll(pattern)).map((m) => (m ? query.term : undefined));
+        });
+    })
+        .filter((item) => item));
+    const uniqueMatches = new Set(allMatches);
+    return uniqueMatches.size;
+}
+/**
+ * Determine the number of fuzzy matches in a search result
+ *
+ * For each query, counts the keys of `query.matches` that are not identical to
+ * `query.term`, then sums those counts over all queries.
+ *
+ * NOTE(review): `reduce` is called without an initial value, so an empty
+ * `result.queries` array throws a TypeError. Passing 0 as the initial value would
+ * make that case return 0.
+ *
+ * @param result - search result
+ * @returns total number of matched terms that differ from their query term
+ */
+function numberOfTypos(result) {
+    return result.queries
+        .map((query) => {
+        const typoTerms = Object.keys(query.matches).filter((match) => match !== query.term);
+        return typoTerms.length;
+    })
+        .reduce((sum, value) => sum + value);
+}
+/**
+ * Rank a search result using Algolia-derived metrics
+ *
+ * Returns a shallow copy of the result with an added `ranking` object holding:
+ * `typos` (numberOfTypos), `attribute` and `position` (matchedAttributePosition),
+ * `proximity` (wordsProximity), `exact` (matchedExactWords), `level` (the TYPE_WEIGHTS
+ * entry for `result.type`, `undefined` for a type not in the map) and `appearance`
+ * (copied from `result.position`).
+ *
+ * @param result - search result
+ * @returns the result extended with its `ranking`
+ */
+function rankSearchResult(result) {
+    return {
+        ...result,
+        ranking: {
+            typos: numberOfTypos(result),
+            ...matchedAttributePosition(result),
+            proximity: wordsProximity(result, 8), // TODO: the proximity bound is hard-coded to 8
+            exact: matchedExactWords(result),
+            level: TYPE_WEIGHTS.get(result.type),
+            appearance: result.position,
+        },
+    };
+}
+/**
+ * Compare ranked search results to prioritise higher rankings
+ *
+ * Criteria are applied in order, and the first one on which the two results differ decides:
+ *   1. `typos` - fewer first
+ *   2. `attribute` - earlier index in SEARCH_ATTRIBUTES_ORDERED first
+ *   3. `position` - smaller first (only when both results have one)
+ *   4. `proximity` - smaller first
+ *   5. `exact` - larger first
+ *   6. `level` - larger first
+ *   7. `appearance` - smaller first
+ *
+ * NOTE(review): an attribute that is not in SEARCH_ATTRIBUTES_ORDERED (e.g. `undefined`)
+ * gets index -1 from findIndex and therefore sorts ahead of every real attribute.
+ *
+ * @param left - ranked search result
+ * @param right - ranked search result
+ * @returns a negative number when `left` should come first, positive when `right` should, 0 on a tie
+ */
+function cmpRankedSearchResults(left, right) {
+    const leftRank = left.ranking;
+    const rightRank = right.ranking;
+    if (leftRank.typos !== rightRank.typos) {
+        return cmp(leftRank.typos, rightRank.typos);
+    }
+    if (leftRank.attribute !== rightRank.attribute) {
+        const i = SEARCH_ATTRIBUTES_ORDERED.findIndex((item) => item === leftRank.attribute);
+        const j = SEARCH_ATTRIBUTES_ORDERED.findIndex((item) => item === rightRank.attribute);
+        return cmp(i, j);
+    }
+    if (leftRank.position != null &&
+        rightRank.position != null &&
+        leftRank.position !== rightRank.position) {
+        return cmp(leftRank.position, rightRank.position);
+    }
+    if (leftRank.proximity !== rightRank.proximity) {
+        return cmp(leftRank.proximity, rightRank.proximity);
+    }
+    if (leftRank.exact !== rightRank.exact) {
+        return cmp(rightRank.exact, leftRank.exact);
+    }
+    if (leftRank.level !== rightRank.level) {
+        return cmp(rightRank.level, leftRank.level);
+    }
+    if (leftRank.appearance !== rightRank.appearance) {
+        return cmp(leftRank.appearance, rightRank.appearance);
+    }
+    return 0;
+}
+/**
+ * Rank raw search results and sort them best-first
+ *
+ * Every result is passed through rankSearchResult and the resulting new array is sorted
+ * with cmpRankedSearchResults. No result is removed, and the input array is not reordered.
+ *
+ * @param results - raw search results
+ * @returns a new, sorted array of ranked search results
+ */
+export function rankResults(results) {
+    return results.map(rankSearchResult).sort(cmpRankedSearchResults);
+}

package/dist/search.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+export declare const SPACE_OR_PUNCTUATION: RegExp;
+export declare function extractField(document: Record<string, unknown>, fieldName: string): string;
+//# sourceMappingURL=search.d.ts.map

package/dist/search.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"search.d.ts","sourceRoot":"","sources":["../src/search.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,oBAAoB,QAAwB,CAAC;AAC1D,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,MAAM,UAKhF"}

package/dist/search.js ADDED Viewed

@@ -0,0 +1,7 @@
+export const SPACE_OR_PUNCTUATION = /[\n\r\p{Z}\p{P}]+/gu; // a run of newlines, Unicode separators (\p{Z}) or punctuation (\p{P}); global + unicode flags
+export function extractField(document, fieldName) {
+    /* Access nested fields: `fieldName` is split on `.` and each key is looked up in turn,
+     * starting from `document`. The walk stops descending at the first falsy value, and
+     * that value is what gets returned.
+     * NOTE(review): the declaration file types the return as `string`, but nothing here
+     * checks that; a missing key yields `undefined` — confirm callers can rely on a string.
+     */
+    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
+    // @ts-ignore
+    return fieldName.split('.').reduce((doc, key) => doc && doc[key], document);
+}

package/dist/types.d.ts ADDED Viewed

@@ -0,0 +1,31 @@
+import type { SearchRecord, DocumentHierarchy } from 'myst-spec-ext';
+export type { MystSearchIndex, SearchRecord } from 'myst-spec-ext';
+export type HeadingLevel = keyof DocumentHierarchy;
+export type Query = {
+    term: string; // the query term; rank.js counts every key of `matches` that differs from it as a typo
+    matches: Record<string, string[]>; // matched term -> names of the fields in which that term matched
+};
+export type SearchResult = SearchRecord & {
+    id: string | number; // identifier of the result (not read by the ranking code in rank.js)
+    queries: Query[]; // one entry per query term, with the terms and fields it matched
+};
+export interface ISearch {
+    (query: string): Promise<SearchResult[] | undefined>; // call signature: takes the query string, resolves to results without ranking, or to undefined
+}
+export declare const SEARCH_ATTRIBUTES_ORDERED: readonly ["hierarchy.lvl1", "hierarchy.lvl2", "hierarchy.lvl3", "hierarchy.lvl4", "hierarchy.lvl5", "hierarchy.lvl6", "content"]; // order matters: earlier attributes rank higher
+export type AttributeType = (typeof SEARCH_ATTRIBUTES_ORDERED)[number]; // union of the attribute names listed above
+/**
+ * Type describing a search result that has ranking
+ *
+ * As populated by rank.js:
+ * - `typos`: number of matched terms that differ from their query term
+ * - `attribute`: first entry of SEARCH_ATTRIBUTES_ORDERED that was matched
+ * - `position`: offset of the earliest match, set only for positional attributes
+ * - `proximity`: summed separator distance between adjacent query terms, capped
+ * - `exact`: number of distinct query terms found as whole tokens
+ * - `level`: weight of the record type (headings outrank content)
+ * - `appearance`: the record's own `position` value
+ *
+ * NOTE(review): `attribute` and `level` are declared as required, but rank.js can
+ * produce `undefined` for both — confirm whether the types should allow that.
+ */
+export type RankedSearchResult = SearchResult & {
+    ranking: {
+        typos: number;
+        attribute: AttributeType;
+        position?: number;
+        proximity: number;
+        exact: number;
+        level: number;
+        appearance: number;
+    };
+};
+//# sourceMappingURL=types.d.ts.map

package/dist/types.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AACrE,YAAY,EAAE,eAAe,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAEnE,MAAM,MAAM,YAAY,GAAG,MAAM,iBAAiB,CAAC;AAEnD,MAAM,MAAM,KAAK,GAAG;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;CACnC,CAAC;AAEF,MAAM,MAAM,YAAY,GAAG,YAAY,GAAG;IACxC,EAAE,EAAE,MAAM,GAAG,MAAM,CAAC;IACpB,OAAO,EAAE,KAAK,EAAE,CAAC;CAClB,CAAC;AAEF,MAAM,WAAW,OAAO;IACtB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,GAAG,SAAS,CAAC,CAAC;CACtD;AAGD,eAAO,MAAM,yBAAyB,kIAQ5B,CAAC;AAEX,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,yBAAyB,CAAC,CAAC,MAAM,CAAC,CAAC;AAEvE;;GAEG;AACH,MAAM,MAAM,kBAAkB,GAAG,YAAY,GAAG;IAC9C,OAAO,EAAE;QAEP,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,aAAa,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH,CAAC"}

package/dist/types.js ADDED Viewed

@@ -0,0 +1,10 @@
+/* Search ranking: searchable attributes in priority order. rank.js takes the first entry
+ * that a result matched as that result's `attribute`, and results whose attribute
+ * appears earlier in this list sort first.
+ */
+export const SEARCH_ATTRIBUTES_ORDERED = [
+    'hierarchy.lvl1',
+    'hierarchy.lvl2',
+    'hierarchy.lvl3',
+    'hierarchy.lvl4',
+    'hierarchy.lvl5',
+    'hierarchy.lvl6',
+    'content',
+];

package/package.json ADDED Viewed

@@ -0,0 +1,20 @@
+{
+  "name": "@myst-theme/search",
+  "version": "0.0.0",
+  "type": "module",
+  "exports": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "files": [
+    "dist"
+  ],
+  "license": "MIT",
+  "sideEffects": false,
+  "scripts": {
+    "clean": "rimraf dist",
+    "lint": "eslint \"src/**/*.ts*\" -c ./.eslintrc.cjs",
+    "lint:format": "prettier --check \"src/**/*.{ts,tsx,md}\"",
+    "build:esm": "tsc --project ./tsconfig.json --module Node16 --outDir dist --declaration",
+    "build": "npm-run-all -l clean -p build:esm"
+  },
+  "dependencies": {}
+}