@myst-theme/search 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,3 @@
1
+ # @myst-theme/search
2
+
3
+ An implementation and spec for client-side searching in MyST sites.
@@ -0,0 +1,4 @@
1
+ export * from './types.js';
2
+ export * from './rank.js';
3
+ export * from './search.js';
4
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,YAAY,CAAC;AAC3B,cAAc,WAAW,CAAC;AAC1B,cAAc,aAAa,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,3 @@
1
+ export * from './types.js';
2
+ export * from './rank.js';
3
+ export * from './search.js';
package/dist/rank.d.ts ADDED
@@ -0,0 +1,7 @@
1
+ import type { SearchResult, RankedSearchResult, AttributeType } from './types.js';
2
+ export declare const POSITIONAL_SEARCH_ATTRIBUTES: AttributeType[];
3
+ /**
4
+ * Rank raw search results and sort them from best to worst match
5
+ */
6
+ export declare function rankResults(results: SearchResult[]): RankedSearchResult[];
7
+ //# sourceMappingURL=rank.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rank.d.ts","sourceRoot":"","sources":["../src/rank.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAS,YAAY,EAAE,kBAAkB,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAIzF,eAAO,MAAM,4BAA4B,EAAE,aAAa,EAAyB,CAAC;AAmSlF;;GAEG;AACH,wBAAgB,WAAW,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,kBAAkB,EAAE,CAEzE"}
package/dist/rank.js ADDED
@@ -0,0 +1,251 @@
1
+ import { SEARCH_ATTRIBUTES_ORDERED } from './types.js';
2
+ import { extractField, SPACE_OR_PUNCTUATION } from './search.js';
3
+ export const POSITIONAL_SEARCH_ATTRIBUTES = ['content']; // attributes for which the offset of the earliest match is computed when ranking
4
+ // Weights that prioritise headings over content: lvl1 weighs the most, body content weighs 0
5
+ const TYPE_WEIGHTS = new Map([
6
+ ['lvl1', 90],
7
+ ['lvl2', 80],
8
+ ['lvl3', 70],
9
+ ['lvl4', 60],
10
+ ['lvl5', 50],
11
+ ['lvl6', 40],
12
+ ['content', 0],
13
+ ]);
/**
 * Three-way comparison of two values using the `<` and `>` operators.
 *
 * @param left - left value
 * @param right - right value
 * @returns `-1` when `left < right`, `+1` when `left > right`, and `0` otherwise
 */
function cmp(left, right) {
  return left < right ? -1 : left > right ? +1 : 0;
}
/**
 * Build a RegExp that matches a single token bounded by SPACE_OR_PUNCTUATION, or by the string boundaries.
 *
 * The token is matched literally: regular-expression syntax characters inside it are escaped,
 * so a token such as `c++` neither throws nor changes the meaning of the pattern.
 * The pattern reuses the flags of SPACE_OR_PUNCTUATION and adds `i` (case-insensitive).
 *
 * @param token - token to match, e.g. `foo`
 * @returns pattern matching `token` together with the separators (if any) that delimit it
 */
function buildRegExpToken(token) {
  const separator = `(?:${SPACE_OR_PUNCTUATION.source})`;
  // Escape only syntax characters; these escapes are also valid under the `u` flag
  const literalToken = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp(
    `(?:${separator}|^)${literalToken}(?:${separator}|$)`,
    `${SPACE_OR_PUNCTUATION.flags}i`,
  );
}
/**
 * Compute the proximity between two queries, bounded by a limit.
 *
 * For every field matched by both a left term and a right term, each left match is paired
 * with each right match, and the number of SPACE_OR_PUNCTUATION runs between the two match
 * offsets is counted. The smallest count found is returned.
 *
 * @param record - parent search record
 * @param left - first query
 * @param right - second query
 * @param bound - upper limit on computed proximity
 * @returns the smallest separator count over all match pairs, or `bound` when no pair beats it
 */
function queryPairProximity(record, left, right, bound) {
  let bestProximity = bound;
  // Compile the right-hand patterns once, rather than once per left term and field
  const rightPatterns = Object.entries(right.matches).map(([term, fields]) => ({
    pattern: buildRegExpToken(term),
    fields,
  }));
  // For each term in the left query
  for (const [leftTerm, leftFields] of Object.entries(left.matches)) {
    const leftPattern = buildRegExpToken(leftTerm);
    // For each field matched with this left term
    for (const leftField of leftFields) {
      // Pull out the (left) field content
      const content = extractField(record, leftField);
      if (typeof content !== 'string') {
        // Nothing to search in; this field cannot improve on the bound
        continue;
      }
      const leftPositions = Array.from(content.matchAll(leftPattern), (match) => match.index);
      if (leftPositions.length === 0) {
        continue;
      }
      // For each term in the right query
      for (const { pattern, fields } of rightPatterns) {
        // Terms matching different fields can never be better than the bound
        if (!fields.includes(leftField)) {
          continue;
        }
        // `matchAll` returns a one-shot iterator: materialise the offsets so that
        // every left match is paired with every right match, not just the first one
        const rightPositions = Array.from(content.matchAll(pattern), (match) => match.index);
        for (const leftPosition of leftPositions) {
          for (const rightPosition of rightPositions) {
            // Find the ordered (start, stop) pair for these two matches
            const start = Math.min(leftPosition, rightPosition);
            const stop = Math.max(leftPosition, rightPosition);
            // Identify how many token separators there are in this range
            const numSeparators = Array.from(
              content.slice(start, stop).matchAll(SPACE_OR_PUNCTUATION),
            ).length;
            // Fast-path, can never beat 1!
            if (numSeparators === 1) {
              return 1;
            }
            // Does this result improve our current proximity?
            if (numSeparators < bestProximity) {
              bestProximity = numSeparators;
            }
          }
        }
      }
    }
  }
  return bestProximity;
}
/**
 * Sum the proximity of each adjacent pair of queries in a search result,
 * capping the total at `bound`.
 *
 * @param result - search result
 * @param bound - upper bound on each pair-wise proximity and on the final proximity
 * @returns the capped total; `0` when there are fewer than two queries and `bound` is non-negative
 */
function wordsProximity(result, bound) {
  const total = result.queries
    .slice(1)
    .reduce(
      (sum, next, index) => sum + queryPairProximity(result, result.queries[index], next, bound),
      0,
    );
  return Math.min(total, bound);
}
/**
 * Identify the best-matched attribute and, for positional attributes, the match position.
 *
 * The best attribute is the first entry of SEARCH_ATTRIBUTES_ORDERED that any query term matched.
 *
 * @param result - search result
 * @returns the matched `attribute`, plus `position`: the earliest match offset when the
 *   attribute is listed in POSITIONAL_SEARCH_ATTRIBUTES, otherwise `undefined`
 */
function matchedAttributePosition(result) {
  // Invert and flatten `result.queries[...].matches` into field -> matching terms
  const termsByField = new Map();
  for (const query of result.queries) {
    for (const [term, fields] of Object.entries(query.matches)) {
      for (const field of fields) {
        if (termsByField.has(field)) {
          termsByField.get(field).push(term);
        } else {
          termsByField.set(field, [term]);
        }
      }
    }
  }
  // First field, in ranking order, that has at least one matching term
  const attribute = SEARCH_ATTRIBUTES_ORDERED.find((field) => termsByField.has(field));
  // Non-positional attributes carry no position
  if (!POSITIONAL_SEARCH_ATTRIBUTES.includes(attribute)) {
    return { attribute, position: undefined };
  }
  // Collect the offset of every match of every term in the field value
  const value = extractField(result, attribute);
  const offsets = [];
  for (const term of termsByField.get(attribute)) {
    for (const match of value.matchAll(buildRegExpToken(term))) {
      offsets.push(match.index);
    }
  }
  // The smallest offset is the earliest match
  return { attribute, position: Math.min(...offsets) };
}
/**
 * Count the distinct query terms for which at least one matched term occurs in one of its
 * matched fields as a whole token (as delimited by `buildRegExpToken`).
 *
 * @param result - search result
 * @returns number of distinct, non-empty query terms with such a match
 */
function matchedExactWords(result) {
  const exactTerms = new Set();
  // For each query (foo bar baz -> foo, then bar, then baz)
  for (const query of result.queries) {
    // For each (match, matched fields) pair in the query matches
    for (const [match, fields] of Object.entries(query.matches)) {
      const pattern = buildRegExpToken(match);
      // For each matched field
      for (const field of fields) {
        // Retrieve corpus and test for pattern
        const value = extractField(result, field);
        const hits = Array.from(value.matchAll(pattern));
        // Falsy (e.g. empty) query terms are never counted
        if (hits.length > 0 && query.term) {
          exactTerms.add(query.term);
        }
      }
    }
  }
  return exactTerms.size;
}
/**
 * Determine the number of fuzzy matches in a search result.
 *
 * A matched term counts as a typo when it differs from the term of the query that produced it.
 *
 * @param result - search result
 * @returns total number of matched terms that differ from their query term; `0` when the
 *   result has no queries
 */
function numberOfTypos(result) {
  let typos = 0;
  for (const query of result.queries) {
    for (const match of Object.keys(query.matches)) {
      if (match !== query.term) {
        typos += 1;
      }
    }
  }
  return typos;
}
/**
 * Attach Algolia-derived ranking metrics to a search result.
 *
 * @param result - search result
 * @returns a shallow copy of `result` with an added `ranking` object
 */
function rankSearchResult(result) {
  const typos = numberOfTypos(result);
  // Contributes the `attribute` and `position` entries
  const attributeMatch = matchedAttributePosition(result);
  const proximity = wordsProximity(result, 8); // TODO
  const exact = matchedExactWords(result);
  const ranking = {
    typos,
    ...attributeMatch,
    proximity,
    exact,
    level: TYPE_WEIGHTS.get(result.type),
    appearance: result.position,
  };
  return { ...result, ranking };
}
/**
 * Comparator that orders ranked search results from best to worst.
 *
 * Criteria are consulted in order, and the first one on which the two results differ decides:
 * fewer typos, earlier attribute in SEARCH_ATTRIBUTES_ORDERED, smaller position (only when
 * both results have one), smaller proximity, more exact matches, higher level weight,
 * smaller appearance.
 *
 * @param left - ranked search result
 * @param right - ranked search result
 * @returns a negative number when `left` sorts first, positive when `right` does, else `0`
 */
function cmpRankedSearchResults(left, right) {
  const a = left.ranking;
  const b = right.ranking;
  const attributeIndex = (attribute) =>
    SEARCH_ATTRIBUTES_ORDERED.findIndex((item) => item === attribute);
  // Each stage is [do the two results differ on this criterion?, how to order them if so]
  const stages = [
    [a.typos !== b.typos, () => cmp(a.typos, b.typos)],
    [a.attribute !== b.attribute, () => cmp(attributeIndex(a.attribute), attributeIndex(b.attribute))],
    [
      a.position != null && b.position != null && a.position !== b.position,
      () => cmp(a.position, b.position),
    ],
    [a.proximity !== b.proximity, () => cmp(a.proximity, b.proximity)],
    // Operands are swapped so that larger values sort first
    [a.exact !== b.exact, () => cmp(b.exact, a.exact)],
    [a.level !== b.level, () => cmp(b.level, a.level)],
    [a.appearance !== b.appearance, () => cmp(a.appearance, b.appearance)],
  ];
  for (const [differs, order] of stages) {
    if (differs) {
      return order();
    }
  }
  return 0;
}
/**
 * Rank raw search results and sort them from best to worst match.
 *
 * @param results - raw search results; the input array is left untouched
 * @returns a new array of ranked results ordered by `cmpRankedSearchResults`
 */
export function rankResults(results) {
  const ranked = results.map(rankSearchResult);
  ranked.sort(cmpRankedSearchResults);
  return ranked;
}
@@ -0,0 +1,3 @@
1
+ export declare const SPACE_OR_PUNCTUATION: RegExp;
2
+ export declare function extractField(document: Record<string, unknown>, fieldName: string): string;
3
+ //# sourceMappingURL=search.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"search.d.ts","sourceRoot":"","sources":["../src/search.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,oBAAoB,QAAwB,CAAC;AAC1D,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,MAAM,UAKhF"}
package/dist/search.js ADDED
@@ -0,0 +1,7 @@
1
+ export const SPACE_OR_PUNCTUATION = /[\n\r\p{Z}\p{P}]+/gu; // one or more newline, Unicode separator (\p{Z}) or Unicode punctuation (\p{P}) characters
/**
 * Read a (possibly nested) field from a document using a dot-separated path.
 *
 * @param document - document to read from
 * @param fieldName - field path, e.g. `content` or `hierarchy.lvl1`
 * @returns the value at the path; if a falsy value is met along the way, that value is returned
 */
export function extractField(document, fieldName) {
  // Access nested fields
  let current = document;
  for (const key of fieldName.split('.')) {
    // Stop descending as soon as there is nothing left to index into
    if (!current) {
      break;
    }
    current = current[key];
  }
  return current;
}
@@ -0,0 +1,31 @@
1
+ import type { SearchRecord, DocumentHierarchy } from 'myst-spec-ext';
2
+ export type { MystSearchIndex, SearchRecord } from 'myst-spec-ext';
3
+ export type HeadingLevel = keyof DocumentHierarchy;
4
+ export type Query = {
5
+ term: string;
6
+ matches: Record<string, string[]>;
7
+ };
8
+ export type SearchResult = SearchRecord & {
9
+ id: string | number;
10
+ queries: Query[];
11
+ };
12
+ export interface ISearch {
13
+ (query: string): Promise<SearchResult[] | undefined>;
14
+ }
15
+ export declare const SEARCH_ATTRIBUTES_ORDERED: readonly ["hierarchy.lvl1", "hierarchy.lvl2", "hierarchy.lvl3", "hierarchy.lvl4", "hierarchy.lvl5", "hierarchy.lvl6", "content"];
16
+ export type AttributeType = (typeof SEARCH_ATTRIBUTES_ORDERED)[number];
17
+ /**
18
+ * Type describing a search result that has ranking
19
+ */
20
+ export type RankedSearchResult = SearchResult & {
21
+ ranking: {
22
+ typos: number;
23
+ attribute: AttributeType;
24
+ position?: number;
25
+ proximity: number;
26
+ exact: number;
27
+ level: number;
28
+ appearance: number;
29
+ };
30
+ };
31
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AACrE,YAAY,EAAE,eAAe,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAEnE,MAAM,MAAM,YAAY,GAAG,MAAM,iBAAiB,CAAC;AAEnD,MAAM,MAAM,KAAK,GAAG;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;CACnC,CAAC;AAEF,MAAM,MAAM,YAAY,GAAG,YAAY,GAAG;IACxC,EAAE,EAAE,MAAM,GAAG,MAAM,CAAC;IACpB,OAAO,EAAE,KAAK,EAAE,CAAC;CAClB,CAAC;AAEF,MAAM,WAAW,OAAO;IACtB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,GAAG,SAAS,CAAC,CAAC;CACtD;AAGD,eAAO,MAAM,yBAAyB,kIAQ5B,CAAC;AAEX,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,yBAAyB,CAAC,CAAC,MAAM,CAAC,CAAC;AAEvE;;GAEG;AACH,MAAM,MAAM,kBAAkB,GAAG,YAAY,GAAG;IAC9C,OAAO,EAAE;QAEP,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,aAAa,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH,CAAC"}
package/dist/types.js ADDED
@@ -0,0 +1,10 @@
1
+ /// Search ranking: searchable attributes, listed from highest to lowest ranking priority
2
+ export const SEARCH_ATTRIBUTES_ORDERED = [
3
+ 'hierarchy.lvl1',
4
+ 'hierarchy.lvl2',
5
+ 'hierarchy.lvl3',
6
+ 'hierarchy.lvl4',
7
+ 'hierarchy.lvl5',
8
+ 'hierarchy.lvl6',
9
+ 'content',
10
+ ];
package/package.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "@myst-theme/search",
3
+ "version": "0.0.0",
4
+ "type": "module",
5
+ "exports": "./dist/index.js",
6
+ "types": "./dist/index.d.ts",
7
+ "files": [
8
+ "dist"
9
+ ],
10
+ "license": "MIT",
11
+ "sideEffects": false,
12
+ "scripts": {
13
+ "clean": "rimraf dist",
14
+ "lint": "eslint \"src/**/*.ts*\" -c ./.eslintrc.cjs",
15
+ "lint:format": "prettier --check \"src/**/*.{ts,tsx,md}\"",
16
+ "build:esm": "tsc --project ./tsconfig.json --module Node16 --outDir dist --declaration",
17
+ "build": "npm-run-all -l clean -p build:esm"
18
+ },
19
+ "dependencies": {}
20
+ }