@se-studio/search 1.0.36 → 1.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
1
1
  # @se-studio/search
2
2
 
3
+ ## 1.0.38
4
+
5
+ ### Patch Changes
6
+
7
+ - Version bump: patch for changed packages
8
+ - Updated dependencies
9
+ - @se-studio/contentful-rest-api@1.0.132
10
+ - @se-studio/core-data-types@1.0.127
11
+ - @se-studio/markdown-renderer@1.0.91
12
+ - @se-studio/wordpress-rest-api@1.0.8
13
+
14
+ ## 1.0.37
15
+
16
+ ### Patch Changes
17
+
18
+ - Version bump: patch for changed packages
19
+ - Updated dependencies
20
+ - @se-studio/contentful-rest-api@1.0.131
21
+ - @se-studio/core-data-types@1.0.126
22
+ - @se-studio/markdown-renderer@1.0.90
23
+ - @se-studio/wordpress-rest-api@1.0.7
24
+
3
25
  ## 1.0.36
4
26
 
5
27
  ### Patch Changes
package/README.md CHANGED
@@ -198,6 +198,30 @@ documentTransformer?: (doc: SearchDocument, contentData: ContentData) => SearchD
198
198
 
199
199
  The transformer runs on every chunk of a multi-chunk entry, so a 3-chunk page will call it 3 times.
200
200
 
201
+ ## Canonical URLs and custom indexes (`IIndexableContent`)
202
+
203
+ For **Upstash Search**, each `SearchDocument` already carries **`metadata.href`** (full path), populated from the CMS model’s `href` in `buildSearchDocuments` — do not infer URLs from `slug` + content type at query time.
204
+
205
+ If you maintain a **separate** full-text index (e.g. Lunr, blob JSON), use the shared **`IIndexableContent`** type from `@se-studio/search`. It requires:
206
+
207
+ - **`href`** – full canonical path as used in `<a href>` (e.g. `/insights/my-article`)
208
+ - **`content`**, **`title`**, **`id`**, **`contentType`** (string label), **`metadata`**
209
+
210
+ Optional **`slug`** remains useful for tokenisation; do not rely on it alone to build user-facing links.
211
+
212
+ Helpers:
213
+
214
+ - **`stripMarkdownToPlainText`**, **`calculateReadingTime`** – `@se-studio/search/indexing`
215
+ - **`indexArticleLinksToIndexableContent`** – builds `IIndexableContent[]` from `IArticleLink[]` using **one `MarkdownExporter.fetchContent()` call per article** (see below). Pass **`markdownContext`** (`config`, `siteConfig`, `urlCalculators`, optional `customConverters`); `contentContext` is supplied per entry from each `fetchContent` result.
216
+
217
+ ## Contentful indexing hazards (includes truncation)
218
+
219
+ The Contentful REST API can **silently truncate** `includes.Entry` and `includes.Asset` when a single response is large — for example, batching many entries in **one** `getEntries` call with **`include`** and walking `includes` to resolve rich text or linked entries often yields **incomplete** trees. Symptoms include missing body text or featured images in the index, with **no error** from the API.
220
+
221
+ **Recommended default:** fetch **each** entry individually via **`MarkdownExporter.fetchContent()`** (same pattern as `rebuildSearchIndex` in this package: one call per link/slug). That uses the normal per-entry REST/converter path instead of a multi-entry batched query.
222
+
223
+ If you must batch requests, use a strategy that does not depend on every linked entry being resolved in one response (e.g. lower depth, smaller batches, or follow-up fetches) — the truncation happens in the Contentful API, so this package cannot work around it for you.
224
+
201
225
  ## Architecture
202
226
 
203
227
  - **Single Upstash database, two indexes**: `published` and `preview`
@@ -213,7 +237,7 @@ The transformer runs on every chunk of a multi-chunk entry, so a 3-chunk page wi
213
237
  |--------|---------|
214
238
  | `@se-studio/search` | Types only |
215
239
  | `@se-studio/search/client` | `createSearchClient()` |
216
- | `@se-studio/search/indexing` | `rebuildSearchIndex()`, `buildSearchDocument()` |
240
+ | `@se-studio/search/indexing` | `rebuildSearchIndex()`, `buildSearchDocuments()`, `indexArticleLinksToIndexableContent()`, `stripMarkdownToPlainText()`, `calculateReadingTime()`, … |
217
241
  | `@se-studio/search/webhook` | `createSearchWebhookHandler()` |
218
242
  | `@se-studio/search/api` | `createSearchApiHandler()`, `createRebuildApiHandler()` |
219
243
  | `@se-studio/search/hooks` | `useSearch()` |
package/dist/index.d.ts CHANGED
@@ -1,3 +1,4 @@
1
1
  export type { ContentData } from '@se-studio/markdown-renderer';
2
+ export type { IIndexableContent } from './indexable-content';
2
3
  export type { ContentTypeIndexConfig, RebuildResult, SearchableContentType, SearchConfig, SearchDocument, SearchDocumentMetadata, SearchIndexConfig, SearchIndexingConfig, SearchOptions, SearchResponse, SearchResult, } from './types';
3
4
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,YAAY,EAAE,WAAW,EAAE,MAAM,8BAA8B,CAAC;AAChE,YAAY,EACV,sBAAsB,EACtB,aAAa,EACb,qBAAqB,EACrB,YAAY,EACZ,cAAc,EACd,sBAAsB,EACtB,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,cAAc,EACd,YAAY,GACb,MAAM,SAAS,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,YAAY,EAAE,WAAW,EAAE,MAAM,8BAA8B,CAAC;AAChE,YAAY,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAC7D,YAAY,EACV,sBAAsB,EACtB,aAAa,EACb,qBAAqB,EACrB,YAAY,EACZ,cAAc,EACd,sBAAsB,EACtB,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,cAAc,EACd,YAAY,GACb,MAAM,SAAS,CAAC"}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Flat document shape for building non-Upstash indexes (e.g. Lunr, blob-stored indexes).
3
+ * Use {@link SearchDocument} / {@link SearchDocumentMetadata} for Upstash Search instead.
4
+ *
5
+ * **href** must be the full path as used in `<a href>` (e.g. `/insights/my-article`), not a
6
+ * bare slug — URL topology must not be inferred from `slug` + `contentType` at query time.
7
+ */
8
+ export interface IIndexableContent {
9
+ id: string;
10
+ /** Full canonical path for links, e.g. `/insights/my-article` */
11
+ href: string;
12
+ /** Logical type label (app-defined); e.g. `article`, `page`, `person` */
13
+ contentType: string;
14
+ title: string;
15
+ /** Plain-text body used for full-text indexing */
16
+ content: string;
17
+ /** Optional bare slug for tokenisation or legacy callers */
18
+ slug?: string;
19
+ date?: string;
20
+ tags?: string[];
21
+ author?: string;
22
+ metadata: Record<string, unknown>;
23
+ }
24
+ //# sourceMappingURL=indexable-content.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"indexable-content.d.ts","sourceRoot":"","sources":["../src/indexable-content.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,WAAW,iBAAiB;IAChC,EAAE,EAAE,MAAM,CAAC;IACX,iEAAiE;IACjE,IAAI,EAAE,MAAM,CAAC;IACb,yEAAyE;IACzE,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,OAAO,EAAE,MAAM,CAAC;IAChB,4DAA4D;IAC5D,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=indexable-content.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"indexable-content.js","sourceRoot":"","sources":["../src/indexable-content.ts"],"names":[],"mappings":""}
@@ -0,0 +1,48 @@
1
+ import type { IArticleLink } from '@se-studio/core-data-types';
2
+ import { type ContentData, type MarkdownConverterContext, type MarkdownExporter } from '@se-studio/markdown-renderer';
3
+ import type { IIndexableContent } from '../indexable-content';
4
+ /** Same fields as {@link MarkdownConverterContext} except `contentContext` (supplied per fetch). */
5
+ export type ArticleMarkdownConverterParams = Omit<MarkdownConverterContext, 'contentContext'>;
6
+ export interface ArticleIndexableHooks {
7
+ /**
8
+ * Return false to skip a link. Defaults to {@link defaultShouldIndexArticleLink}
9
+ * when omitted.
10
+ */
11
+ shouldIndexArticleLink?(link: IArticleLink): boolean;
12
+ /**
13
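+ * Override the plain-text body after markdown conversion (e.g. custom truncation). Also called with an empty string and `contentData` of `null` when the fetch failed or returned nothing.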
14
+ */
15
+ mapPlainTextBody?(link: IArticleLink, plainText: string, contentData: ContentData | null): string;
16
+ /**
17
+ * Build extra metadata to merge over the defaults (`originalType`, `link`, and `readingTime` when the body is non-empty); returned keys override the defaults.
18
+ */
19
+ buildMetadata?(link: IArticleLink, plainTextBody: string, contentData: ContentData | null): Record<string, unknown>;
20
+ }
21
+ export interface IndexArticleLinksOptions {
22
+ exporter: MarkdownExporter;
23
+ /**
24
+ * Passed into each `MarkdownConverter.convert()` together with `contentData.context`
25
+ * from `fetchContent` (the entry-specific context is required for correct markdown).
26
+ */
27
+ markdownContext: ArticleMarkdownConverterParams;
28
+ /**
29
+ * Fallback `articleType` passed to `fetchContent('article', slug, { articleType })` when
30
+ * `link.articleType?.slug` is missing. Defaults to `blog`, mirroring {@link rebuildSearchIndex}.
31
+ */
32
+ defaultArticleTypeSlug?: string;
33
+ /** Label stored on each `IIndexableContent.contentType` (default: `article`). */
34
+ indexableContentType?: string;
35
+ hooks?: ArticleIndexableHooks;
36
+ onProgress?: (current: number, total: number, itemName: string) => Promise<void>;
37
+ }
38
+ /**
39
+ * Default article link filter: internal paths only, no download articles.
40
+ */
41
+ export declare function defaultShouldIndexArticleLink(link: IArticleLink): boolean;
42
+ /**
43
+ * Builds {@link IIndexableContent} for each article link using **one**
44
+ * `MarkdownExporter.fetchContent()` call per article (avoids Contentful `includes` truncation
45
+ * from batched entry queries).
46
+ */
47
+ export declare function indexArticleLinksToIndexableContent(links: readonly IArticleLink[], options: IndexArticleLinksOptions): Promise<IIndexableContent[]>;
48
+ //# sourceMappingURL=article-indexable.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"article-indexable.d.ts","sourceRoot":"","sources":["../../src/indexing/article-indexable.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAY,MAAM,4BAA4B,CAAC;AACzE,OAAO,EACL,KAAK,WAAW,EAGhB,KAAK,wBAAwB,EAC7B,KAAK,gBAAgB,EACtB,MAAM,8BAA8B,CAAC;AACtC,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAG9D,oGAAoG;AACpG,MAAM,MAAM,8BAA8B,GAAG,IAAI,CAAC,wBAAwB,EAAE,gBAAgB,CAAC,CAAC;AAE9F,MAAM,WAAW,qBAAqB;IACpC;;;OAGG;IACH,sBAAsB,CAAC,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC;IACrD;;OAEG;IACH,gBAAgB,CAAC,CAAC,IAAI,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,GAAG,IAAI,GAAG,MAAM,CAAC;IAClG;;OAEG;IACH,aAAa,CAAC,CACZ,IAAI,EAAE,YAAY,EAClB,aAAa,EAAE,MAAM,EACrB,WAAW,EAAE,WAAW,GAAG,IAAI,GAC9B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5B;AAED,MAAM,WAAW,wBAAwB;IACvC,QAAQ,EAAE,gBAAgB,CAAC;IAC3B;;;OAGG;IACH,eAAe,EAAE,8BAA8B,CAAC;IAChD;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC,iFAAiF;IACjF,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,KAAK,CAAC,EAAE,qBAAqB,CAAC;IAC9B,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CAClF;AAkBD;;GAEG;AACH,wBAAgB,6BAA6B,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAQzE;AAMD;;;;GAIG;AACH,wBAAsB,mCAAmC,CACvD,KAAK,EAAE,SAAS,YAAY,EAAE,EAC9B,OAAO,EAAE,wBAAwB,GAChC,OAAO,CAAC,iBAAiB,EAAE,CAAC,CA0E9B"}
@@ -0,0 +1,95 @@
1
+ import { cleanMarkdownText, MarkdownConverter, } from '@se-studio/markdown-renderer';
2
+ import { calculateReadingTime, stripMarkdownToPlainText } from './readability';
3
+ function tagTitle(t) {
4
+ return t.title ?? t.name ?? t.slug ?? '';
5
+ }
6
+ function defaultTags(link) {
7
+ const raw = link.tags ?? [];
8
+ const names = raw.map(tagTitle).filter(Boolean);
9
+ return names.length > 0 ? names : ['article'];
10
+ }
11
+ function defaultAuthor(link) {
12
+ const a = link.author;
13
+ if (!a)
14
+ return undefined;
15
+ return a.name ?? a.title ?? undefined;
16
+ }
17
+ /**
18
+ * Default article link filter: internal paths only, no download articles.
19
+ */
20
+ export function defaultShouldIndexArticleLink(link) {
21
+ if (!link.href || link.href.startsWith('http')) {
22
+ return false;
23
+ }
24
+ if (link.download) {
25
+ return false;
26
+ }
27
+ return true;
28
+ }
29
+ function resolveArticleTypeSlug(link, fallback) {
30
+ return link.articleType?.slug ?? fallback;
31
+ }
32
+ /**
33
+ * Builds {@link IIndexableContent} for each article link using **one**
34
+ * `MarkdownExporter.fetchContent()` call per article (avoids Contentful `includes` truncation
35
+ * from batched entry queries).
36
+ */
37
+ export async function indexArticleLinksToIndexableContent(links, options) {
38
+ const { exporter, markdownContext, defaultArticleTypeSlug = 'blog', indexableContentType = 'article', hooks, onProgress, } = options;
39
+ const shouldIndex = hooks?.shouldIndexArticleLink ?? defaultShouldIndexArticleLink;
40
+ const converter = new MarkdownConverter();
41
+ const results = [];
42
+ const indexable = links.filter((l) => shouldIndex(l));
43
+ const total = indexable.length;
44
+ let current = 0;
45
+ for (const link of indexable) {
46
+ current++;
47
+ const articleType = resolveArticleTypeSlug(link, defaultArticleTypeSlug);
48
+ let contentData = null;
49
+ try {
50
+ contentData = await exporter.fetchContent('article', link.slug, { articleType });
51
+ }
52
+ catch {
53
+ contentData = null;
54
+ }
55
+ let plainBody = '';
56
+ if (contentData) {
57
+ const markdown = converter.convert(contentData, {
58
+ ...markdownContext,
59
+ contentContext: contentData.context,
60
+ });
61
+ plainBody = stripMarkdownToPlainText(cleanMarkdownText(markdown));
62
+ }
63
+ if (hooks?.mapPlainTextBody) {
64
+ plainBody = hooks.mapPlainTextBody(link, plainBody, contentData);
65
+ }
66
+ const readingMinutes = plainBody.trim() ? calculateReadingTime(plainBody) : undefined;
67
+ const tagNames = defaultTags(link);
68
+ const authorName = defaultAuthor(link);
69
+ const baseMeta = {
70
+ originalType: 'article',
71
+ link: link.href,
72
+ ...(readingMinutes !== undefined ? { readingTime: `${readingMinutes} min read` } : {}),
73
+ };
74
+ const metadata = hooks?.buildMetadata
75
+ ? { ...baseMeta, ...hooks.buildMetadata(link, plainBody, contentData) }
76
+ : baseMeta;
77
+ results.push({
78
+ id: link.id,
79
+ href: link.href,
80
+ contentType: indexableContentType,
81
+ title: link.title ?? link.text ?? '',
82
+ content: plainBody,
83
+ slug: link.slug,
84
+ date: link.date ?? undefined,
85
+ tags: tagNames,
86
+ author: authorName,
87
+ metadata,
88
+ });
89
+ if (onProgress) {
90
+ await onProgress(current, total, link.title ?? link.text ?? '');
91
+ }
92
+ }
93
+ return results;
94
+ }
95
+ //# sourceMappingURL=article-indexable.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"article-indexable.js","sourceRoot":"","sources":["../../src/indexing/article-indexable.ts"],"names":[],"mappings":"AACA,OAAO,EAEL,iBAAiB,EACjB,iBAAiB,GAGlB,MAAM,8BAA8B,CAAC;AAEtC,OAAO,EAAE,oBAAoB,EAAE,wBAAwB,EAAE,MAAM,eAAe,CAAC;AA2C/E,SAAS,QAAQ,CAAC,CAAW;IAC3B,OAAO,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,IAAI,EAAE,CAAC;AAC3C,CAAC;AAED,SAAS,WAAW,CAAC,IAAkB;IACrC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;IAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAChD,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,aAAa,CAAC,IAAkB;IACvC,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC;IACtB,IAAI,CAAC,CAAC;QAAE,OAAO,SAAS,CAAC;IACzB,OAAO,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,KAAK,IAAI,SAAS,CAAC;AACxC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,6BAA6B,CAAC,IAAkB;IAC9D,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/C,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClB,OAAO,KAAK,CAAC;IACf,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAkB,EAAE,QAAgB;IAClE,OAAO,IAAI,CAAC,WAAW,EAAE,IAAI,IAAI,QAAQ,CAAC;AAC5C,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,mCAAmC,CACvD,KAA8B,EAC9B,OAAiC;IAEjC,MAAM,EACJ,QAAQ,EACR,eAAe,EACf,sBAAsB,GAAG,MAAM,EAC/B,oBAAoB,GAAG,SAAS,EAChC,KAAK,EACL,UAAU,GACX,GAAG,OAAO,CAAC;IAEZ,MAAM,WAAW,GAAG,KAAK,EAAE,sBAAsB,IAAI,6BAA6B,CAAC;IACnF,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAwB,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IACtD,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC;IAE/B,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,OAAO,EAAE,CAAC;QACV,MAAM,WAAW,GAAG,sBAAsB,CAAC,IAAI,EAAE,sBAAsB,CAAC,CAAC;QACzE,IAAI,WAAW,GAAuB,IAAI,CAAC;QAC3C,IAAI,CAAC;YACH,WAAW,GAAG,MAAM,QAAQ,CAAC,YAAY,CAAC,SAAS,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,CAAC;QACnF,CAAC;QAAC,MAAM,CAAC;YACP,WAAW,GAAG,IAAI,CAAC;QACrB,CAAC;QAED,IAAI,SAAS,GAAG,EAA
E,CAAC;QACnB,IAAI,WAAW,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,WAAW,EAAE;gBAC9C,GAAG,eAAe;gBAClB,cAAc,EAAE,WAAW,CAAC,OAAO;aACpC,CAAC,CAAC;YACH,SAAS,GAAG,wBAAwB,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;QACpE,CAAC;QAED,IAAI,KAAK,EAAE,gBAAgB,EAAE,CAAC;YAC5B,SAAS,GAAG,KAAK,CAAC,gBAAgB,CAAC,IAAI,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QACnE,CAAC;QAED,MAAM,cAAc,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAEtF,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;QAEvC,MAAM,QAAQ,GAA4B;YACxC,YAAY,EAAE,SAAS;YACvB,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,GAAG,CAAC,cAAc,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,GAAG,cAAc,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACvF,CAAC;QAEF,MAAM,QAAQ,GAAG,KAAK,EAAE,aAAa;YACnC,CAAC,CAAC,EAAE,GAAG,QAAQ,EAAE,GAAG,KAAK,CAAC,aAAa,CAAC,IAAI,EAAE,SAAS,EAAE,WAAW,CAAC,EAAE;YACvE,CAAC,CAAC,QAAQ,CAAC;QAEb,OAAO,CAAC,IAAI,CAAC;YACX,EAAE,EAAE,IAAI,CAAC,EAAE;YACX,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,WAAW,EAAE,oBAAoB;YACjC,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE;YACpC,OAAO,EAAE,SAAS;YAClB,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,SAAS;YAC5B,IAAI,EAAE,QAAQ;YACd,MAAM,EAAE,UAAU;YAClB,QAAQ;SACT,CAAC,CAAC;QAEH,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,UAAU,CAAC,OAAO,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;QAClE,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"content-extractor.d.ts","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,wBAAwB,EAAE,MAAM,8BAA8B,CAAC;AAY1F;;;GAGG;AACH,wBAAgB,sBAAsB,CAAC,WAAW,EAAE,MAAM,EAAE,iBAAiB,EAAE,MAAM,GAAG,MAAM,CAE7F;AAuCD;;;;;;GAMG;AACH,wBAAgB,uBAAuB,CACrC,WAAW,EAAE,WAAW,EACxB,gBAAgB,EAAE,wBAAwB,EAC1C,gBAAgB,EAAE,MAAM,GACvB,MAAM,EAAE,CA8BV;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,GAAG,MAAM,CAE7E"}
1
+ {"version":3,"file":"content-extractor.d.ts","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,wBAAwB,EAAE,MAAM,8BAA8B,CAAC;AAa1F;;;GAGG;AACH,wBAAgB,sBAAsB,CAAC,WAAW,EAAE,MAAM,EAAE,iBAAiB,EAAE,MAAM,GAAG,MAAM,CAE7F;AAED;;;;;;GAMG;AACH,wBAAgB,uBAAuB,CACrC,WAAW,EAAE,WAAW,EACxB,gBAAgB,EAAE,wBAAwB,EAC1C,gBAAgB,EAAE,MAAM,GACvB,MAAM,EAAE,CA8BV;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,GAAG,MAAM,CAE7E"}
@@ -1,5 +1,6 @@
1
1
  import { cleanMarkdownText, MarkdownConverter } from '@se-studio/markdown-renderer';
2
2
  import { debugLog } from '../debug';
3
+ import { stripMarkdownToPlainText } from './readability';
3
4
  /** Upstash Search hard limit for total content per document. */
4
5
  const UPSTASH_CONTENT_LIMIT = 4096;
5
6
  /** Buffer for JSON key names and structural overhead in the content object. */
@@ -12,33 +13,6 @@ const CHUNK_OVERLAP = 300;
12
13
  export function calculateMaxBodyLength(titleLength, descriptionLength) {
13
14
  return Math.max(0, UPSTASH_CONTENT_LIMIT - titleLength - descriptionLength - JSON_OVERHEAD);
14
15
  }
15
- /**
16
- * Strips markdown syntax to produce plain text suitable for search indexing.
17
- * Removes headings markers, bold/italic markers, links, images, and frontmatter.
18
- */
19
- function stripMarkdown(md) {
20
- let text = md;
21
- // Remove YAML frontmatter
22
- text = text.replace(/^---[\s\S]*?---\n*/m, '');
23
- // Remove images: ![alt](url)
24
- text = text.replace(/!\[.*?\]\(.*?\)/g, '');
25
- // Convert links to text: [text](url) → text
26
- text = text.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
27
- // Remove heading markers: ### Heading → Heading
28
- text = text.replace(/^#{1,6}\s+/gm, '');
29
- // Remove bold/italic markers
30
- text = text.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
31
- text = text.replace(/_{1,3}([^_]+)_{1,3}/g, '$1');
32
- // Remove horizontal rules
33
- text = text.replace(/^[-*_]{3,}\s*$/gm, '');
34
- // Remove list markers
35
- text = text.replace(/^[\s]*[-*+]\s+/gm, '');
36
- text = text.replace(/^[\s]*\d+\.\s+/gm, '');
37
- // Collapse whitespace
38
- text = text.replace(/\n{3,}/g, '\n\n');
39
- text = text.trim();
40
- return text;
41
- }
42
16
  /**
43
17
  * Converts CMS content to plain text, then splits it into chunks that each
44
18
  * fit within `maxCharsPerChunk`, with `CHUNK_OVERLAP` characters of overlap
@@ -50,7 +24,7 @@ export function extractSearchableChunks(contentData, converterContext, maxCharsP
50
24
  const converter = new MarkdownConverter();
51
25
  const markdown = converter.convert(contentData, converterContext);
52
26
  debugLog('extractor', `Markdown length: ${markdown.length} chars`);
53
- const plainText = stripMarkdown(cleanMarkdownText(markdown));
27
+ const plainText = stripMarkdownToPlainText(cleanMarkdownText(markdown));
54
28
  debugLog('extractor', `Plain text length: ${plainText.length} chars (chunk limit ${maxCharsPerChunk})`);
55
29
  if (plainText.length <= maxCharsPerChunk) {
56
30
  return [plainText];
@@ -1 +1 @@
1
- {"version":3,"file":"content-extractor.js","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,8BAA8B,CAAC;AACpF,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAEpC,gEAAgE;AAChE,MAAM,qBAAqB,GAAG,IAAI,CAAC;AAEnC,+EAA+E;AAC/E,MAAM,aAAa,GAAG,GAAG,CAAC;AAE1B,MAAM,aAAa,GAAG,GAAG,CAAC;AAE1B;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CAAC,WAAmB,EAAE,iBAAyB;IACnF,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,qBAAqB,GAAG,WAAW,GAAG,iBAAiB,GAAG,aAAa,CAAC,CAAC;AAC9F,CAAC;AAED;;;GAGG;AACH,SAAS,aAAa,CAAC,EAAU;IAC/B,IAAI,IAAI,GAAG,EAAE,CAAC;IAEd,0BAA0B;IAC1B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,qBAAqB,EAAE,EAAE,CAAC,CAAC;IAE/C,6BAA6B;IAC7B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAE5C,4CAA4C;IAC5C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAC;IAEpD,gDAAgD;IAChD,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;IAExC,6BAA6B;IAC7B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAC;IACpD,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,sBAAsB,EAAE,IAAI,CAAC,CAAC;IAElD,0BAA0B;IAC1B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAE5C,sBAAsB;IACtB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAC5C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAE5C,sBAAsB;IACtB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IACvC,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAEnB,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,uBAAuB,CACrC,WAAwB,EACxB,gBAA0C,EAC1C,gBAAwB;IAExB,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,WAAW,EAAE,gBAAgB,CAAC,CAAC;IAClE,QAAQ,CAAC,WAAW,EAAE,oBAAoB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACnE,MAAM,SAAS,GAAG,aAAa,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC7D,QAAQ,CACN,WAAW,EACX,sBAAsB,SAAS,CAAC,MAAM,uBAAuB,gBAAgB,GAAG,CACjF,CAAC;IAEF,IAAI,SAAS,CAAC,MAAM,IAAI,gBAAgB,EAAE,CAAC;QACzC,OAAO,CAAC,SAAS,CAAC,CAAC;IACrB,CAAC;IACD,IAAI,gBAAgB,IAAI,aAAa,IAAI,gBAAgB,IAAI,CAAC,EAAE,CAAC;QAC/D,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,gBAAgB,EAAE,
CAAC,CAAC,CAAC,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,IAAI,GAAG,gBAAgB,GAAG,aAAa,CAAC;IAE9C,OAAO,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;QACjC,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,GAAG,gBAAgB,CAAC,CAAC;QACjE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,MAAM,IAAI,IAAI,CAAC;QACf,IAAI,MAAM,GAAG,aAAa,IAAI,SAAS,CAAC,MAAM;YAAE,MAAM;IACxD,CAAC;IAED,QAAQ,CAAC,WAAW,EAAE,cAAc,MAAM,CAAC,MAAM,oBAAoB,aAAa,GAAG,CAAC,CAAC;IACvF,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,WAAmB;IACnE,OAAO,iBAAiB,CAAC,GAAG,KAAK,KAAK,WAAW,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;AAC9D,CAAC"}
1
+ {"version":3,"file":"content-extractor.js","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,8BAA8B,CAAC;AACpF,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AACpC,OAAO,EAAE,wBAAwB,EAAE,MAAM,eAAe,CAAC;AAEzD,gEAAgE;AAChE,MAAM,qBAAqB,GAAG,IAAI,CAAC;AAEnC,+EAA+E;AAC/E,MAAM,aAAa,GAAG,GAAG,CAAC;AAE1B,MAAM,aAAa,GAAG,GAAG,CAAC;AAE1B;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CAAC,WAAmB,EAAE,iBAAyB;IACnF,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,qBAAqB,GAAG,WAAW,GAAG,iBAAiB,GAAG,aAAa,CAAC,CAAC;AAC9F,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,uBAAuB,CACrC,WAAwB,EACxB,gBAA0C,EAC1C,gBAAwB;IAExB,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,WAAW,EAAE,gBAAgB,CAAC,CAAC;IAClE,QAAQ,CAAC,WAAW,EAAE,oBAAoB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACnE,MAAM,SAAS,GAAG,wBAAwB,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;IACxE,QAAQ,CACN,WAAW,EACX,sBAAsB,SAAS,CAAC,MAAM,uBAAuB,gBAAgB,GAAG,CACjF,CAAC;IAEF,IAAI,SAAS,CAAC,MAAM,IAAI,gBAAgB,EAAE,CAAC;QACzC,OAAO,CAAC,SAAS,CAAC,CAAC;IACrB,CAAC;IACD,IAAI,gBAAgB,IAAI,aAAa,IAAI,gBAAgB,IAAI,CAAC,EAAE,CAAC;QAC/D,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,gBAAgB,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,IAAI,GAAG,gBAAgB,GAAG,aAAa,CAAC;IAE9C,OAAO,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;QACjC,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,GAAG,gBAAgB,CAAC,CAAC;QACjE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,MAAM,IAAI,IAAI,CAAC;QACf,IAAI,MAAM,GAAG,aAAa,IAAI,SAAS,CAAC,MAAM;YAAE,MAAM;IACxD,CAAC;IAED,QAAQ,CAAC,WAAW,EAAE,cAAc,MAAM,CAAC,MAAM,oBAAoB,aAAa,GAAG,CAAC,CAAC;IACvF,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,WAAmB;IACnE,OAAO,iBAAiB,CAAC,GAAG,KAAK,KAAK,WAAW,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;AAC9D,CAAC"}
@@ -1,5 +1,8 @@
1
+ export type { ArticleIndexableHooks, ArticleMarkdownConverterParams, IndexArticleLinksOptions, } from './article-indexable';
2
+ export { defaultShouldIndexArticleLink, indexArticleLinksToIndexableContent, } from './article-indexable';
1
3
  export { calculateMaxBodyLength, extractSearchableChunks, extractShallowText, } from './content-extractor';
2
4
  export { allChunkIds, baseEntryId, buildSearchDocuments, chunkDocumentId, shouldIndex, } from './document-builder';
5
+ export { calculateReadingTime, DEFAULT_WORDS_PER_MINUTE, stripMarkdownToPlainText, } from './readability';
3
6
  export { rebuildSearchIndex } from './rebuild';
4
7
  export { rebuildSearchIndexWordPress } from './rebuild-wordpress';
5
8
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,WAAW,EACX,WAAW,EACX,oBAAoB,EACpB,eAAe,EACf,WAAW,GACZ,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,EAAE,2BAA2B,EAAE,MAAM,qBAAqB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAAA,YAAY,EACV,qBAAqB,EACrB,8BAA8B,EAC9B,wBAAwB,GACzB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,6BAA6B,EAC7B,mCAAmC,GACpC,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,WAAW,EACX,WAAW,EACX,oBAAoB,EACpB,eAAe,EACf,WAAW,GACZ,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EACL,oBAAoB,EACpB,wBAAwB,EACxB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,EAAE,2BAA2B,EAAE,MAAM,qBAAqB,CAAC"}
@@ -1,5 +1,7 @@
1
+ export { defaultShouldIndexArticleLink, indexArticleLinksToIndexableContent, } from './article-indexable';
1
2
  export { calculateMaxBodyLength, extractSearchableChunks, extractShallowText, } from './content-extractor';
2
3
  export { allChunkIds, baseEntryId, buildSearchDocuments, chunkDocumentId, shouldIndex, } from './document-builder';
4
+ export { calculateReadingTime, DEFAULT_WORDS_PER_MINUTE, stripMarkdownToPlainText, } from './readability';
3
5
  export { rebuildSearchIndex } from './rebuild';
4
6
  export { rebuildSearchIndexWordPress } from './rebuild-wordpress';
5
7
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,WAAW,EACX,WAAW,EACX,oBAAoB,EACpB,eAAe,EACf,WAAW,GACZ,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,EAAE,2BAA2B,EAAE,MAAM,qBAAqB,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAKA,OAAO,EACL,6BAA6B,EAC7B,mCAAmC,GACpC,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,WAAW,EACX,WAAW,EACX,oBAAoB,EACpB,eAAe,EACf,WAAW,GACZ,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EACL,oBAAoB,EACpB,wBAAwB,EACxB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,EAAE,2BAA2B,EAAE,MAAM,qBAAqB,CAAC"}
@@ -0,0 +1,14 @@
1
+ /** Default reading speed for {@link calculateReadingTime}. */
2
+ export declare const DEFAULT_WORDS_PER_MINUTE = 200;
3
+ /**
4
+ * Estimates reading time in minutes from plain text (e.g. after markdown stripping).
5
+ * The result is rounded to the nearest minute and is never less than `1`, so empty or whitespace-only input returns `1` (same convention as marketing-site reading time).
6
+ */
7
+ export declare function calculateReadingTime(text: string, wordsPerMinute?: number): number;
8
+ /**
9
+ * Strip markdown syntax leaving only readable words for search indexing or word counts.
10
+ * Removes frontmatter, heading markers, bold/italic markers, links (keeping text), images,
11
+ * fenced code blocks (entirely), inline-code backticks (keeping the code), blockquote markers, list markers, and horizontal rules.
12
+ */
13
+ export declare function stripMarkdownToPlainText(markdown: string): string;
14
+ //# sourceMappingURL=readability.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/indexing/readability.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAC9D,eAAO,MAAM,wBAAwB,MAAM,CAAC;AAE5C;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,MAAM,EACZ,cAAc,GAAE,MAAiC,GAChD,MAAM,CAGR;AAED;;;;GAIG;AACH,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CA8BjE"}
@@ -0,0 +1,45 @@
1
+ /** Default reading speed for {@link calculateReadingTime}. */
2
+ export const DEFAULT_WORDS_PER_MINUTE = 200;
3
+ /**
4
+ * Estimates reading time in minutes from plain text (e.g. after markdown stripping).
5
+ * The result is rounded to the nearest minute and is never less than `1`, so empty or whitespace-only input returns `1` (same convention as marketing-site reading time).
6
+ */
7
+ export function calculateReadingTime(text, wordsPerMinute = DEFAULT_WORDS_PER_MINUTE) {
8
+ const words = text.trim().split(/\s+/).filter(Boolean);
9
+ return Math.max(1, Math.round(words.length / wordsPerMinute));
10
+ }
11
+ /**
12
+ * Strip markdown syntax leaving only readable words for search indexing or word counts.
13
+ * Removes frontmatter, heading markers, bold/italic markers, links (keeping text), images,
14
+ * fenced code blocks (entirely), inline-code backticks (keeping the code), blockquote markers, list markers, and horizontal rules.
15
+ */
16
+ export function stripMarkdownToPlainText(markdown) {
17
+ return (markdown
18
+ // Remove YAML frontmatter block
19
+ .replace(/^---[\s\S]*?---\n?/, '')
20
+ // Fenced blocks before inline backticks (avoids corrupting ``` fences)
21
+ .replace(/```[\s\S]*?```/g, '')
22
+ // Remove images before links (order matters)
23
+ .replace(/!\[[^\]]*\]\([^)]+\)/g, '')
24
+ // Remove links, keep display text
25
+ .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')
26
+ // Remove heading markers
27
+ .replace(/^#{1,6}\s+/gm, '')
28
+ // Remove bold/italic (triple, double, single asterisk/underscore)
29
+ .replace(/\*{1,3}([^*\n]*)\*{1,3}/g, '$1')
30
+ .replace(/_{1,3}([^_\n]*)_{1,3}/g, '$1')
31
+ // Remove inline code backticks
32
+ .replace(/`([^`]+)`/g, '$1')
33
+ // Remove blockquote markers
34
+ .replace(/^>\s*/gm, '')
35
+ // Remove unordered list markers
36
+ .replace(/^[-*+]\s+/gm, '')
37
+ // Remove ordered list markers
38
+ .replace(/^\d+\.\s+/gm, '')
39
+ // Remove horizontal rules
40
+ .replace(/^[-*_]{3,}\s*$/gm, '')
41
+ // Collapse excess whitespace
42
+ .replace(/\n{3,}/g, '\n\n')
43
+ .trim());
44
+ }
45
+ //# sourceMappingURL=readability.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"readability.js","sourceRoot":"","sources":["../../src/indexing/readability.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAC9D,MAAM,CAAC,MAAM,wBAAwB,GAAG,GAAG,CAAC;AAE5C;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAClC,IAAY,EACZ,iBAAyB,wBAAwB;IAEjD,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACvD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,cAAc,CAAC,CAAC,CAAC;AAChE,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,wBAAwB,CAAC,QAAgB;IACvD,OAAO,CACL,QAAQ;QACN,gCAAgC;SAC/B,OAAO,CAAC,oBAAoB,EAAE,EAAE,CAAC;QAClC,uEAAuE;SACtE,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC;QAC/B,6CAA6C;SAC5C,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC;QACrC,kCAAkC;SACjC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;QACxC,yBAAyB;SACxB,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC;QAC5B,kEAAkE;SACjE,OAAO,CAAC,0BAA0B,EAAE,IAAI,CAAC;SACzC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;QACxC,+BAA+B;SAC9B,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC;QAC5B,4BAA4B;SAC3B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;QACvB,gCAAgC;SAC/B,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;QAC3B,8BAA8B;SAC7B,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;QAC3B,0BAA0B;SACzB,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAChC,6BAA6B;SAC5B,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CACV,CAAC;AACJ,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@se-studio/search",
3
- "version": "1.0.36",
3
+ "version": "1.0.38",
4
4
  "description": "AI-powered site search with Upstash Search for Next.js marketing sites",
5
5
  "repository": {
6
6
  "type": "git",
@@ -43,22 +43,22 @@
43
43
  ],
44
44
  "dependencies": {
45
45
  "@upstash/search": "^0.1.7",
46
- "@se-studio/contentful-rest-api": "1.0.128",
47
- "@se-studio/core-data-types": "1.0.125",
48
- "@se-studio/markdown-renderer": "1.0.89",
49
- "@se-studio/wordpress-rest-api": "1.0.6"
46
+ "@se-studio/contentful-rest-api": "1.0.132",
47
+ "@se-studio/core-data-types": "1.0.127",
48
+ "@se-studio/markdown-renderer": "1.0.91",
49
+ "@se-studio/wordpress-rest-api": "1.0.8"
50
50
  },
51
51
  "peerDependencies": {
52
52
  "next": ">=15.5.0",
53
53
  "react": "^19.0.0"
54
54
  },
55
55
  "devDependencies": {
56
- "@biomejs/biome": "^2.4.10",
57
- "@types/node": "^22.19.15",
56
+ "@biomejs/biome": "^2.4.12",
57
+ "@types/node": "^22.19.17",
58
58
  "@types/react": "^19.2.14",
59
- "next": "^15.5.14",
59
+ "next": "^15.5.15",
60
60
  "typescript": "^6.0.2",
61
- "vitest": "^4.1.2"
61
+ "vitest": "^4.1.4"
62
62
  },
63
63
  "scripts": {
64
64
  "build": "tsc --project tsconfig.build.json",