@se-studio/search 1.0.36 → 1.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/README.md +25 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/indexable-content.d.ts +24 -0
- package/dist/indexable-content.d.ts.map +1 -0
- package/dist/indexable-content.js +2 -0
- package/dist/indexable-content.js.map +1 -0
- package/dist/indexing/article-indexable.d.ts +48 -0
- package/dist/indexing/article-indexable.d.ts.map +1 -0
- package/dist/indexing/article-indexable.js +95 -0
- package/dist/indexing/article-indexable.js.map +1 -0
- package/dist/indexing/content-extractor.d.ts.map +1 -1
- package/dist/indexing/content-extractor.js +2 -28
- package/dist/indexing/content-extractor.js.map +1 -1
- package/dist/indexing/index.d.ts +3 -0
- package/dist/indexing/index.d.ts.map +1 -1
- package/dist/indexing/index.js +2 -0
- package/dist/indexing/index.js.map +1 -1
- package/dist/indexing/readability.d.ts +14 -0
- package/dist/indexing/readability.d.ts.map +1 -0
- package/dist/indexing/readability.js +45 -0
- package/dist/indexing/readability.js.map +1 -0
- package/package.json +9 -9
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,16 @@
|
|
|
1
1
|
# @se-studio/search
|
|
2
2
|
|
|
3
|
+
## 1.0.37
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- Version bump: patch for changed packages
|
|
8
|
+
- Updated dependencies
|
|
9
|
+
- @se-studio/contentful-rest-api@1.0.131
|
|
10
|
+
- @se-studio/core-data-types@1.0.126
|
|
11
|
+
- @se-studio/markdown-renderer@1.0.90
|
|
12
|
+
- @se-studio/wordpress-rest-api@1.0.7
|
|
13
|
+
|
|
3
14
|
## 1.0.36
|
|
4
15
|
|
|
5
16
|
### Patch Changes
|
package/README.md
CHANGED
|
@@ -198,6 +198,30 @@ documentTransformer?: (doc: SearchDocument, contentData: ContentData) => SearchD
|
|
|
198
198
|
|
|
199
199
|
The transformer runs on every chunk of a multi-chunk entry, so a 3-chunk page will call it 3 times.
|
|
200
200
|
|
|
201
|
+
## Canonical URLs and custom indexes (`IIndexableContent`)
|
|
202
|
+
|
|
203
|
+
For **Upstash Search**, each `SearchDocument` already carries **`metadata.href`** (full path), populated from the CMS model’s `href` in `buildSearchDocuments` — do not infer URLs from `slug` + content type at query time.
|
|
204
|
+
|
|
205
|
+
If you maintain a **separate** full-text index (e.g. Lunr, blob JSON), use the shared **`IIndexableContent`** type from `@se-studio/search`. It requires:
|
|
206
|
+
|
|
207
|
+
- **`href`** – full canonical path as used in `<a href>` (e.g. `/insights/my-article`)
|
|
208
|
+
- **`content`**, **`title`**, **`id`**, **`contentType`** (string label), **`metadata`**
|
|
209
|
+
|
|
210
|
+
Optional **`slug`** remains useful for tokenisation; do not rely on it alone to build user-facing links.
|
|
211
|
+
|
|
212
|
+
Helpers:
|
|
213
|
+
|
|
214
|
+
- **`stripMarkdownToPlainText`**, **`calculateReadingTime`** – `@se-studio/search/indexing`
|
|
215
|
+
- **`indexArticleLinksToIndexableContent`** – builds `IIndexableContent[]` from `IArticleLink[]` using **one `MarkdownExporter.fetchContent()` call per article** (see below). Pass **`markdownContext`** (`config`, `siteConfig`, `urlCalculators`, optional `customConverters`); `contentContext` is supplied per entry from each `fetchContent` result.
|
|
216
|
+
|
|
217
|
+
## Contentful indexing hazards (`includes` truncation)
|
|
218
|
+
|
|
219
|
+
The Contentful REST API can **silently truncate** `includes.Entry` and `includes.Asset` when a single response is large — for example, batching many entries in **one** `getEntries` call with **`include`** and walking `includes` to resolve rich text or linked entries often yields **incomplete** trees. Symptoms include missing body text or featured images in the index, with **no error** from the API.
|
|
220
|
+
|
|
221
|
+
**Recommended default:** fetch **each** entry individually via **`MarkdownExporter.fetchContent()`** (same pattern as `rebuildSearchIndex` in this package: one call per link/slug). That uses the normal per-entry REST/converter path instead of a multi-entry batched query.
|
|
222
|
+
|
|
223
|
+
If you must batch requests, you need a strategy that does not depend on unbounded resolved includes in one response (e.g. lower depth, smaller batches, or follow-up fetches) — the package cannot fix API limits from TypeScript alone.
|
|
224
|
+
|
|
201
225
|
## Architecture
|
|
202
226
|
|
|
203
227
|
- **Single Upstash database, two indexes**: `published` and `preview`
|
|
@@ -213,7 +237,7 @@ The transformer runs on every chunk of a multi-chunk entry, so a 3-chunk page wi
|
|
|
213
237
|
|--------|---------|
|
|
214
238
|
| `@se-studio/search` | Types only |
|
|
215
239
|
| `@se-studio/search/client` | `createSearchClient()` |
|
|
216
|
-
| `@se-studio/search/indexing` | `rebuildSearchIndex()`, `
|
|
240
|
+
| `@se-studio/search/indexing` | `rebuildSearchIndex()`, `buildSearchDocuments`, `indexArticleLinksToIndexableContent`, `stripMarkdownToPlainText`, `calculateReadingTime`, … |
|
|
217
241
|
| `@se-studio/search/webhook` | `createSearchWebhookHandler()` |
|
|
218
242
|
| `@se-studio/search/api` | `createSearchApiHandler()`, `createRebuildApiHandler()` |
|
|
219
243
|
| `@se-studio/search/hooks` | `useSearch()` |
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
export type { ContentData } from '@se-studio/markdown-renderer';
|
|
2
|
+
export type { IIndexableContent } from './indexable-content';
|
|
2
3
|
export type { ContentTypeIndexConfig, RebuildResult, SearchableContentType, SearchConfig, SearchDocument, SearchDocumentMetadata, SearchIndexConfig, SearchIndexingConfig, SearchOptions, SearchResponse, SearchResult, } from './types';
|
|
3
4
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,YAAY,EAAE,WAAW,EAAE,MAAM,8BAA8B,CAAC;AAChE,YAAY,EACV,sBAAsB,EACtB,aAAa,EACb,qBAAqB,EACrB,YAAY,EACZ,cAAc,EACd,sBAAsB,EACtB,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,cAAc,EACd,YAAY,GACb,MAAM,SAAS,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,YAAY,EAAE,WAAW,EAAE,MAAM,8BAA8B,CAAC;AAChE,YAAY,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAC7D,YAAY,EACV,sBAAsB,EACtB,aAAa,EACb,qBAAqB,EACrB,YAAY,EACZ,cAAc,EACd,sBAAsB,EACtB,iBAAiB,EACjB,oBAAoB,EACpB,aAAa,EACb,cAAc,EACd,YAAY,GACb,MAAM,SAAS,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Flat document shape for building non-Upstash indexes (e.g. Lunr, blob-stored indexes).
|
|
3
|
+
* Use {@link SearchDocument} / {@link SearchDocumentMetadata} for Upstash Search instead.
|
|
4
|
+
*
|
|
5
|
+
* **href** must be the full path as used in `<a href>` (e.g. `/insights/my-article`), not a
|
|
6
|
+
* bare slug — URL topology must not be inferred from `slug` + `contentType` at query time.
|
|
7
|
+
*/
|
|
8
|
+
export interface IIndexableContent {
|
|
9
|
+
id: string;
|
|
10
|
+
/** Full canonical path for links, e.g. `/insights/my-article` */
|
|
11
|
+
href: string;
|
|
12
|
+
/** Logical type label (app-defined); e.g. `article`, `page`, `person` */
|
|
13
|
+
contentType: string;
|
|
14
|
+
title: string;
|
|
15
|
+
/** Plain-text body used for full-text indexing */
|
|
16
|
+
content: string;
|
|
17
|
+
/** Optional bare slug for tokenisation or legacy callers */
|
|
18
|
+
slug?: string;
|
|
19
|
+
date?: string;
|
|
20
|
+
tags?: string[];
|
|
21
|
+
author?: string;
|
|
22
|
+
metadata: Record<string, unknown>;
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=indexable-content.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"indexable-content.d.ts","sourceRoot":"","sources":["../src/indexable-content.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,MAAM,WAAW,iBAAiB;IAChC,EAAE,EAAE,MAAM,CAAC;IACX,iEAAiE;IACjE,IAAI,EAAE,MAAM,CAAC;IACb,yEAAyE;IACzE,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,OAAO,EAAE,MAAM,CAAC;IAChB,4DAA4D;IAC5D,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"indexable-content.js","sourceRoot":"","sources":["../src/indexable-content.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { IArticleLink } from '@se-studio/core-data-types';
|
|
2
|
+
import { type ContentData, type MarkdownConverterContext, type MarkdownExporter } from '@se-studio/markdown-renderer';
|
|
3
|
+
import type { IIndexableContent } from '../indexable-content';
|
|
4
|
+
/** Same fields as {@link MarkdownConverterContext} except `contentContext` (supplied per fetch). */
|
|
5
|
+
export type ArticleMarkdownConverterParams = Omit<MarkdownConverterContext, 'contentContext'>;
|
|
6
|
+
export interface ArticleIndexableHooks {
|
|
7
|
+
/**
|
|
8
|
+
* Return false to skip a link. Defaults to {@link defaultShouldIndexArticleLink}
|
|
9
|
+
* when omitted.
|
|
10
|
+
*/
|
|
11
|
+
shouldIndexArticleLink?(link: IArticleLink): boolean;
|
|
12
|
+
/**
|
|
13
|
+
* Override plain body after markdown conversion (e.g. custom truncation).
|
|
14
|
+
*/
|
|
15
|
+
mapPlainTextBody?(link: IArticleLink, plainText: string, contentData: ContentData | null): string;
|
|
16
|
+
/**
|
|
17
|
+
* Build metadata beyond defaults (`originalType`, `readingTime`, `link`).
|
|
18
|
+
*/
|
|
19
|
+
buildMetadata?(link: IArticleLink, plainTextBody: string, contentData: ContentData | null): Record<string, unknown>;
|
|
20
|
+
}
|
|
21
|
+
export interface IndexArticleLinksOptions {
|
|
22
|
+
exporter: MarkdownExporter;
|
|
23
|
+
/**
|
|
24
|
+
* Passed into each `MarkdownConverter.convert()` together with `contentData.context`
|
|
25
|
+
* from `fetchContent` (the entry-specific context is required for correct markdown).
|
|
26
|
+
*/
|
|
27
|
+
markdownContext: ArticleMarkdownConverterParams;
|
|
28
|
+
/**
|
|
29
|
+
* Passed to `fetchContent('article', slug, { articleType })` when `link.articleType?.slug`
|
|
30
|
+
* is missing — mirrors {@link rebuildSearchIndex} (`blog` fallback in rebuild).
|
|
31
|
+
*/
|
|
32
|
+
defaultArticleTypeSlug?: string;
|
|
33
|
+
/** Label stored on each `IIndexableContent.contentType` (default: `article`). */
|
|
34
|
+
indexableContentType?: string;
|
|
35
|
+
hooks?: ArticleIndexableHooks;
|
|
36
|
+
onProgress?: (current: number, total: number, itemName: string) => Promise<void>;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Default article link filter: internal paths only, no download articles.
|
|
40
|
+
*/
|
|
41
|
+
export declare function defaultShouldIndexArticleLink(link: IArticleLink): boolean;
|
|
42
|
+
/**
|
|
43
|
+
* Builds {@link IIndexableContent} for each article link using **one**
|
|
44
|
+
* `MarkdownExporter.fetchContent()` call per article (avoids Contentful `includes` truncation
|
|
45
|
+
* from batched entry queries).
|
|
46
|
+
*/
|
|
47
|
+
export declare function indexArticleLinksToIndexableContent(links: readonly IArticleLink[], options: IndexArticleLinksOptions): Promise<IIndexableContent[]>;
|
|
48
|
+
//# sourceMappingURL=article-indexable.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"article-indexable.d.ts","sourceRoot":"","sources":["../../src/indexing/article-indexable.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAY,MAAM,4BAA4B,CAAC;AACzE,OAAO,EACL,KAAK,WAAW,EAGhB,KAAK,wBAAwB,EAC7B,KAAK,gBAAgB,EACtB,MAAM,8BAA8B,CAAC;AACtC,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAG9D,oGAAoG;AACpG,MAAM,MAAM,8BAA8B,GAAG,IAAI,CAAC,wBAAwB,EAAE,gBAAgB,CAAC,CAAC;AAE9F,MAAM,WAAW,qBAAqB;IACpC;;;OAGG;IACH,sBAAsB,CAAC,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC;IACrD;;OAEG;IACH,gBAAgB,CAAC,CAAC,IAAI,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,GAAG,IAAI,GAAG,MAAM,CAAC;IAClG;;OAEG;IACH,aAAa,CAAC,CACZ,IAAI,EAAE,YAAY,EAClB,aAAa,EAAE,MAAM,EACrB,WAAW,EAAE,WAAW,GAAG,IAAI,GAC9B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5B;AAED,MAAM,WAAW,wBAAwB;IACvC,QAAQ,EAAE,gBAAgB,CAAC;IAC3B;;;OAGG;IACH,eAAe,EAAE,8BAA8B,CAAC;IAChD;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC,iFAAiF;IACjF,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,KAAK,CAAC,EAAE,qBAAqB,CAAC;IAC9B,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CAClF;AAkBD;;GAEG;AACH,wBAAgB,6BAA6B,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAQzE;AAMD;;;;GAIG;AACH,wBAAsB,mCAAmC,CACvD,KAAK,EAAE,SAAS,YAAY,EAAE,EAC9B,OAAO,EAAE,wBAAwB,GAChC,OAAO,CAAC,iBAAiB,EAAE,CAAC,CA0E9B"}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { cleanMarkdownText, MarkdownConverter, } from '@se-studio/markdown-renderer';
|
|
2
|
+
import { calculateReadingTime, stripMarkdownToPlainText } from './readability';
|
|
3
|
+
/**
 * Returns the display label of a tag: the first of `title`, `name`, `slug` that is
 * neither `null` nor `undefined`, or `''` when none of them is set.
 */
function tagTitle(t) {
    const title = t.title;
    if (title != null) {
        return title;
    }
    const name = t.name;
    if (name != null) {
        return name;
    }
    const slug = t.slug;
    if (slug != null) {
        return slug;
    }
    return '';
}
|
|
6
|
+
/**
 * Collects the labels of a link's tags (via `tagTitle`), dropping empty labels.
 * Falls back to `['article']` when the link has no tags or none yields a label.
 */
function defaultTags(link) {
    const labels = [];
    (link.tags ?? []).forEach((tag) => {
        const label = tagTitle(tag);
        if (label) {
            labels.push(label);
        }
    });
    if (labels.length === 0) {
        return ['article'];
    }
    return labels;
}
|
|
11
|
+
/**
 * Returns the author label of a link: `author.name`, else `author.title`.
 * Yields `undefined` when the link has no author or neither field is set.
 */
function defaultAuthor(link) {
    const { author } = link;
    if (author) {
        const label = author.name ?? author.title;
        // Normalise a `null` label to `undefined`.
        return label ?? undefined;
    }
    return undefined;
}
|
|
17
|
+
/**
 * Default article link filter: internal paths only, no download articles.
 *
 * A link is skipped when:
 * - its `href` is missing or empty;
 * - its `href` points off-site, i.e. it starts with a URI scheme (`https:`, `HTTP:`,
 *   `mailto:`, `tel:`, ...) or is protocol-relative (`//host/path`);
 * - `link.download` is truthy.
 *
 * @param link Article link to test; only `href` and `download` are read.
 * @returns `true` when the link should be indexed, otherwise `false`.
 */
export function defaultShouldIndexArticleLink(link) {
    const href = link.href;
    if (!href) {
        return false;
    }
    // Scheme syntax follows RFC 3986 (ALPHA followed by ALPHA / DIGIT / "+" / "-" / ".").
    // Checking for a scheme rather than the literal prefix "http" also rejects
    // mailto:/tel: links and upper-case schemes, and no longer rejects an internal
    // path that merely begins with the letters "http".
    if (/^(?:[a-z][a-z0-9+.-]*:|\/\/)/i.test(href)) {
        return false;
    }
    return !link.download;
}
|
|
29
|
+
/**
 * Returns the slug of the link's article type, or `fallback` when the link has no
 * article type or the article type has no slug (`null`/`undefined`).
 */
function resolveArticleTypeSlug(link, fallback) {
    const { articleType } = link;
    if (articleType == null) {
        return fallback;
    }
    const slug = articleType.slug;
    return slug == null ? fallback : slug;
}
|
|
32
|
+
/**
 * Builds {@link IIndexableContent} for each article link using **one**
 * `MarkdownExporter.fetchContent()` call per article (avoids Contentful `includes` truncation
 * from batched entry queries).
 *
 * Links are processed sequentially. Indexing is best-effort per link: when `fetchContent`
 * throws or returns nothing, the link is still emitted with an empty `content` (unless
 * `hooks.mapPlainTextBody` supplies one). A thrown error is logged as a warning so the
 * gap in the index is visible instead of silent.
 *
 * @param links Article links to index; filtered through `hooks.shouldIndexArticleLink`
 *   (default: {@link defaultShouldIndexArticleLink}).
 * @param options Exporter, markdown converter context, optional hooks and progress callback.
 *   `defaultArticleTypeSlug` defaults to `'blog'`, `indexableContentType` to `'article'`.
 * @returns One indexable document per link that passed the filter, in input order.
 */
export async function indexArticleLinksToIndexableContent(links, options) {
    const { exporter, markdownContext, defaultArticleTypeSlug = 'blog', indexableContentType = 'article', hooks, onProgress, } = options;
    const shouldIndex = hooks?.shouldIndexArticleLink ?? defaultShouldIndexArticleLink;
    const converter = new MarkdownConverter();
    const results = [];
    // Filter up front so `total` reported to onProgress counts only indexed links.
    const indexable = links.filter((l) => shouldIndex(l));
    const total = indexable.length;
    let current = 0;
    for (const link of indexable) {
        current++;
        const articleType = resolveArticleTypeSlug(link, defaultArticleTypeSlug);
        let contentData = null;
        try {
            contentData = await exporter.fetchContent('article', link.slug, { articleType });
        }
        catch (error) {
            // Best-effort: keep indexing the remaining links, but report the failure.
            console.warn(`[@se-studio/search] fetchContent failed for article "${link.slug}"; indexing it with empty content.`, error);
            contentData = null;
        }
        let plainBody = '';
        if (contentData) {
            // The entry-specific context comes from this fetch result, not from the caller.
            const markdown = converter.convert(contentData, {
                ...markdownContext,
                contentContext: contentData.context,
            });
            plainBody = stripMarkdownToPlainText(cleanMarkdownText(markdown));
        }
        if (hooks?.mapPlainTextBody) {
            plainBody = hooks.mapPlainTextBody(link, plainBody, contentData);
        }
        // No reading time for an empty body (calculateReadingTime would report 1 minute).
        const readingMinutes = plainBody.trim() ? calculateReadingTime(plainBody) : undefined;
        const tagNames = defaultTags(link);
        const authorName = defaultAuthor(link);
        const baseMeta = {
            originalType: 'article',
            link: link.href,
            ...(readingMinutes !== undefined ? { readingTime: `${readingMinutes} min read` } : {}),
        };
        // Hook-provided keys override the defaults with the same name.
        const metadata = hooks?.buildMetadata
            ? { ...baseMeta, ...hooks.buildMetadata(link, plainBody, contentData) }
            : baseMeta;
        const title = link.title ?? link.text ?? '';
        results.push({
            id: link.id,
            href: link.href,
            contentType: indexableContentType,
            title,
            content: plainBody,
            slug: link.slug,
            date: link.date ?? undefined,
            tags: tagNames,
            author: authorName,
            metadata,
        });
        if (onProgress) {
            await onProgress(current, total, title);
        }
    }
    return results;
}
|
|
95
|
+
//# sourceMappingURL=article-indexable.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"article-indexable.js","sourceRoot":"","sources":["../../src/indexing/article-indexable.ts"],"names":[],"mappings":"AACA,OAAO,EAEL,iBAAiB,EACjB,iBAAiB,GAGlB,MAAM,8BAA8B,CAAC;AAEtC,OAAO,EAAE,oBAAoB,EAAE,wBAAwB,EAAE,MAAM,eAAe,CAAC;AA2C/E,SAAS,QAAQ,CAAC,CAAW;IAC3B,OAAO,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,IAAI,EAAE,CAAC;AAC3C,CAAC;AAED,SAAS,WAAW,CAAC,IAAkB;IACrC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;IAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAChD,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,aAAa,CAAC,IAAkB;IACvC,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC;IACtB,IAAI,CAAC,CAAC;QAAE,OAAO,SAAS,CAAC;IACzB,OAAO,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,KAAK,IAAI,SAAS,CAAC;AACxC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,6BAA6B,CAAC,IAAkB;IAC9D,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/C,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClB,OAAO,KAAK,CAAC;IACf,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAkB,EAAE,QAAgB;IAClE,OAAO,IAAI,CAAC,WAAW,EAAE,IAAI,IAAI,QAAQ,CAAC;AAC5C,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,mCAAmC,CACvD,KAA8B,EAC9B,OAAiC;IAEjC,MAAM,EACJ,QAAQ,EACR,eAAe,EACf,sBAAsB,GAAG,MAAM,EAC/B,oBAAoB,GAAG,SAAS,EAChC,KAAK,EACL,UAAU,GACX,GAAG,OAAO,CAAC;IAEZ,MAAM,WAAW,GAAG,KAAK,EAAE,sBAAsB,IAAI,6BAA6B,CAAC;IACnF,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAwB,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IACtD,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC;IAE/B,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,OAAO,EAAE,CAAC;QACV,MAAM,WAAW,GAAG,sBAAsB,CAAC,IAAI,EAAE,sBAAsB,CAAC,CAAC;QACzE,IAAI,WAAW,GAAuB,IAAI,CAAC;QAC3C,IAAI,CAAC;YACH,WAAW,GAAG,MAAM,QAAQ,CAAC,YAAY,CAAC,SAAS,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,CAAC;QACnF,CAAC;QAAC,MAAM,CAAC;YACP,WAAW,GAAG,IAAI,CAAC;QACrB,CAAC;QAED,IAAI,SAAS,GAAG,EAAE,
CAAC;QACnB,IAAI,WAAW,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,WAAW,EAAE;gBAC9C,GAAG,eAAe;gBAClB,cAAc,EAAE,WAAW,CAAC,OAAO;aACpC,CAAC,CAAC;YACH,SAAS,GAAG,wBAAwB,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;QACpE,CAAC;QAED,IAAI,KAAK,EAAE,gBAAgB,EAAE,CAAC;YAC5B,SAAS,GAAG,KAAK,CAAC,gBAAgB,CAAC,IAAI,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QACnE,CAAC;QAED,MAAM,cAAc,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAEtF,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;QAEvC,MAAM,QAAQ,GAA4B;YACxC,YAAY,EAAE,SAAS;YACvB,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,GAAG,CAAC,cAAc,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,GAAG,cAAc,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACvF,CAAC;QAEF,MAAM,QAAQ,GAAG,KAAK,EAAE,aAAa;YACnC,CAAC,CAAC,EAAE,GAAG,QAAQ,EAAE,GAAG,KAAK,CAAC,aAAa,CAAC,IAAI,EAAE,SAAS,EAAE,WAAW,CAAC,EAAE;YACvE,CAAC,CAAC,QAAQ,CAAC;QAEb,OAAO,CAAC,IAAI,CAAC;YACX,EAAE,EAAE,IAAI,CAAC,EAAE;YACX,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,WAAW,EAAE,oBAAoB;YACjC,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE;YACpC,OAAO,EAAE,SAAS;YAClB,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,SAAS;YAC5B,IAAI,EAAE,QAAQ;YACd,MAAM,EAAE,UAAU;YAClB,QAAQ;SACT,CAAC,CAAC;QAEH,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,UAAU,CAAC,OAAO,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;QAClE,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content-extractor.d.ts","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,wBAAwB,EAAE,MAAM,8BAA8B,CAAC;
|
|
1
|
+
{"version":3,"file":"content-extractor.d.ts","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,wBAAwB,EAAE,MAAM,8BAA8B,CAAC;AAa1F;;;GAGG;AACH,wBAAgB,sBAAsB,CAAC,WAAW,EAAE,MAAM,EAAE,iBAAiB,EAAE,MAAM,GAAG,MAAM,CAE7F;AAED;;;;;;GAMG;AACH,wBAAgB,uBAAuB,CACrC,WAAW,EAAE,WAAW,EACxB,gBAAgB,EAAE,wBAAwB,EAC1C,gBAAgB,EAAE,MAAM,GACvB,MAAM,EAAE,CA8BV;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,GAAG,MAAM,CAE7E"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { cleanMarkdownText, MarkdownConverter } from '@se-studio/markdown-renderer';
|
|
2
2
|
import { debugLog } from '../debug';
|
|
3
|
+
import { stripMarkdownToPlainText } from './readability';
|
|
3
4
|
/** Upstash Search hard limit for total content per document. */
|
|
4
5
|
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
5
6
|
/** Buffer for JSON key names and structural overhead in the content object. */
|
|
@@ -12,33 +13,6 @@ const CHUNK_OVERLAP = 300;
|
|
|
12
13
|
export function calculateMaxBodyLength(titleLength, descriptionLength) {
    // Space left for the body once title, description and JSON overhead are deducted
    // from the Upstash per-document content limit; clamped so it is never negative.
    const reserved = titleLength + descriptionLength + JSON_OVERHEAD;
    const remaining = UPSTASH_CONTENT_LIMIT - reserved;
    return Math.max(0, remaining);
}
|
|
15
|
-
/**
|
|
16
|
-
* Strips markdown syntax to produce plain text suitable for search indexing.
|
|
17
|
-
* Removes headings markers, bold/italic markers, links, images, and frontmatter.
|
|
18
|
-
*/
|
|
19
|
-
function stripMarkdown(md) {
|
|
20
|
-
let text = md;
|
|
21
|
-
// Remove YAML frontmatter
|
|
22
|
-
text = text.replace(/^---[\s\S]*?---\n*/m, '');
|
|
23
|
-
// Remove images: 
|
|
24
|
-
text = text.replace(/!\[.*?\]\(.*?\)/g, '');
|
|
25
|
-
// Convert links to text: [text](url) → text
|
|
26
|
-
text = text.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
|
|
27
|
-
// Remove heading markers: ### Heading → Heading
|
|
28
|
-
text = text.replace(/^#{1,6}\s+/gm, '');
|
|
29
|
-
// Remove bold/italic markers
|
|
30
|
-
text = text.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
|
|
31
|
-
text = text.replace(/_{1,3}([^_]+)_{1,3}/g, '$1');
|
|
32
|
-
// Remove horizontal rules
|
|
33
|
-
text = text.replace(/^[-*_]{3,}\s*$/gm, '');
|
|
34
|
-
// Remove list markers
|
|
35
|
-
text = text.replace(/^[\s]*[-*+]\s+/gm, '');
|
|
36
|
-
text = text.replace(/^[\s]*\d+\.\s+/gm, '');
|
|
37
|
-
// Collapse whitespace
|
|
38
|
-
text = text.replace(/\n{3,}/g, '\n\n');
|
|
39
|
-
text = text.trim();
|
|
40
|
-
return text;
|
|
41
|
-
}
|
|
42
16
|
/**
|
|
43
17
|
* Converts CMS content to plain text, then splits it into chunks that each
|
|
44
18
|
* fit within `maxCharsPerChunk`, with `CHUNK_OVERLAP` characters of overlap
|
|
@@ -50,7 +24,7 @@ export function extractSearchableChunks(contentData, converterContext, maxCharsP
|
|
|
50
24
|
const converter = new MarkdownConverter();
|
|
51
25
|
const markdown = converter.convert(contentData, converterContext);
|
|
52
26
|
debugLog('extractor', `Markdown length: ${markdown.length} chars`);
|
|
53
|
-
const plainText =
|
|
27
|
+
const plainText = stripMarkdownToPlainText(cleanMarkdownText(markdown));
|
|
54
28
|
debugLog('extractor', `Plain text length: ${plainText.length} chars (chunk limit ${maxCharsPerChunk})`);
|
|
55
29
|
if (plainText.length <= maxCharsPerChunk) {
|
|
56
30
|
return [plainText];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content-extractor.js","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,8BAA8B,CAAC;AACpF,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;
|
|
1
|
+
{"version":3,"file":"content-extractor.js","sourceRoot":"","sources":["../../src/indexing/content-extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,8BAA8B,CAAC;AACpF,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AACpC,OAAO,EAAE,wBAAwB,EAAE,MAAM,eAAe,CAAC;AAEzD,gEAAgE;AAChE,MAAM,qBAAqB,GAAG,IAAI,CAAC;AAEnC,+EAA+E;AAC/E,MAAM,aAAa,GAAG,GAAG,CAAC;AAE1B,MAAM,aAAa,GAAG,GAAG,CAAC;AAE1B;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CAAC,WAAmB,EAAE,iBAAyB;IACnF,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,qBAAqB,GAAG,WAAW,GAAG,iBAAiB,GAAG,aAAa,CAAC,CAAC;AAC9F,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,uBAAuB,CACrC,WAAwB,EACxB,gBAA0C,EAC1C,gBAAwB;IAExB,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,WAAW,EAAE,gBAAgB,CAAC,CAAC;IAClE,QAAQ,CAAC,WAAW,EAAE,oBAAoB,QAAQ,CAAC,MAAM,QAAQ,CAAC,CAAC;IACnE,MAAM,SAAS,GAAG,wBAAwB,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;IACxE,QAAQ,CACN,WAAW,EACX,sBAAsB,SAAS,CAAC,MAAM,uBAAuB,gBAAgB,GAAG,CACjF,CAAC;IAEF,IAAI,SAAS,CAAC,MAAM,IAAI,gBAAgB,EAAE,CAAC;QACzC,OAAO,CAAC,SAAS,CAAC,CAAC;IACrB,CAAC;IACD,IAAI,gBAAgB,IAAI,aAAa,IAAI,gBAAgB,IAAI,CAAC,EAAE,CAAC;QAC/D,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,gBAAgB,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,IAAI,GAAG,gBAAgB,GAAG,aAAa,CAAC;IAE9C,OAAO,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;QACjC,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,GAAG,gBAAgB,CAAC,CAAC;QACjE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,MAAM,IAAI,IAAI,CAAC;QACf,IAAI,MAAM,GAAG,aAAa,IAAI,SAAS,CAAC,MAAM;YAAE,MAAM;IACxD,CAAC;IAED,QAAQ,CAAC,WAAW,EAAE,cAAc,MAAM,CAAC,MAAM,oBAAoB,aAAa,GAAG,CAAC,CAAC;IACvF,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAAC,KAAa,EAAE,WAAmB;IACnE,OAAO,iBAAiB,CAAC,GAAG,KAAK,KAAK,WAAW,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;AAC9D,CAAC"}
|
package/dist/indexing/index.d.ts
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
|
+
export type { ArticleIndexableHooks, ArticleMarkdownConverterParams, IndexArticleLinksOptions, } from './article-indexable';
|
|
2
|
+
export { defaultShouldIndexArticleLink, indexArticleLinksToIndexableContent, } from './article-indexable';
|
|
1
3
|
export { calculateMaxBodyLength, extractSearchableChunks, extractShallowText, } from './content-extractor';
|
|
2
4
|
export { allChunkIds, baseEntryId, buildSearchDocuments, chunkDocumentId, shouldIndex, } from './document-builder';
|
|
5
|
+
export { calculateReadingTime, DEFAULT_WORDS_PER_MINUTE, stripMarkdownToPlainText, } from './readability';
|
|
3
6
|
export { rebuildSearchIndex } from './rebuild';
|
|
4
7
|
export { rebuildSearchIndexWordPress } from './rebuild-wordpress';
|
|
5
8
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,WAAW,EACX,WAAW,EACX,oBAAoB,EACpB,eAAe,EACf,WAAW,GACZ,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,EAAE,2BAA2B,EAAE,MAAM,qBAAqB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAAA,YAAY,EACV,qBAAqB,EACrB,8BAA8B,EAC9B,wBAAwB,GACzB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,6BAA6B,EAC7B,mCAAmC,GACpC,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,WAAW,EACX,WAAW,EACX,oBAAoB,EACpB,eAAe,EACf,WAAW,GACZ,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EACL,oBAAoB,EACpB,wBAAwB,EACxB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,EAAE,2BAA2B,EAAE,MAAM,qBAAqB,CAAC"}
|
package/dist/indexing/index.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
+
export { defaultShouldIndexArticleLink, indexArticleLinksToIndexableContent, } from './article-indexable';
|
|
1
2
|
export { calculateMaxBodyLength, extractSearchableChunks, extractShallowText, } from './content-extractor';
|
|
2
3
|
export { allChunkIds, baseEntryId, buildSearchDocuments, chunkDocumentId, shouldIndex, } from './document-builder';
|
|
4
|
+
export { calculateReadingTime, DEFAULT_WORDS_PER_MINUTE, stripMarkdownToPlainText, } from './readability';
|
|
3
5
|
export { rebuildSearchIndex } from './rebuild';
|
|
4
6
|
export { rebuildSearchIndexWordPress } from './rebuild-wordpress';
|
|
5
7
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/indexing/index.ts"],"names":[],"mappings":"AAKA,OAAO,EACL,6BAA6B,EAC7B,mCAAmC,GACpC,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,WAAW,EACX,WAAW,EACX,oBAAoB,EACpB,eAAe,EACf,WAAW,GACZ,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EACL,oBAAoB,EACpB,wBAAwB,EACxB,wBAAwB,GACzB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,EAAE,2BAA2B,EAAE,MAAM,qBAAqB,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/** Default reading speed for {@link calculateReadingTime}. */
|
|
2
|
+
export declare const DEFAULT_WORDS_PER_MINUTE = 200;
|
|
3
|
+
/**
|
|
4
|
+
* Estimates reading time in minutes from plain text (e.g. after markdown stripping).
|
|
5
|
+
* Empty or whitespace-only input returns `1` (same convention as marketing-site reading time).
|
|
6
|
+
*/
|
|
7
|
+
export declare function calculateReadingTime(text: string, wordsPerMinute?: number): number;
|
|
8
|
+
/**
|
|
9
|
+
* Strip markdown syntax leaving only readable words for search indexing or word counts.
|
|
10
|
+
* Removes frontmatter, heading markers, bold/italic, links (keeping text), images,
|
|
11
|
+
* fenced and inline code, blockquotes, list markers, and horizontal rules.
|
|
12
|
+
*/
|
|
13
|
+
export declare function stripMarkdownToPlainText(markdown: string): string;
|
|
14
|
+
//# sourceMappingURL=readability.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/indexing/readability.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAC9D,eAAO,MAAM,wBAAwB,MAAM,CAAC;AAE5C;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,MAAM,EACZ,cAAc,GAAE,MAAiC,GAChD,MAAM,CAGR;AAED;;;;GAIG;AACH,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CA8BjE"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/** Default reading speed for {@link calculateReadingTime}. */
export const DEFAULT_WORDS_PER_MINUTE = 200;
/**
 * Estimates reading time in minutes from plain text (e.g. after markdown stripping).
 *
 * The word count is divided by the reading speed, rounded to the nearest minute and
 * clamped to a minimum of `1`. Empty or whitespace-only input therefore returns `1`
 * (same convention as marketing-site reading time).
 *
 * @param text Plain text to measure; words are split on runs of whitespace.
 * @param wordsPerMinute Reading speed. A value that is not a finite positive number
 *   (`0`, negative, `NaN`, `Infinity`) falls back to {@link DEFAULT_WORDS_PER_MINUTE}
 *   instead of producing `Infinity` or `NaN` minutes.
 * @returns Estimated reading time in whole minutes, at least `1`.
 */
export function calculateReadingTime(text, wordsPerMinute = DEFAULT_WORDS_PER_MINUTE) {
    const speed = Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
        ? wordsPerMinute
        : DEFAULT_WORDS_PER_MINUTE;
    const words = text.trim().split(/\s+/).filter(Boolean);
    return Math.max(1, Math.round(words.length / speed));
}
|
|
11
|
+
/**
 * Strip markdown syntax leaving only readable words for search indexing or word counts.
 * Removes frontmatter, fenced code blocks (including their contents), images, links
 * (keeping the link text), heading markers, blockquote markers, list markers (at any
 * indentation), bold/italic markers, inline-code backticks, and horizontal rules.
 *
 * Replacement order matters:
 * - fenced blocks go before inline backticks so the fences are not corrupted;
 * - images go before links because image syntax embeds link syntax;
 * - blockquote and list markers go before emphasis so a leading `* ` bullet is never
 *   paired with a later `*` on the same line;
 * - underscore emphasis only matches when the markers are not adjacent to a letter or
 *   digit, so identifiers such as `snake_case_name` are left intact.
 *
 * @param markdown Markdown source text.
 * @returns Plain text, trimmed, with runs of three or more newlines collapsed to one blank line.
 */
export function stripMarkdownToPlainText(markdown) {
    return (markdown
        // Remove YAML frontmatter block (only at the very start of the text)
        .replace(/^---[\s\S]*?---\n?/, '')
        // Fenced blocks before inline backticks (avoids corrupting ``` fences)
        .replace(/```[\s\S]*?```/g, '')
        // Remove images before links (order matters)
        .replace(/!\[[^\]]*\]\([^)]+\)/g, '')
        // Remove links, keep display text
        .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')
        // Remove heading markers
        .replace(/^#{1,6}\s+/gm, '')
        // Remove blockquote markers (before list markers so "> - item" is fully stripped)
        .replace(/^>\s*/gm, '')
        // Remove unordered list markers, including indented (nested) ones
        .replace(/^[ \t]*[-*+]\s+/gm, '')
        // Remove ordered list markers, including indented (nested) ones
        .replace(/^[ \t]*\d+\.\s+/gm, '')
        // Remove bold/italic asterisks (triple, double, single)
        .replace(/\*{1,3}([^*\n]*)\*{1,3}/g, '$1')
        // Remove bold/italic underscores, but never intraword ones (snake_case stays intact)
        .replace(/(?<![\p{L}\p{N}])_{1,3}([^_\n]*)_{1,3}(?![\p{L}\p{N}])/gu, '$1')
        // Remove inline code backticks
        .replace(/`([^`]+)`/g, '$1')
        // Remove horizontal rules
        .replace(/^[-*_]{3,}\s*$/gm, '')
        // Collapse excess whitespace
        .replace(/\n{3,}/g, '\n\n')
        .trim());
}
|
|
45
|
+
//# sourceMappingURL=readability.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability.js","sourceRoot":"","sources":["../../src/indexing/readability.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAC9D,MAAM,CAAC,MAAM,wBAAwB,GAAG,GAAG,CAAC;AAE5C;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAClC,IAAY,EACZ,iBAAyB,wBAAwB;IAEjD,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACvD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,cAAc,CAAC,CAAC,CAAC;AAChE,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,wBAAwB,CAAC,QAAgB;IACvD,OAAO,CACL,QAAQ;QACN,gCAAgC;SAC/B,OAAO,CAAC,oBAAoB,EAAE,EAAE,CAAC;QAClC,uEAAuE;SACtE,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC;QAC/B,6CAA6C;SAC5C,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC;QACrC,kCAAkC;SACjC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;QACxC,yBAAyB;SACxB,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC;QAC5B,kEAAkE;SACjE,OAAO,CAAC,0BAA0B,EAAE,IAAI,CAAC;SACzC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;QACxC,+BAA+B;SAC9B,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC;QAC5B,4BAA4B;SAC3B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;QACvB,gCAAgC;SAC/B,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;QAC3B,8BAA8B;SAC7B,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;QAC3B,0BAA0B;SACzB,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAChC,6BAA6B;SAC5B,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CACV,CAAC;AACJ,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@se-studio/search",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.37",
|
|
4
4
|
"description": "AI-powered site search with Upstash Search for Next.js marketing sites",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -43,22 +43,22 @@
|
|
|
43
43
|
],
|
|
44
44
|
"dependencies": {
|
|
45
45
|
"@upstash/search": "^0.1.7",
|
|
46
|
-
"@se-studio/contentful-rest-api": "1.0.
|
|
47
|
-
"@se-studio/core-data-types": "1.0.
|
|
48
|
-
"@se-studio/markdown-renderer": "1.0.
|
|
49
|
-
"@se-studio/wordpress-rest-api": "1.0.
|
|
46
|
+
"@se-studio/contentful-rest-api": "1.0.131",
|
|
47
|
+
"@se-studio/core-data-types": "1.0.126",
|
|
48
|
+
"@se-studio/markdown-renderer": "1.0.90",
|
|
49
|
+
"@se-studio/wordpress-rest-api": "1.0.7"
|
|
50
50
|
},
|
|
51
51
|
"peerDependencies": {
|
|
52
52
|
"next": ">=15.5.0",
|
|
53
53
|
"react": "^19.0.0"
|
|
54
54
|
},
|
|
55
55
|
"devDependencies": {
|
|
56
|
-
"@biomejs/biome": "^2.4.
|
|
57
|
-
"@types/node": "^22.19.
|
|
56
|
+
"@biomejs/biome": "^2.4.12",
|
|
57
|
+
"@types/node": "^22.19.17",
|
|
58
58
|
"@types/react": "^19.2.14",
|
|
59
|
-
"next": "^15.5.
|
|
59
|
+
"next": "^15.5.15",
|
|
60
60
|
"typescript": "^6.0.2",
|
|
61
|
-
"vitest": "^4.1.
|
|
61
|
+
"vitest": "^4.1.4"
|
|
62
62
|
},
|
|
63
63
|
"scripts": {
|
|
64
64
|
"build": "tsc --project tsconfig.build.json",
|