mkdnsite 0.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,342 @@
1
+ import type { ContentPage, ContentSource } from '../content/types.ts'
2
+
3
/** A single hit returned by `SearchIndex.search`, best-scoring first. */
export interface SearchResult {
  /** Slug of the matched page (unique document key in the index). */
  slug: string
  /** Page title, taken from the page's front-matter meta at index time. */
  title: string
  /** Page description from front-matter meta, when one was provided. */
  description?: string
  /** Short plain-text snippet centered on the first query-term occurrence. */
  excerpt: string
  /** TF-IDF relevance score; higher means a better match. */
  score: number
}
10
+
11
/**
 * In-memory full-text index over site content pages.
 * The default implementation (`createSearchIndex`) ranks matches with
 * TF-IDF and supports round-tripping its state through JSON.
 */
export interface SearchIndex {
  /** Add or update a page in the index */
  index: (page: ContentPage) => void
  /** Remove a page from the index */
  remove: (slug: string) => void
  /** Search for pages matching a query */
  search: (query: string, limit?: number) => SearchResult[]
  /** Rebuild the entire index from a content source */
  rebuild: (source: ContentSource) => Promise<void>
  /** Serialize internal index state to a JSON string for storage */
  serialize: () => string
  /** Restore index state from a previously serialized JSON string */
  deserialize: (data: string) => void
}
25
+
26
/** Serialized format stored in cache / on disk */
export interface SerializedSearchIndex {
  /** Version tag for forward-compat (currently always 1). */
  v: number
  /** Documents: slug → serialized entry */
  docs: Record<string, SerializedDocEntry>
  /** Inverted index: token → list of slugs */
  posting: Record<string, string[]>
}
35
+
36
/**
 * JSON-safe mirror of `DocEntry`: `termFreqs` is a plain object here
 * instead of a `Map` so the entry survives `JSON.stringify`/`parse`.
 */
interface SerializedDocEntry {
  /** Page slug (also the key in `SerializedSearchIndex.docs`). */
  slug: string
  /** Page title captured at index time. */
  title: string
  /** Optional page description captured at index time. */
  description?: string
  /** Page tags, stringified from front-matter meta. */
  tags: string[]
  /** Tokens extracted from the title. */
  titleTokens: string[]
  /** Tokens extracted from the description (empty when none). */
  descTokens: string[]
  /** Tokens extracted from all tags. */
  tagTokens: string[]
  /** Tokens extracted from the markdown-stripped body. */
  bodyTokens: string[]
  /** Raw markdown body, kept for excerpt generation. */
  body: string
  /** token → occurrence count (includes field-boost repetitions). */
  termFreqs: Record<string, number>
  /** Total token count after boosting (TF denominator). */
  totalTokens: number
}
49
+
50
/** Runtime form of an indexed document (see `SerializedDocEntry` for the JSON shape). */
interface DocEntry {
  /** Page slug (also the key in the `docs` map). */
  slug: string
  /** Page title captured at index time. */
  title: string
  /** Optional page description captured at index time. */
  description?: string
  /** Page tags, stringified from front-matter meta. */
  tags: string[]
  /** Tokens extracted from the title. */
  titleTokens: string[]
  /** Tokens extracted from the description (empty when none). */
  descTokens: string[]
  /** Tokens extracted from all tags. */
  tagTokens: string[]
  /** Tokens extracted from the markdown-stripped body. */
  bodyTokens: string[]
  /** Raw markdown body, kept for excerpt generation. */
  body: string
  /** token → occurrence count; counts include boost repetitions (title 3x, desc/tags 2x). */
  termFreqs: Map<string, number>
  /** Total token count after boosting — the TF denominator in scoring. */
  totalTokens: number
}
63
+
64
+ export function createSearchIndex (): SearchIndex {
65
+ // inverted index: token → set of slugs
66
+ const posting = new Map<string, Set<string>>()
67
+ const docs = new Map<string, DocEntry>()
68
+
69
+ function addToPosting (token: string, slug: string): void {
70
+ let set = posting.get(token)
71
+ if (set == null) {
72
+ set = new Set()
73
+ posting.set(token, set)
74
+ }
75
+ set.add(slug)
76
+ }
77
+
78
+ function removeFromPosting (slug: string): void {
79
+ for (const set of posting.values()) {
80
+ set.delete(slug)
81
+ }
82
+ }
83
+
84
+ function index (page: ContentPage): void {
85
+ const slug = page.slug
86
+
87
+ // Remove any existing entry first
88
+ if (docs.has(slug)) removeFromPosting(slug)
89
+
90
+ const title = String(page.meta.title ?? '')
91
+ const description = page.meta.description != null ? String(page.meta.description) : undefined
92
+ const tags: string[] = Array.isArray(page.meta.tags)
93
+ ? (page.meta.tags as unknown[]).map(t => String(t))
94
+ : []
95
+
96
+ const titleTokens = tokenize(title)
97
+ const descTokens = description != null ? tokenize(description) : []
98
+ const tagTokens = tags.flatMap(t => tokenize(t))
99
+ const bodyTokens = tokenize(stripMarkdown(page.body))
100
+
101
+ // Boost: title 3x, description 2x, tags 2x
102
+ const allTokens = [
103
+ ...titleTokens, ...titleTokens, ...titleTokens,
104
+ ...descTokens, ...descTokens,
105
+ ...tagTokens, ...tagTokens,
106
+ ...bodyTokens
107
+ ]
108
+
109
+ const termFreqs = new Map<string, number>()
110
+ for (const t of allTokens) {
111
+ termFreqs.set(t, (termFreqs.get(t) ?? 0) + 1)
112
+ }
113
+
114
+ const entry: DocEntry = {
115
+ slug,
116
+ title,
117
+ description,
118
+ tags,
119
+ titleTokens,
120
+ descTokens,
121
+ tagTokens,
122
+ bodyTokens,
123
+ body: page.body,
124
+ termFreqs,
125
+ totalTokens: allTokens.length
126
+ }
127
+ docs.set(slug, entry)
128
+
129
+ // Update posting list
130
+ for (const token of termFreqs.keys()) {
131
+ addToPosting(token, slug)
132
+ }
133
+ }
134
+
135
+ function remove (slug: string): void {
136
+ if (!docs.has(slug)) return
137
+ removeFromPosting(slug)
138
+ docs.delete(slug)
139
+ }
140
+
141
+ function search (query: string, limit = 10): SearchResult[] {
142
+ const cappedLimit = Math.min(limit, 50)
143
+ const trimmed = query.trim()
144
+ if (trimmed === '') return []
145
+
146
+ const queryTokens = tokenize(trimmed)
147
+ if (queryTokens.length === 0) return []
148
+
149
+ const totalDocs = docs.size
150
+ if (totalDocs === 0) return []
151
+
152
+ // Gather candidate slugs (any posting list hit)
153
+ const candidates = new Set<string>()
154
+ for (const token of queryTokens) {
155
+ const set = posting.get(token)
156
+ if (set != null) {
157
+ for (const slug of set) candidates.add(slug)
158
+ }
159
+ }
160
+
161
+ const results: SearchResult[] = []
162
+
163
+ for (const slug of candidates) {
164
+ const entry = docs.get(slug)
165
+ if (entry == null) continue
166
+
167
+ let score = 0
168
+ for (const token of queryTokens) {
169
+ const tf = (entry.termFreqs.get(token) ?? 0) / (entry.totalTokens === 0 ? 1 : entry.totalTokens)
170
+ const docsWithTerm = posting.get(token)?.size ?? 0
171
+ if (docsWithTerm === 0) continue
172
+ // Smoothed IDF: log((N+1) / df) — avoids zero when N == df
173
+ const idf = Math.log((totalDocs + 1) / docsWithTerm)
174
+ score += tf * idf
175
+ }
176
+
177
+ if (score <= 0) continue
178
+
179
+ results.push({
180
+ slug,
181
+ title: entry.title,
182
+ description: entry.description,
183
+ excerpt: buildExcerpt(entry.body, queryTokens),
184
+ score
185
+ })
186
+ }
187
+
188
+ return results
189
+ .sort((a, b) => b.score - a.score)
190
+ .slice(0, cappedLimit)
191
+ }
192
+
193
+ async function rebuild (source: ContentSource): Promise<void> {
194
+ const pages = await source.listPages()
195
+ docs.clear()
196
+ for (const set of posting.values()) set.clear()
197
+ posting.clear()
198
+ for (const page of pages) {
199
+ if (page.meta.draft !== true) {
200
+ index(page)
201
+ }
202
+ }
203
+ }
204
+
205
+ function serialize (): string {
206
+ const docsObj: Record<string, SerializedDocEntry> = {}
207
+ for (const [slug, entry] of docs) {
208
+ const termFreqsObj: Record<string, number> = {}
209
+ for (const [token, freq] of entry.termFreqs) {
210
+ termFreqsObj[token] = freq
211
+ }
212
+ docsObj[slug] = {
213
+ slug: entry.slug,
214
+ title: entry.title,
215
+ description: entry.description,
216
+ tags: entry.tags,
217
+ titleTokens: entry.titleTokens,
218
+ descTokens: entry.descTokens,
219
+ tagTokens: entry.tagTokens,
220
+ bodyTokens: entry.bodyTokens,
221
+ body: entry.body,
222
+ termFreqs: termFreqsObj,
223
+ totalTokens: entry.totalTokens
224
+ }
225
+ }
226
+
227
+ const postingObj: Record<string, string[]> = {}
228
+ for (const [token, set] of posting) {
229
+ postingObj[token] = Array.from(set)
230
+ }
231
+
232
+ const serialized: SerializedSearchIndex = { v: 1, docs: docsObj, posting: postingObj }
233
+ return JSON.stringify(serialized)
234
+ }
235
+
236
+ function deserialize (data: string): void {
237
+ const parsed = JSON.parse(data) as SerializedSearchIndex
238
+ if (parsed.v !== 1) {
239
+ throw new Error('SearchIndex: unsupported serialization version ' + String(parsed.v))
240
+ }
241
+
242
+ // Clear current state
243
+ docs.clear()
244
+ posting.clear()
245
+
246
+ // Restore docs
247
+ for (const [slug, entry] of Object.entries(parsed.docs)) {
248
+ const termFreqs = new Map<string, number>()
249
+ for (const [token, freq] of Object.entries(entry.termFreqs)) {
250
+ termFreqs.set(token, freq)
251
+ }
252
+ docs.set(slug, {
253
+ slug: entry.slug,
254
+ title: entry.title,
255
+ description: entry.description,
256
+ tags: entry.tags,
257
+ titleTokens: entry.titleTokens,
258
+ descTokens: entry.descTokens,
259
+ tagTokens: entry.tagTokens,
260
+ bodyTokens: entry.bodyTokens,
261
+ body: entry.body,
262
+ termFreqs,
263
+ totalTokens: entry.totalTokens
264
+ })
265
+ }
266
+
267
+ // Restore posting lists
268
+ for (const [token, slugs] of Object.entries(parsed.posting)) {
269
+ posting.set(token, new Set(slugs))
270
+ }
271
+ }
272
+
273
+ return { index, remove, search, rebuild, serialize, deserialize }
274
+ }
275
+
276
+ // ─── Helpers ─────────────────────────────────────────────────────────────────
277
+
278
+ function tokenize (text: string): string[] {
279
+ return text
280
+ .toLowerCase()
281
+ .replace(/[^a-z0-9\s'-]/g, ' ')
282
+ .split(/\s+/)
283
+ .map(t => t.replace(/^['-]+|['-]+$/g, ''))
284
+ .filter(t => t.length >= 2 && !STOP_WORDS.has(t))
285
+ }
286
+
287
+ function stripMarkdown (md: string): string {
288
+ return md
289
+ // Remove fenced code blocks
290
+ .replace(/```[\s\S]*?```/g, ' ')
291
+ // Remove inline code
292
+ .replace(/`[^`]+`/g, ' ')
293
+ // Remove headings (keep text)
294
+ .replace(/^#{1,6}\s+/gm, '')
295
+ // Remove images
296
+ .replace(/!\[[^\]]*\]\([^)]*\)/g, ' ')
297
+ // Remove links (keep text)
298
+ .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')
299
+ // Remove bold/italic markers
300
+ .replace(/[*_]{1,3}([^*_]+)[*_]{1,3}/g, '$1')
301
+ // Remove blockquotes
302
+ .replace(/^>\s*/gm, '')
303
+ // Remove horizontal rules
304
+ .replace(/^[-*_]{3,}\s*$/gm, '')
305
+ // Remove HTML tags
306
+ .replace(/<[^>]+>/g, ' ')
307
+ // Collapse whitespace
308
+ .replace(/\s+/g, ' ')
309
+ .trim()
310
+ }
311
+
312
+ function buildExcerpt (body: string, queryTokens: string[]): string {
313
+ const plain = stripMarkdown(body)
314
+ const lower = plain.toLowerCase()
315
+
316
+ let bestPos = -1
317
+ for (const token of queryTokens) {
318
+ const pos = lower.indexOf(token)
319
+ if (pos !== -1) {
320
+ bestPos = pos
321
+ break
322
+ }
323
+ }
324
+
325
+ if (bestPos === -1) {
326
+ return plain.slice(0, 150).trim() + (plain.length > 150 ? '…' : '')
327
+ }
328
+
329
+ const start = Math.max(0, bestPos - 50)
330
+ const end = Math.min(plain.length, start + 150)
331
+ const excerpt = plain.slice(start, end).trim()
332
+ return (start > 0 ? '…' : '') + excerpt + (end < plain.length ? '…' : '')
333
+ }
334
+
335
// Common English words excluded from tokens by `tokenize` — they appear in
// nearly every document, so indexing them would bloat the posting lists
// without improving ranking.
const STOP_WORDS = new Set([
  'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
  'of', 'with', 'by', 'from', 'is', 'it', 'its', 'this', 'that', 'be',
  'as', 'was', 'are', 'were', 'been', 'has', 'have', 'had', 'do', 'does',
  'did', 'not', 'no', 'so', 'if', 'up', 'can', 'will', 'you', 'we', 'he',
  'she', 'they', 'their', 'all', 'any', 'also', 'more', 'into', 'than',
  'then', 'when', 'how', 'what', 'which', 'who', 'use', 'used', 'using'
])
@@ -0,0 +1,92 @@
1
+ import type { MkdnSiteConfig, CspConfig } from '../config/schema.ts'
2
+
3
+ /**
4
+ * Sanitize a CSP source value to prevent directive injection.
5
+ * Strips semicolons which act as directive separators.
6
+ */
7
+ function sanitizeCspValue (val: string): string {
8
+ return val.replace(/;/g, '').trim()
9
+ }
10
+
11
+ /**
12
+ * Build a Content-Security-Policy header value string from the current config.
13
+ * Only includes external sources for features that are actually enabled.
14
+ */
15
+ export function buildCsp (config: MkdnSiteConfig): string {
16
+ const { client, analytics, csp, theme } = config
17
+ const gaEnabled = (analytics?.googleAnalytics?.measurementId ?? '') !== ''
18
+ const useCdn = client.mermaid || client.charts
19
+ const extra: CspConfig = csp ?? { enabled: true }
20
+
21
+ // script-src
22
+ const scriptSrc = ["'self'", "'unsafe-inline'"]
23
+ if (useCdn) scriptSrc.push('https://cdn.jsdelivr.net')
24
+ if (gaEnabled) {
25
+ scriptSrc.push('https://www.googletagmanager.com')
26
+ scriptSrc.push('https://www.google-analytics.com')
27
+ }
28
+ if (extra.extraScriptSrc != null) {
29
+ scriptSrc.push(...extra.extraScriptSrc.map(sanitizeCspValue))
30
+ }
31
+
32
+ // style-src
33
+ const styleSrc = ["'self'", "'unsafe-inline'"]
34
+ if (client.math) styleSrc.push('https://cdn.jsdelivr.net')
35
+ if (theme.customCssUrl != null) {
36
+ try {
37
+ const u = new URL(theme.customCssUrl)
38
+ if (u.protocol === 'https:' || u.protocol === 'http:') {
39
+ styleSrc.push(u.origin)
40
+ }
41
+ } catch {
42
+ // relative URL — 'self' covers it
43
+ }
44
+ }
45
+ if (extra.extraStyleSrc != null) {
46
+ styleSrc.push(...extra.extraStyleSrc.map(sanitizeCspValue))
47
+ }
48
+
49
+ // img-src
50
+ const imgSrc = ["'self'", 'data:', 'https:']
51
+ if (client.mermaid) imgSrc.push('blob:')
52
+ if (extra.extraImgSrc != null) {
53
+ imgSrc.push(...extra.extraImgSrc.map(sanitizeCspValue))
54
+ }
55
+
56
+ // font-src
57
+ const fontSrc = ["'self'", 'https://fonts.gstatic.com']
58
+ if (client.math) fontSrc.push('https://cdn.jsdelivr.net')
59
+ if (extra.extraFontSrc != null) {
60
+ fontSrc.push(...extra.extraFontSrc.map(sanitizeCspValue))
61
+ }
62
+
63
+ // connect-src
64
+ const connectSrc = ["'self'"]
65
+ if (gaEnabled) {
66
+ connectSrc.push('https://www.google-analytics.com')
67
+ connectSrc.push('https://analytics.google.com')
68
+ connectSrc.push('https://region1.google-analytics.com')
69
+ }
70
+ if (extra.extraConnectSrc != null) {
71
+ connectSrc.push(...extra.extraConnectSrc.map(sanitizeCspValue))
72
+ }
73
+
74
+ const directives: string[] = [
75
+ "default-src 'self'",
76
+ 'script-src ' + scriptSrc.join(' '),
77
+ 'style-src ' + styleSrc.join(' '),
78
+ 'img-src ' + imgSrc.join(' '),
79
+ 'font-src ' + fontSrc.join(' '),
80
+ 'connect-src ' + connectSrc.join(' '),
81
+ "frame-src 'none'",
82
+ "object-src 'none'",
83
+ "base-uri 'self'",
84
+ "form-action 'self'"
85
+ ]
86
+
87
+ if (extra.reportUri != null && extra.reportUri !== '') {
88
+ directives.push('report-uri ' + sanitizeCspValue(extra.reportUri))
89
+ }
90
+
91
+ return directives.join('; ')
92
+ }