@glw907/cairn-cms 0.58.0 → 0.60.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +84 -0
- package/dist/components/CairnAdmin.svelte +3 -0
- package/dist/components/CairnMediaLibrary.svelte +1101 -27
- package/dist/components/CairnMediaLibrary.svelte.d.ts +10 -2
- package/dist/components/CairnTidySettings.svelte +553 -0
- package/dist/components/CairnTidySettings.svelte.d.ts +32 -0
- package/dist/components/EditPage.svelte +371 -2
- package/dist/components/MarkdownEditor.svelte +168 -1
- package/dist/components/MarkdownEditor.svelte.d.ts +44 -0
- package/dist/components/TidyReview.svelte +463 -0
- package/dist/components/TidyReview.svelte.d.ts +47 -0
- package/dist/components/admin-icons.d.ts +1 -0
- package/dist/components/admin-icons.js +1 -0
- package/dist/components/cairn-admin.css +913 -2
- package/dist/components/editor-tidy.d.ts +31 -0
- package/dist/components/editor-tidy.js +199 -0
- package/dist/components/index.d.ts +1 -0
- package/dist/components/index.js +1 -0
- package/dist/components/markdown-directives.d.ts +16 -0
- package/dist/components/markdown-directives.js +34 -0
- package/dist/components/objective-errors.d.ts +30 -0
- package/dist/components/objective-errors.js +113 -0
- package/dist/components/spellcheck-assets/dictionary-en-us.txt +104743 -0
- package/dist/components/spellcheck-assets/spellchecker-wasm-LICENSE.txt +21 -0
- package/dist/components/spellcheck-assets/spellchecker-wasm.wasm +0 -0
- package/dist/components/spellcheck-worker.d.ts +80 -0
- package/dist/components/spellcheck-worker.js +161 -0
- package/dist/components/spellcheck.d.ts +146 -0
- package/dist/components/spellcheck.js +541 -0
- package/dist/components/tidy-categorize.d.ts +67 -0
- package/dist/components/tidy-categorize.js +392 -0
- package/dist/components/tidy-diff.d.ts +60 -0
- package/dist/components/tidy-diff.js +147 -0
- package/dist/components/tidy-validate.d.ts +37 -0
- package/dist/components/tidy-validate.js +174 -0
- package/dist/content/compose.d.ts +1 -1
- package/dist/content/compose.js +11 -0
- package/dist/content/site-dictionary.d.ts +31 -0
- package/dist/content/site-dictionary.js +82 -0
- package/dist/content/types.d.ts +25 -0
- package/dist/doctor/checks-local.d.ts +1 -0
- package/dist/doctor/checks-local.js +55 -6
- package/dist/doctor/index.js +2 -1
- package/dist/log/events.d.ts +1 -1
- package/dist/media/bulk-delete-plan.d.ts +24 -0
- package/dist/media/bulk-delete-plan.js +25 -0
- package/dist/media/orphan-scan.d.ts +37 -0
- package/dist/media/orphan-scan.js +42 -0
- package/dist/media/reconcile.d.ts +3 -0
- package/dist/media/reconcile.js +3 -2
- package/dist/nav/site-config.d.ts +98 -0
- package/dist/nav/site-config.js +132 -0
- package/dist/sveltekit/admin-dispatch.d.ts +2 -0
- package/dist/sveltekit/admin-dispatch.js +6 -2
- package/dist/sveltekit/cairn-admin.d.ts +16 -1
- package/dist/sveltekit/cairn-admin.js +28 -3
- package/dist/sveltekit/content-routes.d.ts +171 -4
- package/dist/sveltekit/content-routes.js +597 -3
- package/dist/sveltekit/index.d.ts +1 -1
- package/dist/sveltekit/tidy-prompt.d.ts +11 -0
- package/dist/sveltekit/tidy-prompt.js +118 -0
- package/package.json +10 -1
- package/src/lib/components/CairnAdmin.svelte +3 -0
- package/src/lib/components/CairnMediaLibrary.svelte +1101 -27
- package/src/lib/components/CairnTidySettings.svelte +553 -0
- package/src/lib/components/EditPage.svelte +371 -2
- package/src/lib/components/MarkdownEditor.svelte +168 -1
- package/src/lib/components/TidyReview.svelte +463 -0
- package/src/lib/components/admin-icons.ts +1 -0
- package/src/lib/components/cairn-admin.css +25 -0
- package/src/lib/components/editor-tidy.ts +241 -0
- package/src/lib/components/index.ts +1 -0
- package/src/lib/components/markdown-directives.ts +35 -0
- package/src/lib/components/objective-errors.ts +155 -0
- package/src/lib/components/spellcheck-assets/dictionary-en-us.txt +104743 -0
- package/src/lib/components/spellcheck-assets/spellchecker-wasm-LICENSE.txt +21 -0
- package/src/lib/components/spellcheck-assets/spellchecker-wasm.wasm +0 -0
- package/src/lib/components/spellcheck-worker.ts +279 -0
- package/src/lib/components/spellcheck.ts +679 -0
- package/src/lib/components/tidy-categorize.ts +460 -0
- package/src/lib/components/tidy-diff.ts +196 -0
- package/src/lib/components/tidy-validate.ts +202 -0
- package/src/lib/content/compose.ts +11 -1
- package/src/lib/content/site-dictionary.ts +84 -0
- package/src/lib/content/types.ts +25 -0
- package/src/lib/doctor/checks-local.ts +59 -5
- package/src/lib/doctor/index.ts +2 -0
- package/src/lib/log/events.ts +9 -1
- package/src/lib/media/bulk-delete-plan.ts +54 -0
- package/src/lib/media/orphan-scan.ts +74 -0
- package/src/lib/media/reconcile.ts +3 -2
- package/src/lib/nav/site-config.ts +197 -0
- package/src/lib/sveltekit/admin-dispatch.ts +7 -3
- package/src/lib/sveltekit/cairn-admin.ts +38 -4
- package/src/lib/sveltekit/content-routes.ts +795 -7
- package/src/lib/sveltekit/index.ts +1 -0
- package/src/lib/sveltekit/tidy-prompt.ts +153 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
// The tidy output validation: the safety backstop that proves a tidy result is a proofread and not
|
|
2
|
+
// a restructure (spec 2.6) or a successful prompt injection (spec 2.3.3). A pure module taking the
|
|
3
|
+
// captured original and the model's corrected string and returning either the validated change set
|
|
4
|
+
// (the Task 12 diff) or a typed rejection reason. A rejected result is discarded by the caller with
|
|
5
|
+
// an honest message and the document is left untouched; nothing here mutates the buffer.
|
|
6
|
+
//
|
|
7
|
+
// All checks but the last are EXACT and are the real structural backstop: the directive structure,
|
|
8
|
+
// the heading count and levels, the fenced-code-block count, the byte-for-byte frontmatter, the
|
|
9
|
+
// media-hash multiset, and every code span and fenced block. The last, the divergence bound, is
|
|
10
|
+
// the only fuzzy one, and it is a rewrite/injection backstop only, never a voice safeguard. The
|
|
11
|
+
// config-driven prompt is what protects voice.
|
|
12
|
+
import { unified } from 'unified';
|
|
13
|
+
import remarkParse from 'remark-parse';
|
|
14
|
+
import remarkGfm from 'remark-gfm';
|
|
15
|
+
import { visit } from 'unist-util-visit';
|
|
16
|
+
import { fenceScan, frontmatterSpan } from './markdown-directives.js';
|
|
17
|
+
import { parseMediaToken } from '../media/reference.js';
|
|
18
|
+
import { diffTokens, diffChanges } from './tidy-diff.js';
|
|
19
|
+
/** The honest author-facing message a rejection maps to. The same message for every reason, by
|
|
20
|
+
* design: an author does not need the validator's internal taxonomy, only that the result was
|
|
21
|
+
* discarded and their text is safe. */
|
|
22
|
+
export const TIDY_REJECTION_MESSAGE = 'Tidy returned a result that changed more than the wording, so it was discarded. Your text is unchanged.';
|
|
23
|
+
// The divergence bound. The floor allows a fixed number of changed tokens regardless of fraction so
|
|
24
|
+
// a legitimate heavy proofread of a SHORT input is not penalized: a short paragraph with a typo in
|
|
25
|
+
// nearly every word is a real proofread, not a rewrite. The fraction catches a wholesale rewrite of
|
|
26
|
+
// a LONG input, where a large absolute count is past any honest copy-edit. A result is rejected only
|
|
27
|
+
// when it exceeds BOTH the floor and the fraction, so a short input rides the floor and a long input
|
|
28
|
+
// rides the fraction. The values are deliberate: 60 tokens of change covers a dense proofread of a
|
|
29
|
+
// few short paragraphs, and 0.5 of the total tokens marks the point where more than half the text
|
|
30
|
+
// changed, which no proofread does but a rewrite or a successful injection always does.
|
|
31
|
+
const DIVERGENCE_TOKEN_FLOOR = 60;
|
|
32
|
+
const DIVERGENCE_FRACTION = 0.5;
|
|
33
|
+
// Every `media:` token anywhere in the text, hash and slug forms alike. The validator scans the raw
|
|
34
|
+
// text rather than going through extractMediaRefs for two reasons. First, a true MULTISET is the
|
|
35
|
+
// invariant a backstop wants: extractMediaRefs dedups by hash, so a doubled token collapsing to one
|
|
36
|
+
// would read as equal, and the validator must catch a dropped duplicate. Second, the raw scan covers
|
|
37
|
+
// the whole text including frontmatter without threading the concept's FrontmatterField[] to the call
|
|
38
|
+
// site, which the validator otherwise has no reason to know. A token mangled inside a code fence is
|
|
39
|
+
// caught here too, redundantly with the code check, which is the right posture for a backstop.
|
|
40
|
+
const MEDIA_TOKEN = /media:[A-Za-z0-9.-]+/g;
|
|
41
|
+
/**
 * Collect the media hashes referenced anywhere in `text`, as a sorted list with duplicates kept.
 *
 * Every MEDIA_TOKEN match is handed to parseMediaToken; a token that does not parse (a falsy
 * result) contributes nothing, so a tidy that corrupted a token changes the list and the
 * comparison in validateTidy fails. Duplicates are kept on purpose: the list is a multiset, so
 * a dropped repeat is visible. The list is sorted so two lists compare by value, independent of
 * the order the tokens appear in.
 */
function mediaHashes(text) {
    const found = [...text.matchAll(MEDIA_TOKEN)]
        .map((match) => parseMediaToken(match[0]))
        .filter((ref) => ref)
        .map((ref) => ref.hash);
    found.sort();
    return found;
}
|
|
54
|
+
/**
 * Build the directive structure signature of `text`.
 *
 * The text is split into lines and run through fenceScan, which reports a role and a depth per
 * line. Each line with a non-null role becomes a `role@depth` entry, in document order, and the
 * entries are comma-joined. Two texts have the same directive structure exactly when their
 * signatures are equal, so an added, removed, or relevelled container shows up as a mismatch.
 * Lines whose role is null (the scan disowns fence-shaped lines inside code blocks, per the
 * original note) do not enter the signature.
 */
function directiveSignature(text) {
    const { depths, roles } = fenceScan(text.split('\n'));
    return Array.from(roles, (role, line) => (role !== null ? `${role}@${depths[line]}` : null))
        .filter((entry) => entry !== null)
        .join(',');
}
|
|
68
|
+
/**
 * Build the heading signature of `text`: the depth of every mdast `heading` node, in document
 * order, comma-joined.
 *
 * The text is parsed with remark (plus GFM), so only what the parser itself treats as a heading
 * is counted; a `#` the parser reads as code or as an escaped character yields no heading node.
 * Two texts share a heading structure exactly when these signatures are equal, so an added,
 * removed, or relevelled heading fails the comparison.
 */
function headingSignature(text) {
    const processor = unified().use(remarkParse).use(remarkGfm);
    const root = processor.parse(text);
    const depths = [];
    // Block-bodied visitor on purpose: its return value must stay undefined, since the visit
    // utility interprets returned values as traversal instructions.
    visit(root, 'heading', ({ depth }) => {
        if (typeof depth === 'number') {
            depths.push(depth);
        }
    });
    return depths.join(',');
}
|
|
81
|
+
/**
 * Collect the contents of every code block and code span in `text`, as a sorted list with
 * duplicates kept.
 *
 * The text is parsed with remark (plus GFM) and every node is visited. A `code` node (a block)
 * or an `inlineCode` node (a span) with a string value contributes one `type:value` entry, so a
 * block and a span with the same text stay distinct. The list is sorted so the comparison in
 * validateTidy ignores ordering and looks only at the contents and their counts.
 */
function codeContents(text) {
    const root = unified().use(remarkParse).use(remarkGfm).parse(text);
    const codeTypes = ['code', 'inlineCode'];
    const entries = [];
    // Block-bodied visitor on purpose: its return value must stay undefined, since the visit
    // utility interprets returned values as traversal instructions.
    visit(root, (node) => {
        const isCode = codeTypes.includes(node.type);
        if (isCode && typeof node.value === 'string') {
            entries.push(`${node.type}:${node.value}`);
        }
    });
    entries.sort();
    return entries;
}
|
|
96
|
+
/**
 * Compare two string multisets given as arrays.
 *
 * Returns true when both arrays have the same length and hold strictly equal values at every
 * index. The comparison is positional, so the callers pass arrays that are already sorted; two
 * arrays with the same members in a different order compare as unequal.
 */
function multisetEqual(a, b) {
    if (a.length !== b.length) {
        return false;
    }
    for (const [index, value] of a.entries()) {
        if (value !== b[index]) {
            return false;
        }
    }
    return true;
}
|
|
106
|
+
/**
 * Measure how much of the original a tidy result changed.
 *
 * Runs diffTokens over the two strings and tallies token counts per run: an inserted or deleted
 * run adds its tokens to `changed`; an equal or deleted run adds its tokens to `total`, which is
 * therefore the token count of the original. A run's tokens are counted by matching word runs
 * (letters, digits, underscore, with inner apostrophes) or non-word runs in its text.
 *
 * Returns `{ changed, total }`. The measure is deliberately coarse: it exists only to catch a
 * wholesale rewrite that slipped past the exact checks.
 */
function divergence(original, corrected) {
    const countTokens = (s) => {
        const matches = s.match(/[A-Za-z0-9_]+(?:['’][A-Za-z0-9_]+)*|[^A-Za-z0-9_]+/g);
        return (matches ?? []).length;
    };
    const tally = { changed: 0, total: 0 };
    for (const run of diffTokens(original, corrected)) {
        const size = countTokens(run.text);
        const wasDeleted = run.kind === 'deleted';
        // A deleted run counts on both sides: it is a change, and it was part of the original.
        if (wasDeleted || run.kind === 'inserted') {
            tally.changed += size;
        }
        if (wasDeleted || run.kind === 'equal') {
            tally.total += size;
        }
    }
    return tally;
}
|
|
127
|
+
/**
 * Validate a tidy result against the captured original text.
 *
 * The exact checks run first, in this order, and the first mismatch rejects the result:
 *   1. directive structure (directiveSignature)  -> reason 'structure'
 *   2. heading levels in order (headingSignature) -> reason 'structure'
 *   3. frontmatter, byte-for-byte (frontmatterSpan) -> reason 'frontmatter'
 *   4. media-hash multiset (mediaHashes)          -> reason 'media'
 *   5. code block and code span contents, which also covers their count (codeContents)
 *                                                 -> reason 'code'
 * The divergence bound runs last and rejects with reason 'divergence' only when the changed
 * token count exceeds BOTH the absolute floor and the fraction of the original's total.
 *
 * @param original  The text as captured before tidy ran.
 * @param corrected The text the model returned.
 * @returns `{ ok: true, changes }` with the diffChanges change set on success, or
 *          `{ ok: false, reason, message }` on rejection. The message is always
 *          TIDY_REJECTION_MESSAGE, whatever the reason. The function only reads its two
 *          arguments and returns a fresh result object.
 */
export function validateTidy(original, corrected) {
    // Every rejection has the same shape and the same author-facing message.
    const reject = (reason) => ({ ok: false, reason, message: TIDY_REJECTION_MESSAGE });
    // The frontmatter text of a document, or the empty string when there is no span. A document
    // with no frontmatter on both sides therefore compares equal; a span on one side only differs.
    const frontmatterText = (text) => {
        const span = frontmatterSpan(text);
        return span ? text.slice(span.from, span.to) : '';
    };

    if (directiveSignature(original) !== directiveSignature(corrected)) {
        return reject('structure');
    }
    if (headingSignature(original) !== headingSignature(corrected)) {
        return reject('structure');
    }
    if (frontmatterText(original) !== frontmatterText(corrected)) {
        return reject('frontmatter');
    }
    if (!multisetEqual(mediaHashes(original), mediaHashes(corrected))) {
        return reject('media');
    }
    if (!multisetEqual(codeContents(original), codeContents(corrected))) {
        return reject('code');
    }

    // A short input rides the floor; a long input rides the fraction.
    const { changed, total } = divergence(original, corrected);
    const pastFloor = changed > DIVERGENCE_TOKEN_FLOOR;
    const pastFraction = changed > total * DIVERGENCE_FRACTION;
    if (pastFloor && pastFraction) {
        return reject('divergence');
    }
    return { ok: true, changes: diffChanges(original, corrected) };
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { CairnAdapter, CairnExtension, CairnRuntime } from './types.js';
|
|
2
|
-
import type
|
|
2
|
+
import { type SiteConfig } from '../nav/site-config.js';
|
|
3
3
|
/** The input to {@link composeRuntime}. `siteConfig` is required so the per-concept URL policy is
|
|
4
4
|
* always derived from one source and can never be silently dropped. `extensions` fold in after the
|
|
5
5
|
* adapter's concepts. */
|
package/dist/content/compose.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { resolveConcepts } from './concepts.js';
|
|
2
2
|
import { normalizeAssets } from '../media/config.js';
|
|
3
|
+
import { dictionaryFileForDialect } from '../nav/site-config.js';
|
|
3
4
|
/**
|
|
4
5
|
* Fold an adapter and any extensions into the composed runtime (seam 2). The per-concept URL policy
|
|
5
6
|
* is derived from the site config, the same source the delivery path uses, so the runtime and
|
|
@@ -36,6 +37,16 @@ export function composeRuntime({ adapter, siteConfig, extensions = [] }) {
|
|
|
36
37
|
assets: adapter.assets,
|
|
37
38
|
resolvedAssets: normalizeAssets(adapter.assets),
|
|
38
39
|
mediaManifestPath: adapter.mediaManifestPath ?? 'src/content/.cairn/media.json',
|
|
40
|
+
// The personal dictionary sits beside the manifests under the same `.cairn/` content root, so the
|
|
41
|
+
// spec's `content/.cairn/dictionary.txt` resolves the same configurable way the manifest paths do.
|
|
42
|
+
dictionaryPath: adapter.dictionaryPath ?? 'src/content/.cairn/dictionary.txt',
|
|
43
|
+
// The spellcheck dictionary is resolved once here from the site config's dialect (default US),
|
|
44
|
+
// so the runtime and the editor never re-derive it. The site config is the one home for the
|
|
45
|
+
// dialect; the editor resolves this filename to a real asset URL on the main thread.
|
|
46
|
+
spellcheckDictionary: dictionaryFileForDialect(siteConfig.spellcheck?.dialect),
|
|
47
|
+
// The tidy block passes through from the site config; the tidy action reads enabled/model at call
|
|
48
|
+
// time and builds its prompt from conventions. Absent means tidy is off.
|
|
49
|
+
tidy: siteConfig.tidy,
|
|
39
50
|
adminPanels,
|
|
40
51
|
fieldTypes,
|
|
41
52
|
};
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/** True when a word is a single valid dictionary line (no whitespace, no control characters, non-empty
|
|
2
|
+
* and within the length bound). A leading "#" is rejected: parseDictionary re-reads such a line as a
|
|
3
|
+
* comment, so committing it would silently drop the word on the next read. The action uses this to
|
|
4
|
+
* reject untrusted input before the merge, so a newline or a control byte can never inject an extra
|
|
5
|
+
* line into the committed file. */
|
|
6
|
+
export declare function isValidDictionaryWord(word: string, maxLength?: number): boolean;
|
|
7
|
+
/**
|
|
8
|
+
* Parse the committed dictionary file text into its word list. Comment lines (a `#` after optional
|
|
9
|
+
* leading whitespace) and blank lines are dropped; every other line is trimmed and kept. A null or
|
|
10
|
+
* empty file yields an empty list. The result preserves the file's order and is not deduplicated or
|
|
11
|
+
* sorted here, so a caller can see exactly what the file held; `mergeDictionaryWords` is the path that
|
|
12
|
+
* normalizes to the sorted, deduplicated set.
|
|
13
|
+
*/
|
|
14
|
+
export declare function parseDictionary(text: string | null): string[];
|
|
15
|
+
/**
|
|
16
|
+
* Merge `additions` into the `existing` word list, returning the canonical sorted, deduplicated set.
|
|
17
|
+
* The merge is case-insensitive (a duplicate add of an existing word, in any case, collapses) and
|
|
18
|
+
* order-independent: the inputs are unioned by lowercased key and sorted, so re-merging the same
|
|
19
|
+
* additions at a moved head produces the same set. The first-seen casing of each word wins, so an
|
|
20
|
+
* existing "Cairn" is kept over a later "cairn". Invalid additions (whitespace, control characters,
|
|
21
|
+
* empty) are skipped here as a backstop; the action validates before this is reached.
|
|
22
|
+
*/
|
|
23
|
+
export declare function mergeDictionaryWords(existing: readonly string[], additions: readonly string[]): string[];
|
|
24
|
+
/**
|
|
25
|
+
* Serialize a word list to the canonical committed file text: the header comment, then one word per
|
|
26
|
+
* line sorted case-insensitively, with a trailing newline. The input is run through the same dedup
|
|
27
|
+
* and sort as the merge, so serializing an unsorted or duplicate-bearing list still yields the
|
|
28
|
+
* canonical form. An empty word list serializes to just the header (so the file stays a valid,
|
|
29
|
+
* recognizable dictionary rather than vanishing).
|
|
30
|
+
*/
|
|
31
|
+
export declare function serializeDictionary(words: readonly string[]): string;
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
// cairn-cms: the git-committed per-site personal dictionary (spec 1.6). One word per line,
|
|
2
|
+
// sorted, with comment lines (starting with #) and blank lines tolerated on read. This module is
|
|
3
|
+
// pure: it parses the committed file text, inserts words in sorted order, and serializes the
|
|
4
|
+
// canonical form. The insert is order-independent, so the action's commit-and-retry can re-merge
|
|
5
|
+
// the pending additions at a new head and reach the same sorted set regardless of insertion order.
|
|
6
|
+
//
|
|
7
|
+
// The canonical serialization keeps a single leading header comment and one sorted word per line.
|
|
8
|
+
// An inbound file's other comment lines are dropped on serialize (the header is regenerated), so the
|
|
9
|
+
// committed file stays a clean, diffable, sorted word list; a maintainer who wants a richer comment
|
|
10
|
+
// edits it in git, and the next add through here normalizes it back to the header.
|
|
11
|
+
/** The header comment the canonical serialization writes above the sorted words. */
|
|
12
|
+
const HEADER = '# cairn personal dictionary: one word per line, sorted, kept in git.';
|
|
13
|
+
// The shape of a dictionary word: one or more characters, none of them whitespace and none of
// them a control character. Because a line break is whitespace, a matching word can never add an
// extra line to the committed file. The test is a deny-list, not an allow-list of letters, so
// hyphens, apostrophes ("well-known", "O'Brien"), and non-ASCII names all validate.
const WORD_RE = /^[^\s\p{Cc}]+$/u;
/**
 * Decide whether `word` may be stored as one line of the dictionary file.
 *
 * A word is valid when it is non-empty, at most `maxLength` long (64 by default), matches
 * WORD_RE, and does not start with "#". The "#" rule exists because parseDictionary reads such a
 * line back as a comment, so storing it would silently lose the word on the next read.
 *
 * @param word      The candidate word.
 * @param maxLength The inclusive upper bound on `word.length`.
 * @returns true when the word is safe to commit as a single dictionary line.
 */
export function isValidDictionaryWord(word, maxLength = 64) {
    const readsAsComment = word.startsWith('#');
    const withinBound = word.length > 0 && word.length <= maxLength;
    return !readsAsComment && withinBound && WORD_RE.test(word);
}
|
|
29
|
+
/**
 * Read the word list out of the committed dictionary file text.
 *
 * The text is split on newlines and each line is trimmed. Blank lines and comment lines (a line
 * whose first non-whitespace character is `#`) are discarded; every other trimmed line is kept.
 * A null or empty text yields an empty list. The file's own order is preserved and nothing is
 * deduplicated or sorted here, so the caller sees exactly what the file held.
 *
 * @param text The file contents, or null when the file does not exist.
 * @returns The words in file order.
 */
export function parseDictionary(text) {
    if (!text) {
        return [];
    }
    return text
        .split('\n')
        .map((line) => line.trim())
        .filter((entry) => entry !== '' && !entry.startsWith('#'));
}
|
|
48
|
+
/**
 * Case-insensitive comparator for the canonical dictionary sort.
 *
 * Both words are lowercased before comparison, so "Cairn" and "cairn" compare as equal. The
 * collation locale is pinned to 'en' rather than left to the runtime default: without an explicit
 * locale, localeCompare follows the host's locale, and the committed file's order could differ
 * between the environments that write it.
 *
 * @returns a negative number when `a` sorts first, a positive number when `b` sorts first, and 0
 *          when the two words are equal ignoring case.
 */
function byWord(a, b) {
    return a.toLowerCase().localeCompare(b.toLowerCase(), 'en');
}
|
|
53
|
+
/**
 * Union `additions` into `existing` and return the canonical word list: deduplicated and sorted
 * with byWord.
 *
 * Words are keyed by their lowercased form, so the merge is case-insensitive, and the first
 * casing seen wins: `existing` is walked before `additions`, so an existing "Cairn" is kept over
 * an added "cairn". Any word that fails isValidDictionaryWord is skipped, as a backstop to the
 * caller's own validation. Because the result is a sorted union, merging the same additions into
 * a different starting list of the same words yields the same set.
 *
 * @param existing  The words already in the dictionary.
 * @param additions The words to add.
 * @returns A new sorted, deduplicated array; neither input is modified.
 */
export function mergeDictionaryWords(existing, additions) {
    const firstSeen = new Map();
    for (const source of [existing, additions]) {
        for (const candidate of source) {
            if (!isValidDictionaryWord(candidate)) {
                continue;
            }
            const folded = candidate.toLowerCase();
            if (!firstSeen.has(folded)) {
                firstSeen.set(folded, candidate);
            }
        }
    }
    return Array.from(firstSeen.values()).sort(byWord);
}
|
|
72
|
+
/**
 * Render a word list as the canonical dictionary file text.
 *
 * The words are first passed through mergeDictionaryWords (with no additions), so an unsorted or
 * duplicate-bearing input still produces the canonical order. The output is the HEADER comment
 * line followed by one word per line, with a trailing newline. An empty list serializes to the
 * header line alone, so the file stays a recognizable dictionary.
 *
 * @param words The words to write.
 * @returns The complete file text.
 */
export function serializeDictionary(words) {
    const canonical = mergeDictionaryWords(words, []);
    const lines = [HEADER].concat(canonical);
    return `${lines.join('\n')}\n`;
}
|
package/dist/content/types.d.ts
CHANGED
|
@@ -238,6 +238,11 @@ export interface CairnAdapter {
|
|
|
238
238
|
/** Repo-relative path to the committed media manifest. Defaults to src/content/.cairn/media.json,
|
|
239
239
|
* applied in composeRuntime. Sits outside any concept directory, like the content manifest. */
|
|
240
240
|
mediaManifestPath?: string;
|
|
241
|
+
/** Repo-relative path to the committed personal dictionary file. Defaults to
|
|
242
|
+
* src/content/.cairn/dictionary.txt, applied in composeRuntime: the same `.cairn/` content root the
|
|
243
|
+
* manifests use, so the spec's `content/.cairn/dictionary.txt` resolves the same configurable way the
|
|
244
|
+
* manifest paths do. One word per line, sorted, comment lines allowed (see site-dictionary.ts). */
|
|
245
|
+
dictionaryPath?: string;
|
|
241
246
|
/** Directive component registry; the renderer and the future palette derive from it (seam 3). */
|
|
242
247
|
registry?: ComponentRegistry;
|
|
243
248
|
/** The site's glyph name to SVG path-data map, for the admin icon picker and the renderer. */
|
|
@@ -345,6 +350,13 @@ export interface CairnRuntime {
|
|
|
345
350
|
manifestPath: string;
|
|
346
351
|
/** The repo-relative path to the committed media manifest, defaulted in composeRuntime. */
|
|
347
352
|
mediaManifestPath: string;
|
|
353
|
+
/** The repo-relative path to the committed personal dictionary file (one word per line, sorted),
|
|
354
|
+
* defaulted in composeRuntime to src/content/.cairn/dictionary.txt: the same `.cairn/` content root
|
|
355
|
+
* the manifests use. The edit load reads it and threads its words onto EditData; the
|
|
356
|
+
* addDictionaryWord action reads, merges, and commits it. Optional on the runtime so a hand-built
|
|
357
|
+
* runtime need not set it; composeRuntime always fills it, and the edit load and the action default
|
|
358
|
+
* a missing value to the same content-root path. */
|
|
359
|
+
dictionaryPath?: string;
|
|
348
360
|
/** The adapter's asset config resolved once at compose: `{ enabled: false }` for a no-media site,
|
|
349
361
|
* otherwise the filled config the upload, storage, delivery, and resolver paths read. */
|
|
350
362
|
resolvedAssets: import('../media/config.js').ResolvedAssetConfig;
|
|
@@ -355,6 +367,19 @@ export interface CairnRuntime {
|
|
|
355
367
|
/** The live site's content styling for the preview frame; passed through from the adapter. */
|
|
356
368
|
preview?: PreviewConfig;
|
|
357
369
|
assets?: AssetConfig;
|
|
370
|
+
/** The editor's spellcheck dictionary file, resolved once at compose from the site config's
|
|
371
|
+
* `spellcheck.dialect` (defaulting to US English). The edit load threads it onto EditData and the
|
|
372
|
+
* editor resolves it to a real asset URL on the main thread, so the Worker receives the URL and
|
|
373
|
+
* never reads config. Just the filename, e.g. "dictionary-en-us.txt". Optional on the runtime so a
|
|
374
|
+
* hand-built runtime need not set it; composeRuntime always fills it, and the edit load defaults a
|
|
375
|
+
* missing value to the US English dictionary. */
|
|
376
|
+
spellcheckDictionary?: string;
|
|
377
|
+
/** The editor tidy (LLM copy-edit) settings, passed through from the site config. Optional on the
|
|
378
|
+
* runtime so a hand-built runtime need not set it; composeRuntime threads it from
|
|
379
|
+
* `siteConfig.tidy`. The tidy action reads `enabled` and `model` at call time, and builds its prompt
|
|
380
|
+
* from `conventions`. Absent (or `enabled` false) means tidy is off, and the action refuses with a
|
|
381
|
+
* fail(503) before any model call. */
|
|
382
|
+
tidy?: import('../nav/site-config.js').TidyConfig;
|
|
358
383
|
/** Admin panels contributed by extensions (Mode 2). Empty until Plan 09 wires the dispatch route. */
|
|
359
384
|
adminPanels?: AdminPanel[];
|
|
360
385
|
/** Field types contributed by extensions (Mode 2). Empty until Plan 09 wires the form dispatch. */
|
|
@@ -129,17 +129,21 @@ export const configPublicOrigin = {
|
|
|
129
129
|
// evaluate, so the check probes the conventional spots instead (the repo root and the two
|
|
130
130
|
// src locations the production sites use).
|
|
131
131
|
const SITE_CONFIG_PATHS = ['site.config.yaml', 'src/lib/site.config.yaml', 'src/site.config.yaml'];
|
|
132
|
+
// Read the first site.config.yaml that exists in a conventional spot, or null when none does.
|
|
133
|
+
async function readSiteConfigText(ctx) {
|
|
134
|
+
for (const path of SITE_CONFIG_PATHS) {
|
|
135
|
+
const text = await ctx.readFile(path);
|
|
136
|
+
if (text !== null)
|
|
137
|
+
return text;
|
|
138
|
+
}
|
|
139
|
+
return null;
|
|
140
|
+
}
|
|
132
141
|
export const configSiteConfig = {
|
|
133
142
|
id: 'config.site-config',
|
|
134
143
|
conditionId: 'config.site-config-invalid',
|
|
135
144
|
title: 'Site config',
|
|
136
145
|
async run(ctx) {
|
|
137
|
-
|
|
138
|
-
for (const path of SITE_CONFIG_PATHS) {
|
|
139
|
-
text = await ctx.readFile(path);
|
|
140
|
-
if (text !== null)
|
|
141
|
-
break;
|
|
142
|
-
}
|
|
146
|
+
const text = await readSiteConfigText(ctx);
|
|
143
147
|
if (text === null)
|
|
144
148
|
return skip(`no site.config.yaml found (looked in ${SITE_CONFIG_PATHS.join(', ')})`);
|
|
145
149
|
try {
|
|
@@ -157,3 +161,48 @@ export const configSiteConfig = {
|
|
|
157
161
|
}
|
|
158
162
|
},
|
|
159
163
|
};
|
|
164
|
+
// A site enables tidy with `tidy.enabled: true` in the committed config. An invalid config is one the rest of
|
|
165
|
+
// the doctor reports through configSiteConfig, so a parse error here just skips rather than doubling
|
|
166
|
+
// the failure.
|
|
167
|
+
function tidyEnabled(text) {
|
|
168
|
+
let config;
|
|
169
|
+
try {
|
|
170
|
+
config = parseSiteConfig(text);
|
|
171
|
+
}
|
|
172
|
+
catch {
|
|
173
|
+
return false;
|
|
174
|
+
}
|
|
175
|
+
return config.tidy?.enabled === true;
|
|
176
|
+
}
|
|
177
|
+
// The Anthropic key is a Worker secret, so the doctor cannot prove it is unset (it is in neither the
|
|
178
|
+
// committed wrangler config nor anything readFile reaches). It CAN read the two spots where a key would
|
|
179
|
+
// appear if set as a plain var: the wrangler config text and .dev.vars. A bare presence-by-name read
|
|
180
|
+
// is enough for the heuristic; the runtime fail(503) and --probe are the real truth checks.
|
|
181
|
+
function keyAppearsIn(text) {
|
|
182
|
+
return text !== null && text.includes('ANTHROPIC_API_KEY');
|
|
183
|
+
}
|
|
184
|
+
// The tidy secret heuristic. It reuses the config.bindings-missing condition rather than registering a
|
|
185
|
+
// new one, so the readiness count holds (the same pattern configMediaBucket uses). A fail here is not a
|
|
186
|
+
// definitive unset claim: it asks the operator to verify the secret, since a wrangler secret is
|
|
187
|
+
// invisible to the CLI.
|
|
188
|
+
export const configTidyKey = {
|
|
189
|
+
id: 'config.tidy-key',
|
|
190
|
+
conditionId: 'config.bindings-missing',
|
|
191
|
+
title: 'Tidy API key',
|
|
192
|
+
async run(ctx) {
|
|
193
|
+
const text = await readSiteConfigText(ctx);
|
|
194
|
+
if (text === null)
|
|
195
|
+
return skip('no site.config.yaml found, so tidy enablement is unknown');
|
|
196
|
+
if (!tidyEnabled(text))
|
|
197
|
+
return skip('tidy is not enabled in the site config');
|
|
198
|
+
const wrangler = (await ctx.readFile('wrangler.jsonc')) ?? (await ctx.readFile('wrangler.toml'));
|
|
199
|
+
if (keyAppearsIn(wrangler)) {
|
|
200
|
+
return pass('ANTHROPIC_API_KEY appears in the wrangler vars (verify it is the real key, not a placeholder)');
|
|
201
|
+
}
|
|
202
|
+
const devVars = await ctx.readFile('.dev.vars');
|
|
203
|
+
if (keyAppearsIn(devVars)) {
|
|
204
|
+
return pass('ANTHROPIC_API_KEY appears in .dev.vars (the local override; verify the Worker secret is set for production)');
|
|
205
|
+
}
|
|
206
|
+
return fail('tidy is enabled but ANTHROPIC_API_KEY is in neither the wrangler vars nor .dev.vars; verify the secret is configured with wrangler secret put ANTHROPIC_API_KEY');
|
|
207
|
+
},
|
|
208
|
+
};
|
package/dist/doctor/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { configBindings, configMediaBucket, configObservability, configCsrfDisable, configSiteConfig, configPublicOrigin, } from './checks-local.js';
|
|
1
|
+
import { configBindings, configMediaBucket, configObservability, configCsrfDisable, configSiteConfig, configPublicOrigin, configTidyKey, } from './checks-local.js';
|
|
2
2
|
import { configDependencyFloors } from './check-floors.js';
|
|
3
3
|
import { emailSenderOnboarded, edgeHttpsForced, edgeHsts, authStore } from './checks-cloudflare.js';
|
|
4
4
|
import { githubApp } from './checks-github.js';
|
|
@@ -108,6 +108,7 @@ export function defaultChecks() {
|
|
|
108
108
|
configCsrfDisable,
|
|
109
109
|
configSiteConfig,
|
|
110
110
|
configPublicOrigin,
|
|
111
|
+
configTidyKey,
|
|
111
112
|
configDependencyFloors,
|
|
112
113
|
emailSenderOnboarded,
|
|
113
114
|
edgeHttpsForced,
|
package/dist/log/events.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export type CairnLogEvent = 'auth.link.requested' | 'auth.link.send_failed' | 'auth.token.minted' | 'auth.token.confirmed' | 'auth.session.created' | 'auth.session.destroyed' | 'commit.succeeded' | 'commit.failed' | 'config.invalid' | 'entry.published' | 'entry.discarded' | 'publish.failed' | 'github.unreachable' | 'guard.rejected' | 'media.uploaded' | 'media.upload_failed' | 'media.delivery_failed' | 'media.orphan_reconcile' | 'media.resolve_missing' | 'media.deleted' | 'media.delete_blocked' | 'media.replaced' | 'media.replace_blocked' | 'media.alt_propagated';
|
|
1
|
+
export type CairnLogEvent = 'auth.link.requested' | 'auth.link.send_failed' | 'auth.token.minted' | 'auth.token.confirmed' | 'auth.session.created' | 'auth.session.destroyed' | 'commit.succeeded' | 'commit.failed' | 'config.invalid' | 'entry.published' | 'entry.discarded' | 'publish.failed' | 'github.unreachable' | 'guard.rejected' | 'media.uploaded' | 'media.upload_failed' | 'media.delivery_failed' | 'media.orphan_reconcile' | 'media.resolve_missing' | 'media.deleted' | 'media.delete_blocked' | 'media.bulk_deleted' | 'media.orphans_purged' | 'media.replaced' | 'media.replace_blocked' | 'media.alt_propagated' | 'dictionary.added' | 'dictionary.add_conflict' | 'tidy.done' | 'tidy.error' | 'tidy.refused' | 'tidy.empty';
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { UsageEntry, UsageIndex } from './usage.js';
|
|
2
|
+
import type { MediaManifest } from './manifest.js';
|
|
3
|
+
/** One selected hash that is not deleted, with the reason and (for the where-used) its usage rows. The rows
|
|
4
|
+
* are present only for 'still-referenced'; an 'uncommitted' skip carries an empty list. */
|
|
5
|
+
export interface BulkDeleteSkip {
|
|
6
|
+
hash: string;
|
|
7
|
+
reason: 'still-referenced' | 'uncommitted';
|
|
8
|
+
usage: UsageEntry[];
|
|
9
|
+
}
|
|
10
|
+
/** The partitioned selection: the hashes safe to purge and the hashes held back. Both arrays keep the
|
|
11
|
+
* input order of `selected` so the screen reports them in the order the user picked. */
|
|
12
|
+
export interface BulkDeletePlan {
|
|
13
|
+
deletable: string[];
|
|
14
|
+
skipped: BulkDeleteSkip[];
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Partition `selected` against a strict usage index and the media manifest.
|
|
18
|
+
*
|
|
19
|
+
* A hash with one or more usage rows is skipped 'still-referenced', carrying those rows for the
|
|
20
|
+
* where-used. A hash with no usage row and no committed manifest row is skipped 'uncommitted', since
|
|
21
|
+
* there is nothing committed to delete. A hash with no usage row and a committed manifest row is
|
|
22
|
+
* deletable. The input order of `selected` is preserved in both output arrays.
|
|
23
|
+
*/
|
|
24
|
+
export declare function planBulkDelete(selected: string[], index: UsageIndex, manifest: MediaManifest): BulkDeletePlan;
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Partition `selected` against a strict usage index and the media manifest.
|
|
3
|
+
*
|
|
4
|
+
* A hash with one or more usage rows is skipped 'still-referenced', carrying those rows for the
|
|
5
|
+
* where-used. A hash with no usage row and no committed manifest row is skipped 'uncommitted', since
|
|
6
|
+
* there is nothing committed to delete. A hash with no usage row and a committed manifest row is
|
|
7
|
+
* deletable. The input order of `selected` is preserved in both output arrays.
|
|
8
|
+
*/
|
|
9
|
+
export function planBulkDelete(selected, index, manifest) {
|
|
10
|
+
const deletable = [];
|
|
11
|
+
const skipped = [];
|
|
12
|
+
for (const hash of selected) {
|
|
13
|
+
const usage = index.get(hash);
|
|
14
|
+
if (usage && usage.length > 0) {
|
|
15
|
+
skipped.push({ hash, reason: 'still-referenced', usage });
|
|
16
|
+
}
|
|
17
|
+
else if (manifest[hash]) {
|
|
18
|
+
deletable.push(hash);
|
|
19
|
+
}
|
|
20
|
+
else {
|
|
21
|
+
skipped.push({ hash, reason: 'uncommitted', usage: [] });
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
return { deletable, skipped };
|
|
25
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { type ReconcileResult } from './reconcile.js';
|
|
2
|
+
import type { MediaManifest } from './manifest.js';
|
|
3
|
+
import type { UsageEntry, UsageIndex } from './usage.js';
|
|
4
|
+
/** A purgeable orphan: a stored R2 key with no manifest row, plus the 16-hex hash parsed from it. */
|
|
5
|
+
export interface OrphanByteRow {
|
|
6
|
+
/** The full R2 object key, e.g. "media/ff/ffffffffffffffff.webp". */
|
|
7
|
+
key: string;
|
|
8
|
+
/** The 16-hex content hash parsed from the key. */
|
|
9
|
+
hash: string;
|
|
10
|
+
}
|
|
11
|
+
/** A broken reference: a manifest row whose bytes are gone. Read-only, since purging it would drop a
|
|
12
|
+
* still-referenced asset's record; the screen shows where it is used so an operator can re-ingest. */
|
|
13
|
+
export interface BrokenRefRow {
|
|
14
|
+
/** The 16-hex content hash of the manifest row whose bytes are missing. */
|
|
15
|
+
hash: string;
|
|
16
|
+
/** The manifest row's display slug, or '' when the row is somehow absent. */
|
|
17
|
+
slug: string;
|
|
18
|
+
/** Where the asset is referenced, from the usage index. Empty when no reference was found. */
|
|
19
|
+
usage: UsageEntry[];
|
|
20
|
+
}
|
|
21
|
+
/** The scan surface model: the two row sets the Library renders. */
|
|
22
|
+
export interface OrphanScan {
|
|
23
|
+
orphanedBytes: OrphanByteRow[];
|
|
24
|
+
brokenRefs: BrokenRefRow[];
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Project a reconcile read plus the usage index into the scan surface model.
|
|
28
|
+
*
|
|
29
|
+
* `orphanedBytes` come from `reconcile.orphanedObjects`: each key is parsed to its hash via the
|
|
30
|
+
* shared media-key grammar, and a key that does not match (so it is not a content-addressed media
|
|
31
|
+
* object) is skipped. A key whose hash the usage index references is also skipped: it is referenced
|
|
32
|
+
* on main or some open branch, so its bytes are in use, not orphaned. `brokenRefs` come from
|
|
33
|
+
* `reconcile.missingObjects`: each hash carries its
|
|
34
|
+
* manifest slug (falling back to '' when the row is absent) and its where-used rows from the index
|
|
35
|
+
* (an empty list when no reference was found). Both directions keep their input order.
|
|
36
|
+
*/
|
|
37
|
+
export declare function buildOrphanScan(reconcile: ReconcileResult, manifest: MediaManifest, index: UsageIndex): OrphanScan;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
// cairn-cms: the orphan-scan projection, the pure model behind the admin Media Library's scan
|
|
2
|
+
// surface. It folds reconcileMedia's two directions together with the usage index into the two row sets
|
|
3
|
+
// the screen renders: the purgeable byte-rows and the read-only broken-reference rows (manifest rows
|
|
4
|
+
// whose bytes are gone). It only projects; no path here reads R2, the manifest, or git. The module
|
|
5
|
+
// is engine-internal and on no public subpath.
|
|
6
|
+
//
|
|
7
|
+
// An orphaned byte is a stored R2 object whose hash has NO manifest row AND appears in NO usage row,
|
|
8
|
+
// so it is referenced nowhere across main and every open branch. Reconcile only checks main's
|
|
9
|
+
// manifest, so a branch-only upload (bytes in R2, manifest row only on the open cairn/* branch) gets
|
|
10
|
+
// flagged as an orphaned object even though a colleague's in-progress draft references it. The byte
|
|
11
|
+
// purge is irreversible, so we intersect reconcile's verdict with the strict cross-branch usage
|
|
12
|
+
// index here: any hash the index references is in use and is dropped from orphanedBytes, which keeps
|
|
13
|
+
// a live draft's bytes from ever reaching the purge surface.
|
|
14
|
+
import { MEDIA_KEY_RE } from './reconcile.js';
|
|
15
|
+
/**
|
|
16
|
+
* Project a reconcile read plus the usage index into the scan surface model.
|
|
17
|
+
*
|
|
18
|
+
* `orphanedBytes` come from `reconcile.orphanedObjects`: each key is parsed to its hash via the
|
|
19
|
+
* shared media-key grammar, and a key that does not match (so it is not a content-addressed media
|
|
20
|
+
* object) is skipped. A key whose hash the usage index references is also skipped: it is referenced
|
|
21
|
+
* on main or some open branch, so its bytes are in use, not orphaned. `brokenRefs` come from
|
|
22
|
+
* `reconcile.missingObjects`: each hash carries its
|
|
23
|
+
* manifest slug (falling back to '' when the row is absent) and its where-used rows from the index
|
|
24
|
+
* (an empty list when no reference was found). Both directions keep their input order.
|
|
25
|
+
*/
|
|
26
|
+
export function buildOrphanScan(reconcile, manifest, index) {
|
|
27
|
+
const orphanedBytes = [];
|
|
28
|
+
for (const key of reconcile.orphanedObjects) {
|
|
29
|
+
const hash = MEDIA_KEY_RE.exec(key)?.[1];
|
|
30
|
+
if (hash === undefined)
|
|
31
|
+
continue;
|
|
32
|
+
if (index.has(hash))
|
|
33
|
+
continue;
|
|
34
|
+
orphanedBytes.push({ key, hash });
|
|
35
|
+
}
|
|
36
|
+
const brokenRefs = reconcile.missingObjects.map((hash) => ({
|
|
37
|
+
hash,
|
|
38
|
+
slug: manifest[hash]?.slug ?? '',
|
|
39
|
+
usage: index.get(hash) ?? [],
|
|
40
|
+
}));
|
|
41
|
+
return { orphanedBytes, brokenRefs };
|
|
42
|
+
}
|
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
import type { MediaManifest } from './manifest.js';
|
|
2
|
+
/** A stored media object key parses to its short hash via `media/<aa>/<shortHash>.<ext>`. Exported so
|
|
3
|
+
* the orphan-scan projection derives the same hash from an orphaned key without a second grammar. */
|
|
4
|
+
export declare const MEDIA_KEY_RE: RegExp;
|
|
2
5
|
/** What a reconcile read found in either direction. `orphanedObjects` are stored R2 keys whose hash
|
|
3
6
|
* has no manifest row; `missingObjects` are manifest hashes with no stored object. */
|
|
4
7
|
export interface ReconcileResult {
|
package/dist/media/reconcile.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { log } from '../log/index.js';
|
|
2
|
-
/** A stored media object key parses to its short hash via `media/<aa>/<shortHash>.<ext>`.
|
|
3
|
-
|
|
2
|
+
/** A stored media object key parses to its short hash via `media/<aa>/<shortHash>.<ext>`. Exported so
|
|
3
|
+
* the orphan-scan projection derives the same hash from an orphaned key without a second grammar. */
|
|
4
|
+
export const MEDIA_KEY_RE = /^media\/[0-9a-f]{2}\/([0-9a-f]{16})\.[a-z0-9]{1,5}$/;
|
|
4
5
|
/** The pure core: compare the stored R2 keys against the manifest's content-hash keys and report
|
|
5
6
|
* both orphan directions. A stored key that does not match the media-key grammar is ignored, since
|
|
6
7
|
* it is not a content-addressed media object this reconcile owns. */
|