biblicus-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions as released to a supported registry; it is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- biblicus/__init__.py +28 -0
- biblicus/__main__.py +8 -0
- biblicus/backends/__init__.py +44 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +292 -0
- biblicus/backends/sqlite_full_text_search.py +427 -0
- biblicus/cli.py +468 -0
- biblicus/constants.py +10 -0
- biblicus/corpus.py +952 -0
- biblicus/evaluation.py +261 -0
- biblicus/frontmatter.py +92 -0
- biblicus/models.py +307 -0
- biblicus/retrieval.py +137 -0
- biblicus/sources.py +132 -0
- biblicus/time.py +18 -0
- biblicus/uris.py +64 -0
- biblicus-0.1.1.dist-info/METADATA +174 -0
- biblicus-0.1.1.dist-info/RECORD +22 -0
- biblicus-0.1.1.dist-info/WHEEL +5 -0
- biblicus-0.1.1.dist-info/entry_points.txt +2 -0
- biblicus-0.1.1.dist-info/licenses/LICENSE +21 -0
- biblicus-0.1.1.dist-info/top_level.txt +1 -0
biblicus/corpus.py
ADDED
```python
"""
Corpus storage and ingestion for Biblicus.
"""

from __future__ import annotations

import hashlib
import json
import mimetypes
import shutil
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence

import yaml

from .constants import CORPUS_DIR_NAME, DEFAULT_RAW_DIR, RUNS_DIR_NAME, SCHEMA_VERSION, SIDECAR_SUFFIX
from .frontmatter import parse_front_matter, render_front_matter
from .models import CatalogItem, CorpusCatalog, CorpusConfig, IngestResult, RetrievalRun
from .sources import load_source
from .time import utc_now_iso
from .uris import normalize_corpus_uri, corpus_ref_to_path


def _sha256_bytes(data: bytes) -> str:
    """
    Compute a Secure Hash Algorithm 256 digest for byte content.

    :param data: Input bytes.
    :type data: bytes
    :return: Secure Hash Algorithm 256 hex digest.
    :rtype: str
    """

    return hashlib.sha256(data).hexdigest()


def _sanitize_filename(name: str) -> str:
    """
    Sanitize a filename into a portable, filesystem-friendly form.

    :param name: Raw filename.
    :type name: str
    :return: Sanitized filename.
    :rtype: str
    """

    allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._() ")
    sanitized_name = "".join(
        (character if character in allowed_characters else "_") for character in name
    ).strip()
    return sanitized_name or "file"


def _preferred_extension_for_media_type(media_type: str) -> Optional[str]:
    """
    Return a preferred filename extension for a media type.

    :param media_type: Internet Assigned Numbers Authority media type.
    :type media_type: str
    :return: Preferred extension or None.
    :rtype: str or None
    """

    media_type_overrides = {
        "image/jpeg": ".jpg",
    }
    if media_type in media_type_overrides:
        return media_type_overrides[media_type]
    return mimetypes.guess_extension(media_type)


def _ensure_filename_extension(filename: str, *, media_type: str) -> str:
    """
    Ensure a usable filename extension for a media type.

    :param filename: Raw filename.
    :type filename: str
    :param media_type: Internet Assigned Numbers Authority media type.
    :type media_type: str
    :return: Filename with a compatible extension.
    :rtype: str
    """

    raw_name = filename.strip()

    if media_type == "text/markdown":
        if raw_name.lower().endswith((".md", ".markdown")):
            return raw_name
        return raw_name + ".md"

    ext = _preferred_extension_for_media_type(media_type)
    if not ext:
        return raw_name
    if raw_name.lower().endswith(ext.lower()):
        return raw_name
    return raw_name + ext


def _merge_tags(explicit: Sequence[str], from_frontmatter: Any) -> List[str]:
    """
    Merge tags from explicit input and front matter values.

    :param explicit: Explicit tags provided by callers.
    :type explicit: Sequence[str]
    :param from_frontmatter: Tags from front matter.
    :type from_frontmatter: Any
    :return: Deduplicated tag list preserving order.
    :rtype: list[str]
    """

    merged_tags: List[str] = []

    for explicit_tag in explicit:
        cleaned_tag = explicit_tag.strip()
        if cleaned_tag:
            merged_tags.append(cleaned_tag)

    if isinstance(from_frontmatter, str):
        merged_tags.append(from_frontmatter)
    elif isinstance(from_frontmatter, list):
        for item in from_frontmatter:
            if isinstance(item, str) and item.strip():
                merged_tags.append(item.strip())

    seen_tags = set()
    deduplicated_tags: List[str] = []
    for tag_value in merged_tags:
        if tag_value not in seen_tags:
            seen_tags.add(tag_value)
            deduplicated_tags.append(tag_value)
    return deduplicated_tags


def _sidecar_path_for(content_path: Path) -> Path:
    """
    Compute the sidecar metadata path for a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Sidecar path.
    :rtype: Path
    """

    return content_path.with_name(content_path.name + SIDECAR_SUFFIX)


def _load_sidecar(content_path: Path) -> Dict[str, Any]:
    """
    Load sidecar metadata for a content file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :return: Parsed sidecar metadata.
    :rtype: dict[str, Any]
    :raises ValueError: If the sidecar content is not a mapping.
    """

    path = _sidecar_path_for(content_path)
    if not path.is_file():
        return {}
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"Sidecar metadata must be a mapping/object: {path}")
    return dict(data)


def _write_sidecar(content_path: Path, metadata: Dict[str, Any]) -> None:
    """
    Write a sidecar metadata file.

    :param content_path: Path to the content file.
    :type content_path: Path
    :param metadata: Metadata to serialize.
    :type metadata: dict[str, Any]
    :return: None.
    :rtype: None
    """
    path = _sidecar_path_for(content_path)
    text = yaml.safe_dump(
        metadata,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    ).strip()
    path.write_text(text + "\n", encoding="utf-8")


def _ensure_biblicus_block(metadata: Dict[str, Any], *, item_id: str, source_uri: str) -> Dict[str, Any]:
    """
    Ensure the biblicus metadata block exists and is populated.

    :param metadata: Existing metadata.
    :type metadata: dict[str, Any]
    :param item_id: Item identifier to store.
    :type item_id: str
    :param source_uri: Source uniform resource identifier to store.
    :type source_uri: str
    :return: Updated metadata mapping.
    :rtype: dict[str, Any]
    """
    updated_metadata = dict(metadata)
    existing_biblicus = updated_metadata.get("biblicus")
    if not isinstance(existing_biblicus, dict):
        existing_biblicus = {}
    biblicus_block = dict(existing_biblicus)
    biblicus_block["id"] = item_id
    biblicus_block["source"] = source_uri
    updated_metadata["biblicus"] = biblicus_block
    return updated_metadata


def _parse_uuid_prefix(filename: str) -> Optional[str]:
    """
    Extract a universally unique identifier prefix from a filename, if present.

    :param filename: Filename to inspect.
    :type filename: str
    :return: Universally unique identifier string or None.
    :rtype: str or None
    """
    if len(filename) < 36:
        return None
    prefix = filename[:36]
    try:
        return str(uuid.UUID(prefix))
    except ValueError:
        return None


def _merge_metadata(front: Dict[str, Any], side: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge front matter and sidecar metadata.

    :param front: Front matter metadata.
    :type front: dict[str, Any]
    :param side: Sidecar metadata.
    :type side: dict[str, Any]
    :return: Merged metadata.
    :rtype: dict[str, Any]
    """
    merged_metadata: Dict[str, Any] = dict(front)

    front_biblicus = merged_metadata.get("biblicus")
    sidecar_biblicus = side.get("biblicus")
    if isinstance(front_biblicus, dict) or isinstance(sidecar_biblicus, dict):
        merged_biblicus: Dict[str, Any] = {}
        if isinstance(front_biblicus, dict):
            merged_biblicus.update(front_biblicus)
        if isinstance(sidecar_biblicus, dict):
            merged_biblicus.update(sidecar_biblicus)
        merged_metadata["biblicus"] = merged_biblicus

    merged_tags = _merge_tags(_merge_tags([], front.get("tags")), side.get("tags"))
    if merged_tags:
        merged_metadata["tags"] = merged_tags

    for metadata_key, metadata_value in side.items():
        if metadata_key in {"biblicus", "tags"}:
            continue
        merged_metadata[metadata_key] = metadata_value

    return merged_metadata


class Corpus:
    """
    Local corpus manager for Biblicus.

    :ivar root: Corpus root directory.
    :vartype root: Path
    :ivar meta_dir: Metadata directory under the corpus root.
    :vartype meta_dir: Path
    :ivar raw_dir: Raw item directory under the corpus root.
    :vartype raw_dir: Path
    :ivar config: Parsed corpus config, if present.
    :vartype config: CorpusConfig or None
    """

    def __init__(self, root: Path):
        """
        Initialize a corpus wrapper around a filesystem path.

        :param root: Corpus root directory.
        :type root: Path
        """

        self.root = root
        self.meta_dir = self.root / CORPUS_DIR_NAME
        self.raw_dir = self.root / DEFAULT_RAW_DIR
        self.config = self._load_config()

    @property
    def uri(self) -> str:
        """
        Return the canonical uniform resource identifier for the corpus root.

        :return: Corpus uniform resource identifier.
        :rtype: str
        """

        return self.root.as_uri()

    def _load_config(self) -> Optional[CorpusConfig]:
        """
        Load the corpus config if it exists.

        :return: Parsed corpus config or None.
        :rtype: CorpusConfig or None
        :raises ValueError: If the config schema is invalid.
        """

        path = self.meta_dir / "config.json"
        if not path.is_file():
            return None
        data = json.loads(path.read_text(encoding="utf-8"))
        return CorpusConfig.model_validate(data)

    @classmethod
    def find(cls, start: Path) -> "Corpus":
        """
        Locate a corpus by searching upward from a path.

        :param start: Starting path to search.
        :type start: Path
        :return: Located corpus instance.
        :rtype: Corpus
        :raises FileNotFoundError: If no corpus config is found.
        """

        start = start.resolve()
        for candidate in [start, *start.parents]:
            if (candidate / CORPUS_DIR_NAME / "config.json").is_file():
                return cls(candidate)
        raise FileNotFoundError(
            f"Not a Biblicus corpus (no {CORPUS_DIR_NAME}/config.json found from {start})"
        )

    @classmethod
    def open(cls, ref: str | Path) -> "Corpus":
        """
        Open a corpus from a path or uniform resource identifier reference.

        :param ref: Filesystem path or file:// uniform resource identifier.
        :type ref: str or Path
        :return: Opened corpus instance.
        :rtype: Corpus
        """

        return cls.find(corpus_ref_to_path(ref))

    @classmethod
    def init(cls, root: Path, *, force: bool = False) -> "Corpus":
        """
        Initialize a new corpus on disk.

        :param root: Corpus root directory.
        :type root: Path
        :param force: Whether to overwrite existing config.
        :type force: bool
        :return: Initialized corpus instance.
        :rtype: Corpus
        :raises FileExistsError: If the corpus already exists and force is False.
        """

        root = root.resolve()
        corpus = cls(root)

        corpus.meta_dir.mkdir(parents=True, exist_ok=True)
        corpus.raw_dir.mkdir(parents=True, exist_ok=True)

        config_path = corpus.meta_dir / "config.json"
        if config_path.exists() and not force:
            raise FileExistsError(f"Corpus already exists at {root}")

        config = CorpusConfig(
            schema_version=SCHEMA_VERSION,
            created_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(root),
            raw_dir=DEFAULT_RAW_DIR,
        )
        config_path.write_text(config.model_dump_json(indent=2) + "\n", encoding="utf-8")

        corpus._init_catalog()
        return corpus

    @property
    def catalog_path(self) -> Path:
        """
        Return the path to the corpus catalog file.

        :return: Catalog file path.
        :rtype: Path
        """

        return self.meta_dir / "catalog.json"

    def _init_catalog(self) -> None:
        """
        Initialize the catalog if it does not already exist.

        :return: None.
        :rtype: None
        """

        if self.catalog_path.exists():
            return
        catalog = CorpusCatalog(
            schema_version=SCHEMA_VERSION,
            generated_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(self.root),
            raw_dir=DEFAULT_RAW_DIR,
            latest_run_id=None,
            items={},
            order=[],
        )
        self._write_catalog(catalog)

    def _load_catalog(self) -> CorpusCatalog:
        """
        Read and validate the corpus catalog file.

        :return: Parsed corpus catalog.
        :rtype: CorpusCatalog
        :raises FileNotFoundError: If the catalog file does not exist.
        :raises ValueError: If the catalog schema is invalid.
        """

        if not self.catalog_path.is_file():
            raise FileNotFoundError(f"Missing corpus catalog: {self.catalog_path}")
        catalog_data = json.loads(self.catalog_path.read_text(encoding="utf-8"))
        return CorpusCatalog.model_validate(catalog_data)

    def load_catalog(self) -> CorpusCatalog:
        """
        Load the current corpus catalog.

        :return: Parsed corpus catalog.
        :rtype: CorpusCatalog
        :raises FileNotFoundError: If the catalog file does not exist.
        :raises ValueError: If the catalog schema is invalid.
        """

        return self._load_catalog()

    def _write_catalog(self, catalog: CorpusCatalog) -> None:
        """
        Atomically write a corpus catalog to disk.

        :param catalog: Catalog to persist.
        :type catalog: CorpusCatalog
        :return: None.
        :rtype: None
        """

        temp_path = self.catalog_path.with_suffix(".json.tmp")
        temp_path.write_text(catalog.model_dump_json(indent=2) + "\n", encoding="utf-8")
        temp_path.replace(self.catalog_path)

    @property
    def runs_dir(self) -> Path:
        """
        Location of retrieval run manifests.

        :return: Path to the runs directory.
        :rtype: Path
        """

        return self.meta_dir / RUNS_DIR_NAME

    def _ensure_runs_dir(self) -> None:
        """
        Ensure the retrieval runs directory exists.

        :return: None.
        :rtype: None
        """

        self.runs_dir.mkdir(parents=True, exist_ok=True)

    def write_run(self, run: RetrievalRun) -> None:
        """
        Persist a retrieval run manifest and update the catalog pointer.

        :param run: Run manifest to persist.
        :type run: RetrievalRun
        :return: None.
        :rtype: None
        """

        self._ensure_runs_dir()
        path = self.runs_dir / f"{run.run_id}.json"
        path.write_text(run.model_dump_json(indent=2) + "\n", encoding="utf-8")
        catalog = self._load_catalog()
        catalog.latest_run_id = run.run_id
        catalog.generated_at = utc_now_iso()
        self._write_catalog(catalog)

    def load_run(self, run_id: str) -> RetrievalRun:
        """
        Load a retrieval run manifest by identifier.

        :param run_id: Run identifier.
        :type run_id: str
        :return: Parsed run manifest.
        :rtype: RetrievalRun
        :raises FileNotFoundError: If the run manifest does not exist.
        """

        path = self.runs_dir / f"{run_id}.json"
        if not path.is_file():
            raise FileNotFoundError(f"Missing run manifest: {path}")
        data = json.loads(path.read_text(encoding="utf-8"))
        return RetrievalRun.model_validate(data)

    @property
    def latest_run_id(self) -> Optional[str]:
        """
        Latest retrieval run identifier recorded in the catalog.

        :return: Latest run identifier or None.
        :rtype: str or None
        """

        return self._load_catalog().latest_run_id

    def _upsert_catalog_item(self, item: CatalogItem) -> None:
        """
        Upsert a catalog item and reset the latest run pointer.

        :param item: Catalog item to insert or update.
        :type item: CatalogItem
        :return: None.
        :rtype: None
        """

        self._init_catalog()
        catalog = self._load_catalog()
        catalog.items[item.id] = item

        ordered_ids = [item_id for item_id in catalog.order if item_id != item.id]
        ordered_ids.insert(0, item.id)
        catalog.order = ordered_ids
        catalog.generated_at = utc_now_iso()
        catalog.latest_run_id = None

        self._write_catalog(catalog)

    def ingest_item(
        self,
        data: bytes,
        *,
        filename: Optional[str] = None,
        media_type: str = "application/octet-stream",
        title: Optional[str] = None,
        tags: Sequence[str] = (),
        metadata: Optional[Dict[str, Any]] = None,
        source_uri: str = "unknown",
    ) -> IngestResult:
        """
        Ingest a single raw item into the corpus.

        This is the modality-neutral primitive: callers provide bytes + a media type.
        Higher-level conveniences (ingest_note, ingest_source, and related methods) build on top.

        :param data: Raw item bytes.
        :type data: bytes
        :param filename: Optional filename for the stored item.
        :type filename: str or None
        :param media_type: Internet Assigned Numbers Authority media type for the item.
        :type media_type: str
        :param title: Optional title metadata.
        :type title: str or None
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param metadata: Optional metadata mapping.
        :type metadata: dict[str, Any] or None
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        :raises ValueError: If markdown is not Unicode Transformation Format 8.
        """

        item_id = str(uuid.uuid4())
        safe_filename = _sanitize_filename(filename) if filename else ""

        if safe_filename:
            safe_filename = _ensure_filename_extension(safe_filename, media_type=media_type)

        if media_type == "text/markdown":
            output_name = f"{item_id}--{safe_filename}" if safe_filename else f"{item_id}.md"
        else:
            if safe_filename:
                output_name = f"{item_id}--{safe_filename}"
            else:
                extension = _preferred_extension_for_media_type(media_type) or ""
                output_name = f"{item_id}{extension}" if extension else f"{item_id}"

        relpath = str(Path(DEFAULT_RAW_DIR) / output_name)
        output_path = self.root / relpath

        resolved_title = title.strip() if isinstance(title, str) and title.strip() else None
        resolved_tags = list(tags)
        metadata_input: Dict[str, Any] = dict(metadata or {})
        if resolved_title and "title" not in metadata_input:
            metadata_input["title"] = resolved_title
        if resolved_tags and "tags" not in metadata_input:
            metadata_input["tags"] = list(resolved_tags)

        frontmatter: Dict[str, Any] = {}

        if media_type == "text/markdown":
            try:
                markdown_text = data.decode("utf-8")
            except UnicodeDecodeError as decode_error:
                raise ValueError("Markdown must be Unicode Transformation Format 8") from decode_error

            parsed_document = parse_front_matter(markdown_text)
            frontmatter = dict(parsed_document.metadata)

            merged_tags = _merge_tags(resolved_tags, frontmatter.get("tags"))
            if merged_tags:
                frontmatter["tags"] = merged_tags
                resolved_tags = merged_tags

            if resolved_title and not (
                isinstance(frontmatter.get("title"), str) and frontmatter.get("title").strip()
            ):
                frontmatter["title"] = resolved_title

            title_value = frontmatter.get("title")
            if isinstance(title_value, str) and title_value.strip():
                resolved_title = title_value.strip()

            frontmatter = _ensure_biblicus_block(frontmatter, item_id=item_id, source_uri=source_uri)
            rendered_document = render_front_matter(frontmatter, parsed_document.body)
            data_to_write = rendered_document.encode("utf-8")
        else:
            data_to_write = data

        sha256_digest = _sha256_bytes(data_to_write)
        output_path.write_bytes(data_to_write)

        if media_type != "text/markdown":
            sidecar: Dict[str, Any] = {}
            sidecar["media_type"] = media_type
            if resolved_tags:
                sidecar["tags"] = resolved_tags
            if metadata_input:
                for metadata_key, metadata_value in metadata_input.items():
                    if metadata_key in {"tags", "biblicus"}:
                        continue
                    sidecar[metadata_key] = metadata_value
            sidecar["biblicus"] = {"id": item_id, "source": source_uri}
            _write_sidecar(output_path, sidecar)
            frontmatter = sidecar

        created_at = utc_now_iso()
        item_record = CatalogItem(
            id=item_id,
            relpath=relpath,
            sha256=sha256_digest,
            bytes=len(data_to_write),
            media_type=media_type,
            title=resolved_title,
            tags=list(resolved_tags),
            metadata=dict(frontmatter or {}),
            created_at=created_at,
            source_uri=source_uri,
        )
        self._upsert_catalog_item(item_record)

        return IngestResult(item_id=item_id, relpath=relpath, sha256=sha256_digest)

    def ingest_note(
        self,
        text: str,
        *,
        title: Optional[str] = None,
        tags: Sequence[str] = (),
        source_uri: str = "text",
    ) -> IngestResult:
        """
        Ingest a text note as Markdown.

        :param text: Note content.
        :type text: str
        :param title: Optional title metadata.
        :type title: str or None
        :param tags: Tags to associate with the note.
        :type tags: Sequence[str]
        :param source_uri: Source uniform resource identifier for provenance.
        :type source_uri: str
        :return: Ingestion result summary.
        :rtype: IngestResult
        """

        data = text.encode("utf-8")
        return self.ingest_item(
            data,
            filename=None,
            media_type="text/markdown",
            title=title,
            tags=tags,
            metadata=None,
            source_uri=source_uri,
        )

    def ingest_source(
        self,
        source: str | Path,
        *,
        tags: Sequence[str] = (),
        source_uri: Optional[str] = None,
    ) -> IngestResult:
        """
        Ingest a file path or uniform resource locator source.

        :param source: File path or uniform resource locator.
        :type source: str or Path
        :param tags: Tags to associate with the item.
        :type tags: Sequence[str]
        :param source_uri: Optional override for the source uniform resource identifier.
        :type source_uri: str or None
        :return: Ingestion result summary.
        :rtype: IngestResult
        """

        payload = load_source(source, source_uri=source_uri)
        return self.ingest_item(
            payload.data,
            filename=payload.filename,
            media_type=payload.media_type,
            title=None,
            tags=tags,
            metadata=None,
            source_uri=payload.source_uri,
        )

    def list_items(self, *, limit: int = 50) -> List[CatalogItem]:
        """
        List items from the catalog.

        :param limit: Maximum number of items to return.
        :type limit: int
        :return: Catalog items ordered by recency.
        :rtype: list[CatalogItem]
        """

        catalog = self._load_catalog()
        ordered_ids = (
            catalog.order[:limit] if catalog.order else list(catalog.items.keys())[:limit]
        )
        collected_items: List[CatalogItem] = []
        for item_id in ordered_ids:
            item = catalog.items.get(item_id)
            if item is not None:
                collected_items.append(item)
        return collected_items

    def get_item(self, item_id: str) -> CatalogItem:
        """
        Fetch a catalog item by identifier.

        :param item_id: Item identifier.
        :type item_id: str
        :return: Catalog item.
        :rtype: CatalogItem
        :raises KeyError: If the item identifier is unknown.
        """

        catalog = self._load_catalog()
        item = catalog.items.get(item_id)
        if item is None:
            raise KeyError(f"Unknown item identifier: {item_id}")
        return item

    def reindex(self) -> Dict[str, int]:
        """
        Rebuild/refresh the corpus catalog from the current on-disk corpus contents.

        This is the core "mutable corpus with re-indexing" loop: edit raw files or sidecars,
        then reindex to refresh the derived catalog.

        :return: Reindex statistics.
        :rtype: dict[str, int]
        :raises ValueError: If a markdown file cannot be decoded as Unicode Transformation Format 8.
        """

        self._init_catalog()
        existing_catalog = self._load_catalog()
        stats = {"scanned": 0, "skipped": 0, "inserted": 0, "updated": 0}

        content_files = [
            content_path
            for content_path in self.raw_dir.rglob("*")
            if content_path.is_file() and not content_path.name.endswith(SIDECAR_SUFFIX)
        ]

        new_items: Dict[str, CatalogItem] = {}

        for content_path in content_files:
            stats["scanned"] += 1
            relpath = str(content_path.relative_to(self.root))
            data = content_path.read_bytes()
            sha256 = _sha256_bytes(data)

            media_type, _ = mimetypes.guess_type(content_path.name)
            media_type = media_type or "application/octet-stream"

            sidecar = _load_sidecar(content_path)

            frontmatter: Dict[str, Any] = {}
            if content_path.suffix.lower() in {".md", ".markdown"}:
                try:
                    text = data.decode("utf-8")
                except UnicodeDecodeError as decode_error:
                    raise ValueError(
                        f"Markdown file must be Unicode Transformation Format 8: {relpath}"
                    ) from decode_error
                parsed_document = parse_front_matter(text)
                frontmatter = parsed_document.metadata
                media_type = "text/markdown"

            merged_metadata = _merge_metadata(frontmatter, sidecar)

            if media_type != "text/markdown":
                media_type_override = merged_metadata.get("media_type")
                if isinstance(media_type_override, str) and media_type_override.strip():
                    media_type = media_type_override.strip()

            item_id: Optional[str] = None
            biblicus_block = merged_metadata.get("biblicus")
            if isinstance(biblicus_block, dict):
                biblicus_id = biblicus_block.get("id")
                if isinstance(biblicus_id, str):
                    try:
                        item_id = str(uuid.UUID(biblicus_id))
                    except ValueError:
                        item_id = None

            if item_id is None:
                item_id = _parse_uuid_prefix(content_path.name)

            if item_id is None:
                stats["skipped"] += 1
                continue

            title: Optional[str] = None
            title_value = merged_metadata.get("title")
            if isinstance(title_value, str) and title_value.strip():
                title = title_value.strip()

            resolved_tags = _merge_tags([], merged_metadata.get("tags"))

            source_uri: Optional[str] = None
            if isinstance(biblicus_block, dict):
                source_value = biblicus_block.get("source")
                if isinstance(source_value, str) and source_value.strip():
                    source_uri = source_value.strip()

            previous_item = existing_catalog.items.get(item_id)
            created_at = previous_item.created_at if previous_item is not None else utc_now_iso()
            source_uri = source_uri or (previous_item.source_uri if previous_item is not None else None)

            if previous_item is None:
                stats["inserted"] += 1
            else:
                stats["updated"] += 1

            new_items[item_id] = CatalogItem(
                id=item_id,
                relpath=relpath,
                sha256=sha256,
                bytes=len(data),
                media_type=media_type,
                title=title,
                tags=list(resolved_tags),
                metadata=dict(merged_metadata or {}),
                created_at=created_at,
                source_uri=source_uri,
            )

        order = sorted(
            new_items.keys(),
            key=lambda item_id: (new_items[item_id].created_at, item_id),
            reverse=True,
        )

        catalog = CorpusCatalog(
            schema_version=SCHEMA_VERSION,
            generated_at=utc_now_iso(),
            corpus_uri=normalize_corpus_uri(self.root),
            raw_dir=DEFAULT_RAW_DIR,
            latest_run_id=None,
            items=new_items,
            order=order,
        )
        self._write_catalog(catalog)

        return stats

    @property
    def name(self) -> str:
        """
        Return the corpus name (directory basename).

        :return: Corpus name.
        :rtype: str
        """

        return self.root.name

    def purge(self, *, confirm: str) -> None:
        """
        Delete all ingested items and derived files, preserving corpus identity/config.

        :param confirm: Confirmation string matching the corpus name.
        :type confirm: str
        :return: None.
        :rtype: None
        :raises ValueError: If the confirmation does not match.
        """

        expected = self.name
        if confirm != expected:
            raise ValueError(f"Confirmation mismatch: pass --confirm {expected!r} to purge this corpus")

        if self.raw_dir.exists():
            shutil.rmtree(self.raw_dir)
        self.raw_dir.mkdir(parents=True, exist_ok=True)

        for path in self.meta_dir.iterdir():
            if path.name == "config.json":
                continue
            if path.is_dir():
                shutil.rmtree(path)
            else:
                path.unlink()
        self._init_catalog()
        self._write_catalog(
            CorpusCatalog(
                schema_version=SCHEMA_VERSION,
                generated_at=utc_now_iso(),
                corpus_uri=normalize_corpus_uri(self.root),
                raw_dir=DEFAULT_RAW_DIR,
                latest_run_id=None,
                items={},
                order=[],
            )
        )
```