biblicus-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. biblicus/__init__.py +30 -0
  2. biblicus/__main__.py +8 -0
  3. biblicus/_vendor/dotyaml/__init__.py +14 -0
  4. biblicus/_vendor/dotyaml/interpolation.py +63 -0
  5. biblicus/_vendor/dotyaml/loader.py +181 -0
  6. biblicus/_vendor/dotyaml/transformer.py +135 -0
  7. biblicus/backends/__init__.py +42 -0
  8. biblicus/backends/base.py +65 -0
  9. biblicus/backends/scan.py +375 -0
  10. biblicus/backends/sqlite_full_text_search.py +487 -0
  11. biblicus/cli.py +804 -0
  12. biblicus/constants.py +12 -0
  13. biblicus/context.py +183 -0
  14. biblicus/corpus.py +1531 -0
  15. biblicus/crawl.py +186 -0
  16. biblicus/errors.py +15 -0
  17. biblicus/evaluation.py +257 -0
  18. biblicus/evidence_processing.py +201 -0
  19. biblicus/extraction.py +531 -0
  20. biblicus/extractors/__init__.py +44 -0
  21. biblicus/extractors/base.py +68 -0
  22. biblicus/extractors/metadata_text.py +106 -0
  23. biblicus/extractors/openai_stt.py +180 -0
  24. biblicus/extractors/pass_through_text.py +84 -0
  25. biblicus/extractors/pdf_text.py +100 -0
  26. biblicus/extractors/pipeline.py +105 -0
  27. biblicus/extractors/rapidocr_text.py +129 -0
  28. biblicus/extractors/select_longest_text.py +105 -0
  29. biblicus/extractors/select_text.py +100 -0
  30. biblicus/extractors/unstructured_text.py +100 -0
  31. biblicus/frontmatter.py +89 -0
  32. biblicus/hook_logging.py +180 -0
  33. biblicus/hook_manager.py +203 -0
  34. biblicus/hooks.py +261 -0
  35. biblicus/ignore.py +64 -0
  36. biblicus/knowledge_base.py +191 -0
  37. biblicus/models.py +445 -0
  38. biblicus/retrieval.py +133 -0
  39. biblicus/sources.py +212 -0
  40. biblicus/time.py +17 -0
  41. biblicus/uris.py +63 -0
  42. biblicus/user_config.py +138 -0
  43. biblicus-0.6.0.dist-info/METADATA +533 -0
  44. biblicus-0.6.0.dist-info/RECORD +48 -0
  45. biblicus-0.6.0.dist-info/WHEEL +5 -0
  46. biblicus-0.6.0.dist-info/entry_points.txt +2 -0
  47. biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
  48. biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/extractors/rapidocr_text.py
@@ -0,0 +1,129 @@
+"""
+RapidOCR-backed optical character recognition extractor plugin.
+
+This extractor is an optional dependency. It exists as a practical default for extracting text
+from image items without requiring a separate daemon.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from ..corpus import Corpus
+from ..errors import ExtractionRunFatalError
+from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
+from .base import TextExtractor
+
+
+class RapidOcrExtractorConfig(BaseModel):
+    """
+    Configuration for the RapidOCR extractor.
+
+    :ivar min_confidence: Minimum per-line confidence to include in output.
+    :vartype min_confidence: float
+    :ivar joiner: Joiner used to combine recognized lines.
+    :vartype joiner: str
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
+    joiner: str = Field(default="\n")
+
+
+class RapidOcrExtractor(TextExtractor):
+    """
+    Extractor plugin that performs optical character recognition on image items using RapidOCR.
+
+    This extractor handles common image media types such as Portable Network Graphics and Joint Photographic Experts Group.
+    It returns an empty extracted text artifact when the image is handled but no text is recognized.
+
+    :ivar extractor_id: Extractor identifier.
+    :vartype extractor_id: str
+    """
+
+    extractor_id = "ocr-rapidocr"
+
+    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
+        """
+        Validate extractor configuration and ensure prerequisites are available.
+
+        :param config: Configuration mapping.
+        :type config: dict[str, Any]
+        :return: Parsed configuration model.
+        :rtype: RapidOcrExtractorConfig
+        :raises ExtractionRunFatalError: If the optional dependency is missing.
+        """
+        try:
+            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
+        except ImportError as import_error:
+            raise ExtractionRunFatalError(
+                "RapidOCR extractor requires an optional dependency. "
+                'Install it with pip install "biblicus[ocr]".'
+            ) from import_error
+
+        return RapidOcrExtractorConfig.model_validate(config)
+
+    def extract_text(
+        self,
+        *,
+        corpus: Corpus,
+        item: CatalogItem,
+        config: BaseModel,
+        previous_extractions: List[ExtractionStepOutput],
+    ) -> Optional[ExtractedText]:
+        """
+        Extract text from an image item using optical character recognition.
+
+        :param corpus: Corpus containing the item bytes.
+        :type corpus: Corpus
+        :param item: Catalog item being processed.
+        :type item: CatalogItem
+        :param config: Parsed configuration model.
+        :type config: RapidOcrExtractorConfig
+        :param previous_extractions: Prior step outputs for this item within the pipeline.
+        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
+        :return: Extracted text payload, or None when the item is not an image.
+        :rtype: ExtractedText or None
+        """
+        _ = previous_extractions
+        media_type = item.media_type
+        if not media_type.startswith("image/"):
+            return None
+
+        parsed_config = (
+            config
+            if isinstance(config, RapidOcrExtractorConfig)
+            else RapidOcrExtractorConfig.model_validate(config)
+        )
+
+        from rapidocr_onnxruntime import RapidOCR
+
+        source_path = corpus.root / item.relpath
+        ocr = RapidOCR()
+        result, _elapsed = ocr(str(source_path))
+
+        if result is None:
+            return ExtractedText(text="", producer_extractor_id=self.extractor_id)
+
+        lines: list[str] = []
+        for entry in result:
+            if not isinstance(entry, list) or len(entry) < 3:
+                continue
+            text_value = entry[1]
+            confidence_value = entry[2]
+            if not isinstance(text_value, str):
+                continue
+            if not isinstance(confidence_value, (int, float)):
+                continue
+            confidence = float(confidence_value)
+            if confidence < parsed_config.min_confidence:
+                continue
+            cleaned = text_value.strip()
+            if cleaned:
+                lines.append(cleaned)
+
+        text = parsed_config.joiner.join(lines).strip()
+        return ExtractedText(text=text, producer_extractor_id=self.extractor_id)
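The filtering loop in extract_text keeps only recognized lines whose confidence meets min_confidence, then joins them with the configured joiner. A minimal sketch of that rule, using a made-up RapidOCR-style result list instead of a real OCR call; the entries and values below are illustrative, not from the package:

# Sketch of the confidence-filtering rule used by RapidOcrExtractor.
# `fake_result` is hypothetical; a real run would come from RapidOCR()(path).
fake_result = [
    [[[0, 0], [10, 0], [10, 10], [0, 10]], "Invoice 2024-03", 0.91],
    [[[0, 12], [10, 12], [10, 22], [0, 22]], "smudged line", 0.32],
    [[[0, 24], [10, 24], [10, 34], [0, 34]], "  Total: 42.00  ", 0.77],
]

min_confidence = 0.5  # default from RapidOcrExtractorConfig
joiner = "\n"

lines = []
for entry in fake_result:
    if not isinstance(entry, list) or len(entry) < 3:
        continue
    text_value, confidence_value = entry[1], entry[2]
    if not isinstance(text_value, str) or not isinstance(confidence_value, (int, float)):
        continue
    if float(confidence_value) < min_confidence:
        continue
    cleaned = text_value.strip()
    if cleaned:
        lines.append(cleaned)

print(joiner.join(lines).strip())
# Invoice 2024-03
# Total: 42.00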
biblicus/extractors/select_longest_text.py
@@ -0,0 +1,105 @@
+"""
+Selection extractor that chooses the longest available text from previous pipeline outputs.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict
+
+from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
+from .base import TextExtractor
+
+
+class SelectLongestTextExtractorConfig(BaseModel):
+    """
+    Configuration for the longest text selection extractor.
+
+    Version zero does not expose configuration for this extractor.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+
+class SelectLongestTextExtractor(TextExtractor):
+    """
+    Extractor plugin that selects the longest text from previous pipeline outputs.
+
+    This extractor does not attempt to score semantic quality. It is a deterministic
+    selection policy for cases where multiple steps can produce usable text for the
+    same item.
+
+    The selection rules are:
+
+    - If any prior extracted texts are non-empty after stripping whitespace, choose the one
+      with the greatest stripped character count.
+    - Ties are broken by earliest pipeline step index.
+    - If no prior extracted texts are usable but prior extracted texts exist, select the
+      earliest extracted text even if it is empty.
+
+    :ivar extractor_id: Extractor identifier.
+    :vartype extractor_id: str
+    """
+
+    extractor_id = "select-longest-text"
+
+    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
+        """
+        Validate selection extractor configuration.
+
+        :param config: Configuration mapping.
+        :type config: dict[str, Any]
+        :return: Parsed configuration.
+        :rtype: SelectLongestTextExtractorConfig
+        """
+        return SelectLongestTextExtractorConfig.model_validate(config)
+
+    def extract_text(
+        self,
+        *,
+        corpus,
+        item: CatalogItem,
+        config: BaseModel,
+        previous_extractions: List[ExtractionStepOutput],
+    ) -> Optional[ExtractedText]:
+        """
+        Select the longest extracted text from previous pipeline outputs.
+
+        :param corpus: Corpus containing the item bytes.
+        :type corpus: Corpus
+        :param item: Catalog item being processed.
+        :type item: CatalogItem
+        :param config: Parsed configuration model.
+        :type config: SelectLongestTextExtractorConfig
+        :param previous_extractions: Prior step outputs for this item within the pipeline.
+        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
+        :return: Selected extracted text payload or None when no prior outputs exist.
+        :rtype: ExtractedText or None
+        """
+        _ = corpus
+        _ = item
+        _ = config
+
+        extracted_candidates = [entry for entry in previous_extractions if entry.text is not None]
+        if not extracted_candidates:
+            return None
+
+        usable_candidates = [entry for entry in extracted_candidates if entry.text.strip()]
+        if usable_candidates:
+            candidate = max(usable_candidates, key=lambda entry: len(entry.text.strip()))
+            ties = [
+                entry
+                for entry in usable_candidates
+                if len(entry.text.strip()) == len(candidate.text.strip())
+            ]
+            candidate = min(ties, key=lambda entry: int(entry.step_index))
+        else:
+            candidate = min(extracted_candidates, key=lambda entry: int(entry.step_index))
+
+        producer = candidate.producer_extractor_id or candidate.extractor_id
+        return ExtractedText(
+            text=candidate.text or "",
+            producer_extractor_id=producer,
+            source_step_index=candidate.step_index,
+        )
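The selection rules documented on SelectLongestTextExtractor can be checked with stand-in step outputs. The SimpleNamespace objects below are hypothetical stand-ins for biblicus.models.ExtractionStepOutput, carrying only the fields the selector reads; the texts are illustrative:

from types import SimpleNamespace

# Steps 1 and 2 tie on stripped length; the earlier step index wins.
steps = [
    SimpleNamespace(step_index=0, text="short", extractor_id="pdf-text"),
    SimpleNamespace(step_index=1, text="a much longer candidate  ", extractor_id="unstructured"),
    SimpleNamespace(step_index=2, text="a much longer candidate", extractor_id="ocr-rapidocr"),
]

usable = [s for s in steps if s.text is not None and s.text.strip()]
best = max(usable, key=lambda s: len(s.text.strip()))
ties = [s for s in usable if len(s.text.strip()) == len(best.text.strip())]
winner = min(ties, key=lambda s: int(s.step_index))

print(winner.step_index, winner.extractor_id)
# 1 unstructured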
biblicus/extractors/select_text.py
@@ -0,0 +1,100 @@
+"""
+Selection extractor that chooses text from previous pipeline outputs.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict
+
+from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
+from .base import TextExtractor
+
+
+class SelectTextExtractorConfig(BaseModel):
+    """
+    Configuration for the selection extractor.
+
+    The selection extractor is intentionally minimal and requires no configuration.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+
+class SelectTextExtractor(TextExtractor):
+    """
+    Extractor plugin that selects from previous pipeline outputs.
+
+    This extractor is used as a final step when you want to make an explicit choice among
+    multiple extraction outputs in the same pipeline.
+
+    It selects the first usable extracted text in pipeline order. Usable means the text is
+    non-empty after stripping whitespace. If no usable text exists but prior extracted text
+    exists, it selects the first extracted text even if it is empty.
+
+    :ivar extractor_id: Extractor identifier.
+    :vartype extractor_id: str
+    """
+
+    extractor_id = "select-text"
+
+    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
+        """
+        Validate selection extractor configuration.
+
+        :param config: Configuration mapping.
+        :type config: dict[str, Any]
+        :return: Parsed configuration.
+        :rtype: SelectTextExtractorConfig
+        """
+        return SelectTextExtractorConfig.model_validate(config)
+
+    def extract_text(
+        self,
+        *,
+        corpus,
+        item: CatalogItem,
+        config: BaseModel,
+        previous_extractions: List[ExtractionStepOutput],
+    ) -> Optional[ExtractedText]:
+        """
+        Select extracted text from previous pipeline outputs.
+
+        :param corpus: Corpus containing the item bytes.
+        :type corpus: Corpus
+        :param item: Catalog item being processed.
+        :type item: CatalogItem
+        :param config: Parsed configuration model.
+        :type config: SelectTextExtractorConfig
+        :param previous_extractions: Prior step outputs for this item within the pipeline.
+        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
+        :return: Selected extracted text payload or None when no prior outputs exist.
+        :rtype: ExtractedText or None
+        """
+        _ = corpus
+        _ = item
+        _ = config
+
+        extracted_candidates = [entry for entry in previous_extractions if entry.text is not None]
+        usable_candidates = [entry for entry in extracted_candidates if entry.text.strip()]
+
+        if usable_candidates:
+            candidate = usable_candidates[0]
+            producer = candidate.producer_extractor_id or candidate.extractor_id
+            return ExtractedText(
+                text=candidate.text or "",
+                producer_extractor_id=producer,
+                source_step_index=candidate.step_index,
+            )
+
+        if extracted_candidates:
+            candidate = extracted_candidates[0]
+            producer = candidate.producer_extractor_id or candidate.extractor_id
+            return ExtractedText(
+                text=candidate.text or "",
+                producer_extractor_id=producer,
+                source_step_index=candidate.step_index,
+            )
+
+        return None
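By contrast with select-longest-text, this extractor keeps the first usable output in pipeline order, regardless of length. A small sketch of that ordering rule, again with hypothetical stand-ins for ExtractionStepOutput:

from types import SimpleNamespace

# Step 0 is whitespace-only, so the next step in pipeline order wins even
# though a longer text arrives later.
steps = [
    SimpleNamespace(step_index=0, text="   ", extractor_id="pdf-text"),
    SimpleNamespace(step_index=1, text="first usable text", extractor_id="ocr-rapidocr"),
    SimpleNamespace(step_index=2, text="a longer text that arrives later", extractor_id="unstructured"),
]

usable = [s for s in steps if s.text is not None and s.text.strip()]
chosen = usable[0] if usable else (steps[0] if steps else None)
print(chosen.step_index, chosen.extractor_id)
# 1 ocr-rapidocr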
biblicus/extractors/unstructured_text.py
@@ -0,0 +1,100 @@
+"""
+Unstructured-based text extraction plugin.
+
+This extractor is implemented as an optional dependency so the core installation stays small.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict
+
+from ..corpus import Corpus
+from ..errors import ExtractionRunFatalError
+from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
+from .base import TextExtractor
+
+
+class UnstructuredExtractorConfig(BaseModel):
+    """
+    Configuration for the Unstructured extractor.
+
+    Version zero does not expose any configuration for this extractor.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+
+class UnstructuredExtractor(TextExtractor):
+    """
+    Extractor plugin backed by the `unstructured` library.
+
+    The intent is broad format coverage as a last-resort extractor. This extractor skips items
+    that are already text so the pass-through extractor remains the canonical choice for text
+    items and Markdown front matter handling.
+
+    :ivar extractor_id: Extractor identifier.
+    :vartype extractor_id: str
+    """
+
+    extractor_id = "unstructured"
+
+    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
+        """
+        Validate extractor configuration and ensure the dependency is installed.
+
+        :param config: Configuration mapping.
+        :type config: dict[str, Any]
+        :return: Parsed config.
+        :rtype: UnstructuredExtractorConfig
+        :raises ExtractionRunFatalError: If the optional dependency is not installed.
+        """
+        try:
+            from unstructured.partition.auto import partition  # noqa: F401
+        except ImportError as import_error:
+            raise ExtractionRunFatalError(
+                "Unstructured extractor requires an optional dependency. "
+                'Install it with pip install "biblicus[unstructured]".'
+            ) from import_error
+        return UnstructuredExtractorConfig.model_validate(config)
+
+    def extract_text(
+        self,
+        *,
+        corpus: Corpus,
+        item: CatalogItem,
+        config: BaseModel,
+        previous_extractions: List[ExtractionStepOutput],
+    ) -> Optional[ExtractedText]:
+        """
+        Extract text for a non-text item using Unstructured.
+
+        :param corpus: Corpus containing the item bytes.
+        :type corpus: Corpus
+        :param item: Catalog item being processed.
+        :type item: CatalogItem
+        :param config: Parsed configuration model.
+        :type config: UnstructuredExtractorConfig
+        :param previous_extractions: Prior step outputs for this item within the pipeline.
+        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
+        :return: Extracted text payload, or None when the item is already text.
+        :rtype: ExtractedText or None
+        """
+        _ = config
+        _ = previous_extractions
+        media_type = item.media_type
+        if media_type == "text/markdown" or media_type.startswith("text/"):
+            return None
+
+        from unstructured.partition.auto import partition
+
+        source_path = corpus.root / item.relpath
+        elements = partition(filename=str(source_path))
+        lines: list[str] = []
+        for element in elements or []:
+            text = getattr(element, "text", None)
+            if isinstance(text, str) and text.strip():
+                lines.append(text.strip())
+        combined_text = "\n".join(lines).strip()
+        return ExtractedText(text=combined_text, producer_extractor_id=self.extractor_id)
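The extract_text body above boils down to partitioning the file with unstructured and joining the element texts. A standalone sketch of that same flow, assuming the unstructured optional dependency is installed; the filename is hypothetical:

# Partition a document and join the element texts, mirroring the method body.
from unstructured.partition.auto import partition

elements = partition(filename="report.docx")  # hypothetical input file
lines = []
for element in elements or []:
    text = getattr(element, "text", None)
    if isinstance(text, str) and text.strip():
        lines.append(text.strip())

combined_text = "\n".join(lines).strip()
print(combined_text[:200])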
biblicus/frontmatter.py
@@ -0,0 +1,89 @@
+"""
+Markdown front matter helpers.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+
+import yaml
+
+
+@dataclass(frozen=True)
+class FrontMatterDocument:
+    """
+    Parsed front matter and markdown body.
+
+    :ivar metadata: Front matter metadata mapping.
+    :vartype metadata: dict[str, Any]
+    :ivar body: Markdown body text.
+    :vartype body: str
+    """
+
+    metadata: Dict[str, Any]
+    body: str
+
+
+def parse_front_matter(text: str) -> FrontMatterDocument:
+    """
+    Parse Yet Another Markup Language front matter from a Markdown document.
+
+    :param text: Markdown content with optional front matter.
+    :type text: str
+    :return: Parsed front matter and body.
+    :rtype: FrontMatterDocument
+    :raises ValueError: If front matter is present but not a mapping.
+    """
+    if not text.startswith("---\n"):
+        return FrontMatterDocument(metadata={}, body=text)
+
+    front_matter_end = text.find("\n---\n", 4)
+    if front_matter_end == -1:
+        return FrontMatterDocument(metadata={}, body=text)
+
+    raw_yaml = text[4:front_matter_end]
+    body = text[front_matter_end + len("\n---\n") :]
+
+    metadata = yaml.safe_load(raw_yaml) or {}
+    if not isinstance(metadata, dict):
+        raise ValueError("Yet Another Markup Language front matter must be a mapping object")
+
+    return FrontMatterDocument(metadata=dict(metadata), body=body)
+
+
+def render_front_matter(metadata: Dict[str, Any], body: str) -> str:
+    """
+    Render Yet Another Markup Language front matter with a Markdown body.
+
+    :param metadata: Front matter metadata mapping.
+    :type metadata: dict[str, Any]
+    :param body: Markdown body text.
+    :type body: str
+    :return: Markdown with Yet Another Markup Language front matter.
+    :rtype: str
+    """
+    if not metadata:
+        return body
+
+    yaml_text = yaml.safe_dump(
+        metadata,
+        sort_keys=False,
+        allow_unicode=True,
+        default_flow_style=False,
+    ).strip()
+
+    return f"---\n{yaml_text}\n---\n{body}"
+
+
+def split_markdown_front_matter(path_text: str) -> Tuple[Dict[str, Any], str]:
+    """
+    Split Markdown into front matter metadata and body.
+
+    :param path_text: Markdown content.
+    :type path_text: str
+    :return: Metadata mapping and body text.
+    :rtype: tuple[dict[str, Any], str]
+    """
+    parsed_document = parse_front_matter(path_text)
+    return parsed_document.metadata, parsed_document.body
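A quick round trip through these helpers, assuming biblicus is installed so biblicus.frontmatter is importable; the sample document and its metadata keys are illustrative:

from biblicus.frontmatter import parse_front_matter, render_front_matter

# Illustrative Markdown document with front matter.
document = """---
title: Field notes
tags:
  - birds
---
First observation of the season.
"""

parsed = parse_front_matter(document)
print(parsed.metadata["title"])  # Field notes
print(parsed.body.strip())       # First observation of the season.

# Rendering the parsed pieces reproduces an equivalent front-mattered document.
rebuilt = render_front_matter(parsed.metadata, parsed.body)
print(rebuilt.startswith("---\ntitle: Field notes"))  # True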
biblicus/hook_logging.py
@@ -0,0 +1,180 @@
+"""
+Structured hook execution logging.
+"""
+
+from __future__ import annotations
+
+import json
+import uuid
+from pathlib import Path
+from typing import Any, Dict, Optional
+from urllib.parse import urlparse, urlunparse
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from .hooks import HookPoint
+from .time import utc_now_iso
+
+
+def new_operation_id() -> str:
+    """
+    Create a new operation identifier for hook log grouping.
+
+    :return: Operation identifier.
+    :rtype: str
+    """
+    return str(uuid.uuid4())
+
+
+def redact_source_uri(source_uri: str) -> str:
+    """
+    Redact sensitive components from a source uniform resource identifier.
+
+    :param source_uri: Source uniform resource identifier.
+    :type source_uri: str
+    :return: Redacted source uniform resource identifier.
+    :rtype: str
+    """
+    parsed = urlparse(source_uri)
+
+    if not parsed.scheme:
+        return source_uri
+
+    netloc = parsed.netloc
+    if "@" in netloc:
+        netloc = netloc.split("@", 1)[-1]
+
+    return urlunparse(
+        (
+            parsed.scheme,
+            netloc,
+            parsed.path,
+            parsed.params,
+            parsed.query,
+            parsed.fragment,
+        )
+    )
+
+
+class HookLogEntry(BaseModel):
+    """
+    Single structured log record for hook execution.
+
+    :ivar operation_id: Identifier for the enclosing command or call.
+    :vartype operation_id: str
+    :ivar hook_point: Hook point that executed.
+    :vartype hook_point: HookPoint
+    :ivar hook_id: Hook implementation identifier.
+    :vartype hook_id: str
+    :ivar recorded_at: International Organization for Standardization 8601 timestamp for log record creation.
+    :vartype recorded_at: str
+    :ivar status: Execution status string.
+    :vartype status: str
+    :ivar message: Optional message describing execution results.
+    :vartype message: str or None
+    :ivar item_id: Optional item identifier.
+    :vartype item_id: str or None
+    :ivar relpath: Optional relative path associated with an item.
+    :vartype relpath: str or None
+    :ivar source_uri: Optional redacted source uniform resource identifier.
+    :vartype source_uri: str or None
+    :ivar details: Optional structured details about changes.
+    :vartype details: dict[str, Any]
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    operation_id: str
+    hook_point: HookPoint
+    hook_id: str
+    recorded_at: str
+    status: str = Field(min_length=1)
+    message: Optional[str] = None
+    item_id: Optional[str] = None
+    relpath: Optional[str] = None
+    source_uri: Optional[str] = None
+    details: Dict[str, Any] = Field(default_factory=dict)
+
+
+class HookLogger:
+    """
+    Hook logger that writes JSON lines records to a corpus log directory.
+
+    :ivar log_dir: Directory where log files are written.
+    :vartype log_dir: Path
+    :ivar operation_id: Operation identifier for grouping records.
+    :vartype operation_id: str
+    """
+
+    def __init__(self, *, log_dir: Path, operation_id: str):
+        """
+        Initialize a hook logger.
+
+        :param log_dir: Log directory to write into.
+        :type log_dir: Path
+        :param operation_id: Operation identifier for grouping records.
+        :type operation_id: str
+        """
+        self.log_dir = log_dir
+        self.operation_id = operation_id
+
+    @property
+    def path(self) -> Path:
+        """
+        Return the log file path for this operation.
+
+        :return: Log file path.
+        :rtype: Path
+        """
+        return self.log_dir / f"{self.operation_id}.jsonl"
+
+    def record(
+        self,
+        *,
+        hook_point: HookPoint,
+        hook_id: str,
+        status: str,
+        message: Optional[str] = None,
+        item_id: Optional[str] = None,
+        relpath: Optional[str] = None,
+        source_uri: Optional[str] = None,
+        details: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """
+        Append a structured hook log record.
+
+        :param hook_point: Hook point that executed.
+        :type hook_point: HookPoint
+        :param hook_id: Hook identifier.
+        :type hook_id: str
+        :param status: Status string such as ok, denied, or error.
+        :type status: str
+        :param message: Optional message describing results.
+        :type message: str or None
+        :param item_id: Optional item identifier.
+        :type item_id: str or None
+        :param relpath: Optional relative path for the item.
+        :type relpath: str or None
+        :param source_uri: Optional source uniform resource identifier.
+        :type source_uri: str or None
+        :param details: Optional structured details.
+        :type details: dict[str, Any] or None
+        :return: None.
+        :rtype: None
+        """
+        self.log_dir.mkdir(parents=True, exist_ok=True)
+        entry = HookLogEntry(
+            operation_id=self.operation_id,
+            hook_point=hook_point,
+            hook_id=hook_id,
+            recorded_at=utc_now_iso(),
+            status=status,
+            message=message,
+            item_id=item_id,
+            relpath=relpath,
+            source_uri=redact_source_uri(source_uri) if source_uri else None,
+            details=dict(details or {}),
+        )
+        line = json.dumps(entry.model_dump(), sort_keys=False)
+        with self.path.open("a", encoding="utf-8") as handle:
+            handle.write(line + "\n")
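Two behaviors worth noting from this module: redact_source_uri drops credentials from the authority component but leaves scheme-less values alone, and each HookLogger writes one JSON Lines file named after its operation identifier. A short sketch, assuming biblicus is installed; the log directory and the example URIs are hypothetical:

from pathlib import Path

from biblicus.hook_logging import HookLogger, new_operation_id, redact_source_uri

# Credentials in the authority component are removed; path and query survive.
print(redact_source_uri("https://user:secret@example.com/repo.git?ref=main"))
# https://example.com/repo.git?ref=main

# Values without a scheme, such as plain relative paths, pass through unchanged.
print(redact_source_uri("notes/2024/field-notes.md"))
# notes/2024/field-notes.md

# One JSON Lines log file per operation, named after the operation identifier.
logger = HookLogger(log_dir=Path("/tmp/biblicus-hook-logs"), operation_id=new_operation_id())
print(logger.path)  # /tmp/biblicus-hook-logs/<operation id>.jsonl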