biblicus-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +30 -0
- biblicus/__main__.py +8 -0
- biblicus/_vendor/dotyaml/__init__.py +14 -0
- biblicus/_vendor/dotyaml/interpolation.py +63 -0
- biblicus/_vendor/dotyaml/loader.py +181 -0
- biblicus/_vendor/dotyaml/transformer.py +135 -0
- biblicus/backends/__init__.py +42 -0
- biblicus/backends/base.py +65 -0
- biblicus/backends/scan.py +375 -0
- biblicus/backends/sqlite_full_text_search.py +487 -0
- biblicus/cli.py +804 -0
- biblicus/constants.py +12 -0
- biblicus/context.py +183 -0
- biblicus/corpus.py +1531 -0
- biblicus/crawl.py +186 -0
- biblicus/errors.py +15 -0
- biblicus/evaluation.py +257 -0
- biblicus/evidence_processing.py +201 -0
- biblicus/extraction.py +531 -0
- biblicus/extractors/__init__.py +44 -0
- biblicus/extractors/base.py +68 -0
- biblicus/extractors/metadata_text.py +106 -0
- biblicus/extractors/openai_stt.py +180 -0
- biblicus/extractors/pass_through_text.py +84 -0
- biblicus/extractors/pdf_text.py +100 -0
- biblicus/extractors/pipeline.py +105 -0
- biblicus/extractors/rapidocr_text.py +129 -0
- biblicus/extractors/select_longest_text.py +105 -0
- biblicus/extractors/select_text.py +100 -0
- biblicus/extractors/unstructured_text.py +100 -0
- biblicus/frontmatter.py +89 -0
- biblicus/hook_logging.py +180 -0
- biblicus/hook_manager.py +203 -0
- biblicus/hooks.py +261 -0
- biblicus/ignore.py +64 -0
- biblicus/knowledge_base.py +191 -0
- biblicus/models.py +445 -0
- biblicus/retrieval.py +133 -0
- biblicus/sources.py +212 -0
- biblicus/time.py +17 -0
- biblicus/uris.py +63 -0
- biblicus/user_config.py +138 -0
- biblicus-0.6.0.dist-info/METADATA +533 -0
- biblicus-0.6.0.dist-info/RECORD +48 -0
- biblicus-0.6.0.dist-info/WHEEL +5 -0
- biblicus-0.6.0.dist-info/entry_points.txt +2 -0
- biblicus-0.6.0.dist-info/licenses/LICENSE +21 -0
- biblicus-0.6.0.dist-info/top_level.txt +1 -0
biblicus/extractors/rapidocr_text.py
ADDED

@@ -0,0 +1,129 @@
"""
RapidOCR-backed optical character recognition extractor plugin.

This extractor is an optional dependency. It exists as a practical default for extracting text
from image items without requiring a separate daemon.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field

from ..corpus import Corpus
from ..errors import ExtractionRunFatalError
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
from .base import TextExtractor


class RapidOcrExtractorConfig(BaseModel):
    """
    Configuration for the RapidOCR extractor.

    :ivar min_confidence: Minimum per-line confidence to include in output.
    :vartype min_confidence: float
    :ivar joiner: Joiner used to combine recognized lines.
    :vartype joiner: str
    """

    model_config = ConfigDict(extra="forbid")

    min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    joiner: str = Field(default="\n")


class RapidOcrExtractor(TextExtractor):
    """
    Extractor plugin that performs optical character recognition on image items using RapidOCR.

    This extractor handles common image media types such as Portable Network Graphics and Joint Photographic Experts Group.
    It returns an empty extracted text artifact when the image is handled but no text is recognized.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "ocr-rapidocr"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure prerequisites are available.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: RapidOcrExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is missing.
        """
        try:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "RapidOCR extractor requires an optional dependency. "
                'Install it with pip install "biblicus[ocr]".'
            ) from import_error

        return RapidOcrExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text from an image item using optical character recognition.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: RapidOcrExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is not an image.
        :rtype: ExtractedText or None
        """
        _ = previous_extractions
        media_type = item.media_type
        if not media_type.startswith("image/"):
            return None

        parsed_config = (
            config
            if isinstance(config, RapidOcrExtractorConfig)
            else RapidOcrExtractorConfig.model_validate(config)
        )

        from rapidocr_onnxruntime import RapidOCR

        source_path = corpus.root / item.relpath
        ocr = RapidOCR()
        result, _elapsed = ocr(str(source_path))

        if result is None:
            return ExtractedText(text="", producer_extractor_id=self.extractor_id)

        lines: list[str] = []
        for entry in result:
            if not isinstance(entry, list) or len(entry) < 3:
                continue
            text_value = entry[1]
            confidence_value = entry[2]
            if not isinstance(text_value, str):
                continue
            if not isinstance(confidence_value, (int, float)):
                continue
            confidence = float(confidence_value)
            if confidence < parsed_config.min_confidence:
                continue
            cleaned = text_value.strip()
            if cleaned:
                lines.append(cleaned)

        text = parsed_config.joiner.join(lines).strip()
        return ExtractedText(text=text, producer_extractor_id=self.extractor_id)
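RapidOcrExtractorConfig above is a plain pydantic model, and the OCR dependency is only imported inside the extractor methods, so the validation behaviour can be checked on its own. The snippet below is a small illustrative sketch, not part of the package; it assumes biblicus 0.6.0 is installed so the module import resolves.

# Illustrative sketch only (assumes biblicus 0.6.0 is installed).
from biblicus.extractors.rapidocr_text import RapidOcrExtractorConfig

config = RapidOcrExtractorConfig.model_validate({"min_confidence": 0.7})
print(config.min_confidence)  # 0.7
print(repr(config.joiner))    # '\n' -- newline is the default joiner between lines

# extra="forbid" means unknown keys are rejected with a validation error.
try:
    RapidOcrExtractorConfig.model_validate({"languages": ["en"]})
except Exception as validation_error:
    print(type(validation_error).__name__)  # ValidationError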
biblicus/extractors/select_longest_text.py
ADDED

@@ -0,0 +1,105 @@
"""
Selection extractor that chooses the longest available text from previous pipeline outputs.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict

from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
from .base import TextExtractor


class SelectLongestTextExtractorConfig(BaseModel):
    """
    Configuration for the longest text selection extractor.

    Version zero does not expose configuration for this extractor.
    """

    model_config = ConfigDict(extra="forbid")


class SelectLongestTextExtractor(TextExtractor):
    """
    Extractor plugin that selects the longest text from previous pipeline outputs.

    This extractor does not attempt to score semantic quality. It is a deterministic
    selection policy for cases where multiple steps can produce usable text for the
    same item.

    The selection rules are:

    - If any prior extracted texts are non-empty after stripping whitespace, choose the one
      with the greatest stripped character count.
    - Ties are broken by earliest pipeline step index.
    - If no prior extracted texts are usable but prior extracted texts exist, select the
      earliest extracted text even if it is empty.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-longest-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate selection extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectLongestTextExtractorConfig
        """
        return SelectLongestTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Select the longest extracted text from previous pipeline outputs.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectLongestTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload or None when no prior outputs exist.
        :rtype: ExtractedText or None
        """
        _ = corpus
        _ = item
        _ = config

        extracted_candidates = [entry for entry in previous_extractions if entry.text is not None]
        if not extracted_candidates:
            return None

        usable_candidates = [entry for entry in extracted_candidates if entry.text.strip()]
        if usable_candidates:
            candidate = max(usable_candidates, key=lambda entry: len(entry.text.strip()))
            ties = [
                entry
                for entry in usable_candidates
                if len(entry.text.strip()) == len(candidate.text.strip())
            ]
            candidate = min(ties, key=lambda entry: int(entry.step_index))
        else:
            candidate = min(extracted_candidates, key=lambda entry: int(entry.step_index))

        producer = candidate.producer_extractor_id or candidate.extractor_id
        return ExtractedText(
            text=candidate.text or "",
            producer_extractor_id=producer,
            source_step_index=candidate.step_index,
        )
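The selection rules in the docstring above can be read as a small standalone policy. The sketch below restates them over plain (step_index, text) pairs rather than biblicus models, purely to illustrate the documented behaviour; the helper name is hypothetical and not part of the package.

# Standalone restatement of the documented selection rules; illustrative only.
from typing import List, Optional, Tuple

def select_longest(candidates: List[Tuple[int, str]]) -> Optional[Tuple[int, str]]:
    # No prior outputs at all: nothing to select.
    if not candidates:
        return None
    # Usable means non-empty after stripping whitespace.
    usable = [entry for entry in candidates if entry[1].strip()]
    if usable:
        longest = max(len(entry[1].strip()) for entry in usable)
        ties = [entry for entry in usable if len(entry[1].strip()) == longest]
        # Ties are broken by the earliest pipeline step index.
        return min(ties, key=lambda entry: entry[0])
    # Nothing usable: fall back to the earliest extracted text, even if empty.
    return min(candidates, key=lambda entry: entry[0])

print(select_longest([(0, "short"), (1, "much longer text"), (2, "tiny")]))
# (1, 'much longer text') -- greatest stripped length wins
print(select_longest([(1, "   "), (3, "")]))
# (1, '   ') -- nothing usable, so the earliest step is returned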
biblicus/extractors/select_text.py
ADDED

@@ -0,0 +1,100 @@
"""
Selection extractor that chooses text from previous pipeline outputs.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict

from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
from .base import TextExtractor


class SelectTextExtractorConfig(BaseModel):
    """
    Configuration for the selection extractor.

    The selection extractor is intentionally minimal and requires no configuration.
    """

    model_config = ConfigDict(extra="forbid")


class SelectTextExtractor(TextExtractor):
    """
    Extractor plugin that selects from previous pipeline outputs.

    This extractor is used as a final step when you want to make an explicit choice among
    multiple extraction outputs in the same pipeline.

    It selects the first usable extracted text in pipeline order. Usable means the text is
    non-empty after stripping whitespace. If no usable text exists but prior extracted text
    exists, it selects the first extracted text even if it is empty.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "select-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate selection extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration.
        :rtype: SelectTextExtractorConfig
        """
        return SelectTextExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Select extracted text from previous pipeline outputs.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: SelectTextExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Selected extracted text payload or None when no prior outputs exist.
        :rtype: ExtractedText or None
        """
        _ = corpus
        _ = item
        _ = config

        extracted_candidates = [entry for entry in previous_extractions if entry.text is not None]
        usable_candidates = [entry for entry in extracted_candidates if entry.text.strip()]

        if usable_candidates:
            candidate = usable_candidates[0]
            producer = candidate.producer_extractor_id or candidate.extractor_id
            return ExtractedText(
                text=candidate.text or "",
                producer_extractor_id=producer,
                source_step_index=candidate.step_index,
            )

        if extracted_candidates:
            candidate = extracted_candidates[0]
            producer = candidate.producer_extractor_id or candidate.extractor_id
            return ExtractedText(
                text=candidate.text or "",
                producer_extractor_id=producer,
                source_step_index=candidate.step_index,
            )

        return None
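The "first usable text" policy above is also easy to state on its own. The following standalone sketch mirrors the documented behaviour over plain strings in pipeline order; it is illustrative only and the helper name is hypothetical.

# Standalone restatement of the first-usable selection policy; illustrative only.
from typing import List, Optional

def select_first_usable(texts: List[str]) -> Optional[str]:
    # Usable means non-empty after stripping whitespace.
    usable = [text for text in texts if text.strip()]
    if usable:
        return usable[0]
    # Nothing usable: fall back to the first extracted text, even if empty.
    return texts[0] if texts else None

print(select_first_usable(["", "  ", "ocr output", "pdf output"]))  # 'ocr output'
print(select_first_usable(["", "  "]))                              # ''
print(select_first_usable([]))                                      # None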
biblicus/extractors/unstructured_text.py
ADDED

@@ -0,0 +1,100 @@
"""
Unstructured-based text extraction plugin.

This extractor is implemented as an optional dependency so the core installation stays small.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict

from ..corpus import Corpus
from ..errors import ExtractionRunFatalError
from ..models import CatalogItem, ExtractedText, ExtractionStepOutput
from .base import TextExtractor


class UnstructuredExtractorConfig(BaseModel):
    """
    Configuration for the Unstructured extractor.

    Version zero does not expose any configuration for this extractor.
    """

    model_config = ConfigDict(extra="forbid")


class UnstructuredExtractor(TextExtractor):
    """
    Extractor plugin backed by the `unstructured` library.

    The intent is broad format coverage as a last-resort extractor. This extractor skips items
    that are already text so the pass-through extractor remains the canonical choice for text
    items and Markdown front matter handling.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "unstructured"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate extractor configuration and ensure the dependency is installed.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: UnstructuredExtractorConfig
        :raises ExtractionRunFatalError: If the optional dependency is not installed.
        """
        try:
            from unstructured.partition.auto import partition  # noqa: F401
        except ImportError as import_error:
            raise ExtractionRunFatalError(
                "Unstructured extractor requires an optional dependency. "
                'Install it with pip install "biblicus[unstructured]".'
            ) from import_error
        return UnstructuredExtractorConfig.model_validate(config)

    def extract_text(
        self,
        *,
        corpus: Corpus,
        item: CatalogItem,
        config: BaseModel,
        previous_extractions: List[ExtractionStepOutput],
    ) -> Optional[ExtractedText]:
        """
        Extract text for a non-text item using Unstructured.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: UnstructuredExtractorConfig
        :param previous_extractions: Prior step outputs for this item within the pipeline.
        :type previous_extractions: list[biblicus.models.ExtractionStepOutput]
        :return: Extracted text payload, or None when the item is already text.
        :rtype: ExtractedText or None
        """
        _ = config
        _ = previous_extractions
        media_type = item.media_type
        if media_type == "text/markdown" or media_type.startswith("text/"):
            return None

        from unstructured.partition.auto import partition

        source_path = corpus.root / item.relpath
        elements = partition(filename=str(source_path))
        lines: list[str] = []
        for element in elements or []:
            text = getattr(element, "text", None)
            if isinstance(text, str) and text.strip():
                lines.append(text.strip())
        combined_text = "\n".join(lines).strip()
        return ExtractedText(text=combined_text, producer_extractor_id=self.extractor_id)
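The media-type guard above is the whole skip policy: anything served as text/* (including text/markdown) is declined so the pass-through extractor keeps handling text items. A tiny standalone restatement, for illustration only (the helper name is hypothetical, not part of the package):

# Mirrors the guard in UnstructuredExtractor.extract_text; illustrative only.
def unstructured_handles(media_type: str) -> bool:
    # Text items (including text/markdown) are skipped and return None upstream.
    return not media_type.startswith("text/")

for media_type in ("text/markdown", "text/plain", "application/pdf", "image/png"):
    print(media_type, unstructured_handles(media_type))
# text/markdown False, text/plain False, application/pdf True, image/png True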
biblicus/frontmatter.py
ADDED

@@ -0,0 +1,89 @@
"""
Markdown front matter helpers.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Tuple

import yaml


@dataclass(frozen=True)
class FrontMatterDocument:
    """
    Parsed front matter and markdown body.

    :ivar metadata: Front matter metadata mapping.
    :vartype metadata: dict[str, Any]
    :ivar body: Markdown body text.
    :vartype body: str
    """

    metadata: Dict[str, Any]
    body: str


def parse_front_matter(text: str) -> FrontMatterDocument:
    """
    Parse Yet Another Markup Language front matter from a Markdown document.

    :param text: Markdown content with optional front matter.
    :type text: str
    :return: Parsed front matter and body.
    :rtype: FrontMatterDocument
    :raises ValueError: If front matter is present but not a mapping.
    """
    if not text.startswith("---\n"):
        return FrontMatterDocument(metadata={}, body=text)

    front_matter_end = text.find("\n---\n", 4)
    if front_matter_end == -1:
        return FrontMatterDocument(metadata={}, body=text)

    raw_yaml = text[4:front_matter_end]
    body = text[front_matter_end + len("\n---\n") :]

    metadata = yaml.safe_load(raw_yaml) or {}
    if not isinstance(metadata, dict):
        raise ValueError("Yet Another Markup Language front matter must be a mapping object")

    return FrontMatterDocument(metadata=dict(metadata), body=body)


def render_front_matter(metadata: Dict[str, Any], body: str) -> str:
    """
    Render Yet Another Markup Language front matter with a Markdown body.

    :param metadata: Front matter metadata mapping.
    :type metadata: dict[str, Any]
    :param body: Markdown body text.
    :type body: str
    :return: Markdown with Yet Another Markup Language front matter.
    :rtype: str
    """
    if not metadata:
        return body

    yaml_text = yaml.safe_dump(
        metadata,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    ).strip()

    return f"---\n{yaml_text}\n---\n{body}"


def split_markdown_front_matter(path_text: str) -> Tuple[Dict[str, Any], str]:
    """
    Split Markdown into front matter metadata and body.

    :param path_text: Markdown content.
    :type path_text: str
    :return: Metadata mapping and body text.
    :rtype: tuple[dict[str, Any], str]
    """
    parsed_document = parse_front_matter(path_text)
    return parsed_document.metadata, parsed_document.body
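A short usage sketch for these helpers, round-tripping a small document; it assumes biblicus 0.6.0 is installed and is illustrative only, not part of the package.

# Illustrative sketch only (assumes biblicus 0.6.0 is installed).
from biblicus.frontmatter import parse_front_matter, render_front_matter

document = "---\ntitle: Example\ntags:\n  - demo\n---\nBody text.\n"

parsed = parse_front_matter(document)
print(parsed.metadata)  # {'title': 'Example', 'tags': ['demo']}
print(parsed.body)      # 'Body text.\n'

# Rendering puts the metadata back in front of the body.
rendered = render_front_matter(parsed.metadata, parsed.body)
print(rendered.startswith("---\ntitle: Example"))  # True

# Documents without front matter pass through with empty metadata.
print(parse_front_matter("Just a body.").metadata)  # {}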
biblicus/hook_logging.py
ADDED

@@ -0,0 +1,180 @@
"""
Structured hook execution logging.
"""

from __future__ import annotations

import json
import uuid
from pathlib import Path
from typing import Any, Dict, Optional
from urllib.parse import urlparse, urlunparse

from pydantic import BaseModel, ConfigDict, Field

from .hooks import HookPoint
from .time import utc_now_iso


def new_operation_id() -> str:
    """
    Create a new operation identifier for hook log grouping.

    :return: Operation identifier.
    :rtype: str
    """
    return str(uuid.uuid4())


def redact_source_uri(source_uri: str) -> str:
    """
    Redact sensitive components from a source uniform resource identifier.

    :param source_uri: Source uniform resource identifier.
    :type source_uri: str
    :return: Redacted source uniform resource identifier.
    :rtype: str
    """
    parsed = urlparse(source_uri)

    if not parsed.scheme:
        return source_uri

    netloc = parsed.netloc
    if "@" in netloc:
        netloc = netloc.split("@", 1)[-1]

    return urlunparse(
        (
            parsed.scheme,
            netloc,
            parsed.path,
            parsed.params,
            parsed.query,
            parsed.fragment,
        )
    )


class HookLogEntry(BaseModel):
    """
    Single structured log record for hook execution.

    :ivar operation_id: Identifier for the enclosing command or call.
    :vartype operation_id: str
    :ivar hook_point: Hook point that executed.
    :vartype hook_point: HookPoint
    :ivar hook_id: Hook implementation identifier.
    :vartype hook_id: str
    :ivar recorded_at: International Organization for Standardization 8601 timestamp for log record creation.
    :vartype recorded_at: str
    :ivar status: Execution status string.
    :vartype status: str
    :ivar message: Optional message describing execution results.
    :vartype message: str or None
    :ivar item_id: Optional item identifier.
    :vartype item_id: str or None
    :ivar relpath: Optional relative path associated with an item.
    :vartype relpath: str or None
    :ivar source_uri: Optional redacted source uniform resource identifier.
    :vartype source_uri: str or None
    :ivar details: Optional structured details about changes.
    :vartype details: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    operation_id: str
    hook_point: HookPoint
    hook_id: str
    recorded_at: str
    status: str = Field(min_length=1)
    message: Optional[str] = None
    item_id: Optional[str] = None
    relpath: Optional[str] = None
    source_uri: Optional[str] = None
    details: Dict[str, Any] = Field(default_factory=dict)


class HookLogger:
    """
    Hook logger that writes JSON lines records to a corpus log directory.

    :ivar log_dir: Directory where log files are written.
    :vartype log_dir: Path
    :ivar operation_id: Operation identifier for grouping records.
    :vartype operation_id: str
    """

    def __init__(self, *, log_dir: Path, operation_id: str):
        """
        Initialize a hook logger.

        :param log_dir: Log directory to write into.
        :type log_dir: Path
        :param operation_id: Operation identifier for grouping records.
        :type operation_id: str
        """
        self.log_dir = log_dir
        self.operation_id = operation_id

    @property
    def path(self) -> Path:
        """
        Return the log file path for this operation.

        :return: Log file path.
        :rtype: Path
        """
        return self.log_dir / f"{self.operation_id}.jsonl"

    def record(
        self,
        *,
        hook_point: HookPoint,
        hook_id: str,
        status: str,
        message: Optional[str] = None,
        item_id: Optional[str] = None,
        relpath: Optional[str] = None,
        source_uri: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Append a structured hook log record.

        :param hook_point: Hook point that executed.
        :type hook_point: HookPoint
        :param hook_id: Hook identifier.
        :type hook_id: str
        :param status: Status string such as ok, denied, or error.
        :type status: str
        :param message: Optional message describing results.
        :type message: str or None
        :param item_id: Optional item identifier.
        :type item_id: str or None
        :param relpath: Optional relative path for the item.
        :type relpath: str or None
        :param source_uri: Optional source uniform resource identifier.
        :type source_uri: str or None
        :param details: Optional structured details.
        :type details: dict[str, Any] or None
        :return: None.
        :rtype: None
        """
        self.log_dir.mkdir(parents=True, exist_ok=True)
        entry = HookLogEntry(
            operation_id=self.operation_id,
            hook_point=hook_point,
            hook_id=hook_id,
            recorded_at=utc_now_iso(),
            status=status,
            message=message,
            item_id=item_id,
            relpath=relpath,
            source_uri=redact_source_uri(source_uri) if source_uri else None,
            details=dict(details or {}),
        )
        line = json.dumps(entry.model_dump(), sort_keys=False)
        with self.path.open("a", encoding="utf-8") as handle:
            handle.write(line + "\n")
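A short usage sketch for the pieces whose behaviour is fully visible above (redact_source_uri, new_operation_id, and the logger's path). HookPoint values live in biblicus/hooks.py and are not shown in this diff, so HookLogger.record is not exercised here. Assumes biblicus 0.6.0 is installed; illustrative only, not part of the package.

# Illustrative sketch only (assumes biblicus 0.6.0 is installed).
from pathlib import Path

from biblicus.hook_logging import HookLogger, new_operation_id, redact_source_uri

# Credentials in the authority component are dropped before logging.
print(redact_source_uri("https://user:secret@example.com/data?page=2"))
# https://example.com/data?page=2

# Values without a scheme (for example plain relative paths) pass through untouched.
print(redact_source_uri("notes/reading-list.md"))
# notes/reading-list.md

# Each operation gets its own JSON Lines file under the log directory.
logger = HookLogger(log_dir=Path("/tmp/biblicus-logs"), operation_id=new_operation_id())
print(logger.path)  # /tmp/biblicus-logs/<operation-id>.jsonl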