biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/extraction.py ADDED
@@ -0,0 +1,330 @@
1
+ """
2
+ Text extraction runs for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional
10
+ from uuid import uuid4
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ from .corpus import Corpus
15
+ from .extractors import get_extractor
16
+ from .models import CatalogItem
17
+ from .retrieval import hash_text
18
+ from .time import utc_now_iso
19
+
20
+
21
class ExtractionRunReference(BaseModel):
    """
    Pointer to a single extraction run.

    Both identifiers must be non-empty strings, and unknown fields are rejected.

    :ivar extractor_id: Identifier of the extractor plugin that owns the run.
    :vartype extractor_id: str
    :ivar run_id: Identifier of the extraction run.
    :vartype run_id: str
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)

    def as_string(self) -> str:
        """
        Render the reference in its compact textual form.

        :return: The two identifiers joined as ``extractor_id:run_id``.
        :rtype: str
        """

        return "{}:{}".format(self.extractor_id, self.run_id)
45
+
46
+
47
def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
    """
    Turn a textual ``extractor_id:run_id`` reference into a model.

    Only the first colon separates the two parts, so the run identifier may itself
    contain colons. Surrounding whitespace is removed from both parts.

    :param value: Raw reference string.
    :type value: str
    :return: Parsed extraction run reference.
    :rtype: ExtractionRunReference
    :raises ValueError: If the colon is missing or either part is empty after stripping.
    """

    head, separator, tail = value.partition(":")
    if not separator:
        raise ValueError("Extraction run reference must be extractor_id:run_id")

    parts = {"extractor_id": head.strip(), "run_id": tail.strip()}
    if not all(parts.values()):
        raise ValueError("Extraction run reference must be extractor_id:run_id with non-empty parts")
    return ExtractionRunReference(**parts)
66
+
67
+
68
class ExtractionRecipeManifest(BaseModel):
    """
    Configuration record that describes how an extraction run was set up.

    Unknown fields are rejected.

    :ivar recipe_id: Identifier of the recipe.
    :vartype recipe_id: str
    :ivar extractor_id: Identifier of the extractor plugin the recipe targets.
    :vartype extractor_id: str
    :ivar name: Human-readable recipe name.
    :vartype name: str
    :ivar created_at: International Organization for Standardization 8601 timestamp.
    :vartype created_at: str
    :ivar config: Extractor-specific configuration values; defaults to an empty mapping.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    recipe_id: str
    extractor_id: str
    name: str
    created_at: str
    config: Dict[str, Any] = Field(default_factory=dict)
91
+
92
+
93
class ExtractionItemResult(BaseModel):
    """
    Outcome of an extraction run for one catalog item.

    Unknown fields are rejected.

    :ivar item_id: Identifier of the catalog item.
    :vartype item_id: str
    :ivar status: Outcome label, ``extracted`` or ``skipped``.
    :vartype status: str
    :ivar text_relpath: Relative path to the extracted text artifact, when one was written.
    :vartype text_relpath: str or None
    :ivar producer_extractor_id: Identifier of the extractor that produced the text, when extracted.
    :vartype producer_extractor_id: str or None
    """

    model_config = ConfigDict(extra="forbid")

    item_id: str
    status: str
    text_relpath: Optional[str] = Field(default=None)
    producer_extractor_id: Optional[str] = Field(default=None)
113
+
114
+
115
class ExtractionRunManifest(BaseModel):
    """
    Record describing one extraction run and its per-item outcomes.

    Unknown fields are rejected.

    :ivar run_id: Unique identifier of the run.
    :vartype run_id: str
    :ivar recipe: Recipe manifest the run was built from.
    :vartype recipe: ExtractionRecipeManifest
    :ivar corpus_uri: Uniform resource identifier of the corpus the run belongs to.
    :vartype corpus_uri: str
    :ivar catalog_generated_at: Timestamp of the catalog used for the run.
    :vartype catalog_generated_at: str
    :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
    :vartype created_at: str
    :ivar items: Per-item results; defaults to an empty list.
    :vartype items: list[ExtractionItemResult]
    :ivar stats: Run statistics; defaults to an empty mapping.
    :vartype stats: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    run_id: str
    recipe: ExtractionRecipeManifest
    corpus_uri: str
    catalog_generated_at: str
    created_at: str
    items: List[ExtractionItemResult] = Field(default_factory=list)
    stats: Dict[str, Any] = Field(default_factory=dict)
144
+
145
+
146
def create_extraction_recipe_manifest(*, extractor_id: str, name: str, config: Dict[str, Any]) -> ExtractionRecipeManifest:
    """
    Build a recipe manifest whose identifier is derived from its own contents.

    The identifier is computed by hashing a JSON rendering of the extractor identifier,
    the recipe name and the configuration.

    :param extractor_id: Extractor plugin identifier.
    :type extractor_id: str
    :param name: Human recipe name.
    :type name: str
    :param config: Extractor configuration.
    :type config: dict[str, Any]
    :return: Recipe manifest stamped with the current time.
    :rtype: ExtractionRecipeManifest
    """

    identity = {"extractor_id": extractor_id, "name": name, "config": config}
    # Keys are sorted so the serialized payload does not depend on dict insertion order.
    fingerprint = hash_text(json.dumps(identity, sort_keys=True))
    manifest_fields = {
        "recipe_id": fingerprint,
        "extractor_id": extractor_id,
        "name": name,
        "created_at": utc_now_iso(),
        "config": config,
    }
    return ExtractionRecipeManifest(**manifest_fields)
169
+
170
+
171
def create_extraction_run_manifest(corpus: Corpus, *, recipe: ExtractionRecipeManifest) -> ExtractionRunManifest:
    """
    Start a fresh, empty run manifest for a corpus.

    The run receives a newly generated random identifier and records the timestamp
    of the corpus catalog as it is loaded at call time.

    :param corpus: Corpus associated with the run.
    :type corpus: Corpus
    :param recipe: Recipe manifest.
    :type recipe: ExtractionRecipeManifest
    :return: Run manifest with no items and no statistics yet.
    :rtype: ExtractionRunManifest
    """

    catalog = corpus.load_catalog()
    manifest_fields = {
        "run_id": str(uuid4()),
        "recipe": recipe,
        "corpus_uri": corpus.uri,
        "catalog_generated_at": catalog.generated_at,
        "created_at": utc_now_iso(),
        "items": [],
        "stats": {},
    }
    return ExtractionRunManifest(**manifest_fields)
193
+
194
+
195
def write_extraction_run_manifest(*, run_dir: Path, manifest: ExtractionRunManifest) -> None:
    """
    Save a run manifest as ``manifest.json`` inside the run directory.

    The manifest is written as two-space indented JSON followed by a trailing newline,
    encoded as UTF-8. The run directory itself is not created here.

    :param run_dir: Extraction run directory.
    :type run_dir: Path
    :param manifest: Run manifest to write.
    :type manifest: ExtractionRunManifest
    :return: None.
    :rtype: None
    """

    target = run_dir / "manifest.json"
    serialized = manifest.model_dump_json(indent=2)
    target.write_text(f"{serialized}\n", encoding="utf-8")
209
+
210
+
211
def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str) -> str:
    """
    Write an extracted text artifact for an item into the run directory.

    The artifact is stored as ``text/<item id>.txt`` below the run directory, encoded
    as UTF-8. The ``text`` subdirectory is created on demand.

    :param run_dir: Extraction run directory.
    :type run_dir: Path
    :param item: Catalog item being extracted; only its ``id`` is used.
    :type item: CatalogItem
    :param text: Extracted text.
    :type text: str
    :return: Relative path to the stored text artifact, always with forward slashes.
    :rtype: str
    """

    text_dir = run_dir / "text"
    text_dir.mkdir(parents=True, exist_ok=True)
    relative = Path("text") / f"{item.id}.txt"
    (run_dir / relative).write_text(text, encoding="utf-8")
    # The returned value is recorded in the run manifest. Using the POSIX form keeps
    # the manifest identical regardless of the operating system that produced it
    # (``str(Path)`` would yield backslashes on Windows).
    return relative.as_posix()
231
+
232
+
233
def build_extraction_run(
    corpus: Corpus,
    *,
    extractor_id: str,
    recipe_name: str,
    config: Dict[str, Any],
) -> ExtractionRunManifest:
    """
    Run a named extractor plugin over every catalog item of a corpus.

    A new run directory is created, one text artifact is written per item the extractor
    produces text for, and the resulting manifest (items plus statistics) is persisted
    into that directory before being returned.

    :param corpus: Corpus to extract from.
    :type corpus: Corpus
    :param extractor_id: Extractor plugin identifier.
    :type extractor_id: str
    :param recipe_name: Human-readable recipe name.
    :type recipe_name: str
    :param config: Extractor configuration mapping.
    :type config: dict[str, Any]
    :return: Extraction run manifest describing the build.
    :rtype: ExtractionRunManifest
    :raises KeyError: If the extractor identifier is unknown.
    :raises ValueError: If the extractor configuration is invalid.
    :raises OSError: If the run directory or artifacts cannot be written.
    """

    extractor = get_extractor(extractor_id)
    parsed_config = extractor.validate_config(config)
    manifest = create_extraction_run_manifest(
        corpus,
        recipe=create_extraction_recipe_manifest(
            extractor_id=extractor_id,
            name=recipe_name,
            config=parsed_config.model_dump(),
        ),
    )

    # exist_ok=False: a run directory must never be reused by another run.
    run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
    run_dir.mkdir(parents=True, exist_ok=False)

    catalog = corpus.load_catalog()
    # Counters live directly in the stats mapping; key order is the order written to the manifest.
    stats = {
        "total_items": len(catalog.items),
        "already_text_items": 0,
        "needs_extraction_items": 0,
        "extracted_items": 0,
        "extracted_nonempty_items": 0,
        "extracted_empty_items": 0,
        "skipped_items": 0,
        "converted_items": 0,
    }
    results: List[ExtractionItemResult] = []

    for item in catalog.items.values():
        # "text/markdown" is covered by the generic "text/" prefix test.
        is_text_item = item.media_type.startswith("text/")
        stats["already_text_items" if is_text_item else "needs_extraction_items"] += 1

        outcome = extractor.extract_text(corpus=corpus, item=item, config=parsed_config)
        if outcome is None:
            # The extractor declined this item; record it without an artifact.
            stats["skipped_items"] += 1
            results.append(
                ExtractionItemResult(
                    item_id=item.id,
                    status="skipped",
                    text_relpath=None,
                    producer_extractor_id=None,
                )
            )
            continue

        stats["extracted_items"] += 1
        if outcome.text.strip():
            stats["extracted_nonempty_items"] += 1
            # A non-text item that yielded real text counts as converted.
            if not is_text_item:
                stats["converted_items"] += 1
        else:
            stats["extracted_empty_items"] += 1

        artifact_relpath = write_extracted_text_artifact(run_dir=run_dir, item=item, text=outcome.text)
        results.append(
            ExtractionItemResult(
                item_id=item.id,
                status="extracted",
                text_relpath=artifact_relpath,
                producer_extractor_id=outcome.producer_extractor_id,
            )
        )

    manifest = manifest.model_copy(update={"items": results, "stats": stats})
    write_extraction_run_manifest(run_dir=run_dir, manifest=manifest)
    return manifest
@@ -0,0 +1,33 @@
1
+ """
2
+ Text extraction plugins for Biblicus.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Dict
8
+
9
+ from .base import TextExtractor
10
+ from .cascade import CascadeExtractor
11
+ from .metadata_text import MetadataTextExtractor
12
+ from .pass_through_text import PassThroughTextExtractor
13
+
14
+
15
def get_extractor(extractor_id: str) -> TextExtractor:
    """
    Look up one of the built-in text extractors by its identifier.

    A fresh set of extractor instances is created on every call.

    :param extractor_id: Extractor identifier.
    :type extractor_id: str
    :return: Extractor plugin instance.
    :rtype: TextExtractor
    :raises KeyError: If the extractor identifier is not known.
    """

    built_in = (CascadeExtractor(), MetadataTextExtractor(), PassThroughTextExtractor())
    available: Dict[str, TextExtractor] = {plugin.extractor_id: plugin for plugin in built_in}
    selected = available.get(extractor_id)
    if selected is None:
        raise KeyError(f"Unknown extractor: {extractor_id!r}")
    return selected
@@ -0,0 +1,61 @@
1
+ """
2
+ Base interfaces for text extraction plugins.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, Dict, Optional
9
+
10
+ from pydantic import BaseModel
11
+
12
+ from ..corpus import Corpus
13
+ from ..models import CatalogItem, ExtractedText
14
+
15
+
16
class TextExtractor(ABC):
    """
    Contract every text extraction plugin has to fulfil.

    Extractors are deliberately decoupled from retrieval backends, so that several
    extraction strategies can be compared on the same corpus while the retrieval
    backend stays fixed.

    :ivar extractor_id: Identifier string for the extractor plugin.
    :vartype extractor_id: str
    """

    extractor_id: str

    @abstractmethod
    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Check a raw configuration mapping and convert it into a parsed model.

        :param config: Extractor configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: pydantic.BaseModel
        :raises ValueError: If the configuration is invalid.
        """

        raise NotImplementedError

    @abstractmethod
    def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Produce text for one catalog item.

        A return value of None means the item was deliberately skipped.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item to process.
        :type item: CatalogItem
        :param config: Parsed extractor configuration.
        :type config: pydantic.BaseModel
        :return: Extracted text payload or None when skipped.
        :rtype: ExtractedText or None
        """

        raise NotImplementedError
@@ -0,0 +1,101 @@
1
+ """
2
+ Cascade extractor plugin that composes multiple extractors.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
10
+
11
+ from ..corpus import Corpus
12
+ from ..models import CatalogItem, ExtractedText
13
+ from .base import TextExtractor
14
+
15
+
16
class CascadeStepSpec(BaseModel):
    """
    One entry of a cascade pipeline: which extractor to run and how to configure it.

    Unknown fields are rejected.

    :ivar extractor_id: Non-empty extractor plugin identifier.
    :vartype extractor_id: str
    :ivar config: Extractor configuration mapping; defaults to an empty mapping.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    config: Dict[str, Any] = Field(default_factory=dict)
30
+
31
+
32
class CascadeExtractorConfig(BaseModel):
    """
    Settings accepted by the cascade extractor.

    At least one step is required, unknown fields are rejected, and no step may
    name the cascade extractor itself.

    :ivar steps: Ordered list of extractor steps to try.
    :vartype steps: list[CascadeStepSpec]
    """

    model_config = ConfigDict(extra="forbid")

    steps: List[CascadeStepSpec] = Field(min_length=1)

    @model_validator(mode="after")
    def _forbid_self_reference(self) -> "CascadeExtractorConfig":
        # A cascade that lists itself as a step would recurse without end.
        for step in self.steps:
            if step.extractor_id == "cascade":
                raise ValueError("Cascade extractor cannot include itself as a step")
        return self
49
+
50
+
51
class CascadeExtractor(TextExtractor):
    """
    Extractor that walks through a list of extractors and returns the first usable result.

    "Usable" means the produced text still contains characters after whitespace is stripped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "cascade"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate a cascade configuration mapping.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: CascadeExtractorConfig
        """

        return CascadeExtractorConfig.model_validate(config)

    def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Try the configured steps in order and stop at the first one that yields usable text.

        Steps that skip the item or return only whitespace are passed over.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: CascadeExtractorConfig
        :return: Extracted text payload, or None when no step produced usable text.
        :rtype: ExtractedText or None
        """

        if isinstance(config, CascadeExtractorConfig):
            cascade_config = config
        else:
            cascade_config = CascadeExtractorConfig.model_validate(config)

        # Imported lazily rather than at module top, presumably because the package
        # __init__ that defines get_extractor itself imports this module.
        from . import get_extractor

        for step in cascade_config.steps:
            step_extractor = get_extractor(step.extractor_id)
            step_config = step_extractor.validate_config(step.config)
            candidate = step_extractor.extract_text(corpus=corpus, item=item, config=step_config)
            if candidate is not None and candidate.text.strip():
                return candidate
        return None
@@ -0,0 +1,98 @@
1
+ """
2
+ Metadata-based text extractor plugin.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+ from ..models import CatalogItem, ExtractedText
12
+ from .base import TextExtractor
13
+
14
+
15
class MetadataTextExtractorConfig(BaseModel):
    """
    Settings accepted by the metadata text extractor.

    The extractor is kept minimal and deterministic: its output is a plain text
    rendering built solely from an item's catalog metadata. Unknown fields are rejected.

    :ivar include_title: Whether to include the item title as the first line, if present.
    :vartype include_title: bool
    :ivar include_tags: Whether to include a ``tags: ...`` line, if tags are present.
    :vartype include_tags: bool
    """

    model_config = ConfigDict(extra="forbid")

    include_title: bool = True
    include_tags: bool = True
32
+
33
+
34
class MetadataTextExtractor(TextExtractor):
    """
    Extractor plugin that renders an item's metadata as a short, searchable text.

    The output is meant to be stable and easy to read:

    - When a title exists, it forms the first line.
    - When tags exist, the following line reads ``tags: <comma separated tags>``.

    Typical uses:

    - Retrieval over non-text items that carry meaningful metadata.
    - Comparing downstream retrieval backends while holding extraction stable.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "metadata-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate a configuration mapping for this extractor.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MetadataTextExtractorConfig
        """

        return MetadataTextExtractorConfig.model_validate(config)

    def extract_text(self, *, corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Build a text payload from the item's title and tags.

        :param corpus: Corpus containing the item bytes; not used by this extractor.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MetadataTextExtractorConfig
        :return: Extracted text payload, or ``None`` if no metadata is available.
        :rtype: ExtractedText or None
        """

        if isinstance(config, MetadataTextExtractorConfig):
            settings = config
        else:
            settings = MetadataTextExtractorConfig.model_validate(config)
        _ = corpus
        output_lines: list[str] = []

        if settings.include_title and isinstance(item.title, str):
            title = item.title.strip()
            if title:
                output_lines.append(title)

        # Non-string and blank tags are dropped; the remaining ones are trimmed.
        cleaned_tags = []
        for tag in item.tags:
            if isinstance(tag, str) and tag.strip():
                cleaned_tags.append(tag.strip())
        if settings.include_tags and cleaned_tags:
            output_lines.append("tags: " + ", ".join(cleaned_tags))

        if output_lines:
            return ExtractedText(text="\n".join(output_lines), producer_extractor_id=self.extractor_id)
        return None
@@ -0,0 +1,74 @@
1
+ """
2
+ Pass-through extractor for text items.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, Optional
8
+
9
+ from pydantic import BaseModel, ConfigDict
10
+
11
+ from ..corpus import Corpus
12
+ from ..frontmatter import parse_front_matter
13
+ from ..models import CatalogItem, ExtractedText
14
+ from .base import TextExtractor
15
+
16
+
17
class PassThroughTextExtractorConfig(BaseModel):
    """
    Settings accepted by the pass-through text extractor.

    The extractor takes no options; any supplied field is rejected.
    """

    model_config = ConfigDict(extra="forbid")
25
+
26
+
27
class PassThroughTextExtractor(TextExtractor):
    """
    Extractor plugin that returns the stored content of text items as-is.

    Items whose media type is not a text type are skipped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pass-through-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and validate a configuration mapping for this extractor.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: PassThroughTextExtractorConfig
        """

        return PassThroughTextExtractorConfig.model_validate(config)

    def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Read the item's file from the corpus and return it as UTF-8 text.

        For ``text/markdown`` items only the body returned by the front matter
        parser is kept; other text items are returned in full.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model; not used by this extractor.
        :type config: PassThroughTextExtractorConfig
        :return: Extracted text payload, or None if the item is not text.
        :rtype: ExtractedText or None
        """

        _ = config
        media_type = item.media_type
        # "text/markdown" also matches the generic "text/" prefix.
        if not media_type.startswith("text/"):
            return None

        content = (corpus.root / item.relpath).read_bytes().decode("utf-8")
        if media_type == "text/markdown":
            content = parse_front_matter(content).body
        return ExtractedText(text=content, producer_extractor_id=self.extractor_id)