biblicus 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/backends/scan.py +81 -4
- biblicus/backends/sqlite_full_text_search.py +63 -2
- biblicus/cli.py +123 -0
- biblicus/constants.py +2 -0
- biblicus/corpus.py +431 -2
- biblicus/extraction.py +330 -0
- biblicus/extractors/__init__.py +33 -0
- biblicus/extractors/base.py +61 -0
- biblicus/extractors/cascade.py +101 -0
- biblicus/extractors/metadata_text.py +98 -0
- biblicus/extractors/pass_through_text.py +74 -0
- biblicus/hook_logging.py +185 -0
- biblicus/hook_manager.py +205 -0
- biblicus/hooks.py +265 -0
- biblicus/ignore.py +67 -0
- biblicus/models.py +20 -0
- biblicus/sources.py +45 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/METADATA +101 -1
- biblicus-0.2.0.dist-info/RECORD +32 -0
- biblicus-0.1.1.dist-info/RECORD +0 -22
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/WHEEL +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.1.1.dist-info → biblicus-0.2.0.dist-info}/top_level.txt +0 -0
biblicus/extraction.py
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text extraction runs for Biblicus.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
from uuid import uuid4
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
from .corpus import Corpus
|
|
15
|
+
from .extractors import get_extractor
|
|
16
|
+
from .models import CatalogItem
|
|
17
|
+
from .retrieval import hash_text
|
|
18
|
+
from .time import utc_now_iso
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ExtractionRunReference(BaseModel):
    """
    Pointer to a single extraction run, made of an extractor identifier and a run identifier.

    Both parts must be non-empty strings and unknown fields are rejected.

    :ivar extractor_id: Identifier of the extractor plugin the run belongs to.
    :vartype extractor_id: str
    :ivar run_id: Identifier of the run itself.
    :vartype run_id: str
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    run_id: str = Field(min_length=1)

    def as_string(self) -> str:
        """
        Render the reference in its compact textual form.

        :return: The two identifiers joined by a colon, ``extractor_id:run_id``.
        :rtype: str
        """

        return ":".join((self.extractor_id, self.run_id))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def parse_extraction_run_reference(value: str) -> ExtractionRunReference:
    """
    Turn a textual ``extractor_id:run_id`` reference into a model instance.

    Only the first colon separates the two parts, so the run identifier may itself
    contain colons. Surrounding whitespace is removed from both parts.

    :param value: Raw reference string.
    :type value: str
    :return: Parsed extraction run reference.
    :rtype: ExtractionRunReference
    :raises ValueError: If the colon is missing or either part is empty after stripping.
    """

    head, separator, tail = value.partition(":")
    if not separator:
        raise ValueError("Extraction run reference must be extractor_id:run_id")

    extractor_id = head.strip()
    run_id = tail.strip()
    if not (extractor_id and run_id):
        raise ValueError("Extraction run reference must be extractor_id:run_id with non-empty parts")

    return ExtractionRunReference(extractor_id=extractor_id, run_id=run_id)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ExtractionRecipeManifest(BaseModel):
    """
    Description of the extractor and configuration used for an extraction run.

    Unknown fields are rejected.

    :ivar recipe_id: Recipe identifier; ``create_extraction_recipe_manifest`` derives it by
        hashing the extractor identifier, name and configuration.
    :vartype recipe_id: str
    :ivar extractor_id: Identifier of the extractor plugin.
    :vartype extractor_id: str
    :ivar name: Human-readable name of the recipe.
    :vartype name: str
    :ivar created_at: Creation timestamp string (ISO 8601).
    :vartype created_at: str
    :ivar config: Configuration values specific to the extractor; empty by default.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    recipe_id: str
    extractor_id: str
    name: str
    created_at: str
    config: Dict[str, Any] = Field(default_factory=dict)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ExtractionItemResult(BaseModel):
    """
    Outcome of an extraction run for one catalog item.

    Unknown fields are rejected.

    :ivar item_id: Identifier of the catalog item.
    :vartype item_id: str
    :ivar status: Outcome label; ``build_extraction_run`` writes ``extracted`` or ``skipped``.
    :vartype status: str
    :ivar text_relpath: Path of the text artifact relative to the run directory, set when text was extracted.
    :vartype text_relpath: str or None
    :ivar producer_extractor_id: Identifier of the extractor that produced the text, set when text was extracted.
    :vartype producer_extractor_id: str or None
    """

    model_config = ConfigDict(extra="forbid")

    item_id: str
    status: str
    text_relpath: Optional[str] = None
    producer_extractor_id: Optional[str] = None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class ExtractionRunManifest(BaseModel):
    """
    Record describing one extraction run over a corpus.

    Unknown fields are rejected.

    :ivar run_id: Unique identifier of the run.
    :vartype run_id: str
    :ivar recipe: Recipe manifest the run was built from.
    :vartype recipe: ExtractionRecipeManifest
    :ivar corpus_uri: Uniform resource identifier of the corpus.
    :vartype corpus_uri: str
    :ivar catalog_generated_at: ``generated_at`` value of the catalog the run was created against.
    :vartype catalog_generated_at: str
    :ivar created_at: Creation timestamp string (ISO 8601) of the run.
    :vartype created_at: str
    :ivar items: Per-item results; empty by default.
    :vartype items: list[ExtractionItemResult]
    :ivar stats: Aggregate run statistics; empty by default.
    :vartype stats: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    run_id: str
    recipe: ExtractionRecipeManifest
    corpus_uri: str
    catalog_generated_at: str
    created_at: str
    items: List[ExtractionItemResult] = Field(default_factory=list)
    stats: Dict[str, Any] = Field(default_factory=dict)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def create_extraction_recipe_manifest(*, extractor_id: str, name: str, config: Dict[str, Any]) -> ExtractionRecipeManifest:
    """
    Build a recipe manifest whose identifier is a hash of its own content.

    The identifier is computed from the extractor identifier, the name and the
    configuration; the creation timestamp is not part of the hashed payload.

    :param extractor_id: Extractor plugin identifier.
    :type extractor_id: str
    :param name: Human-readable recipe name.
    :type name: str
    :param config: Extractor configuration; must be serializable by ``json.dumps``.
    :type config: dict[str, Any]
    :return: Recipe manifest.
    :rtype: ExtractionRecipeManifest
    """

    identity = {"extractor_id": extractor_id, "name": name, "config": config}
    # sort_keys makes the serialized payload, and therefore the hash, independent of key order.
    serialized_identity = json.dumps(identity, sort_keys=True)
    return ExtractionRecipeManifest(
        recipe_id=hash_text(serialized_identity),
        extractor_id=extractor_id,
        name=name,
        created_at=utc_now_iso(),
        config=config,
    )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def create_extraction_run_manifest(corpus: Corpus, *, recipe: ExtractionRecipeManifest) -> ExtractionRunManifest:
    """
    Start a fresh, empty run manifest for the given corpus and recipe.

    The manifest gets a newly generated random run identifier, the corpus URI, the
    ``generated_at`` value of the currently loaded catalog and the current timestamp.
    Items and statistics start out empty.

    :param corpus: Corpus associated with the run.
    :type corpus: Corpus
    :param recipe: Recipe manifest.
    :type recipe: ExtractionRecipeManifest
    :return: Run manifest.
    :rtype: ExtractionRunManifest
    """

    catalog = corpus.load_catalog()
    manifest_fields = {
        "run_id": str(uuid4()),
        "recipe": recipe,
        "corpus_uri": corpus.uri,
        "catalog_generated_at": catalog.generated_at,
        "created_at": utc_now_iso(),
        "items": [],
        "stats": {},
    }
    return ExtractionRunManifest(**manifest_fields)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def write_extraction_run_manifest(*, run_dir: Path, manifest: ExtractionRunManifest) -> None:
    """
    Save a run manifest as ``manifest.json`` inside the run directory.

    The manifest is written as indented JSON followed by a trailing newline, encoded as UTF-8.

    :param run_dir: Extraction run directory.
    :type run_dir: Path
    :param manifest: Run manifest to write.
    :type manifest: ExtractionRunManifest
    :return: None.
    :rtype: None
    """

    serialized_manifest = manifest.model_dump_json(indent=2) + "\n"
    (run_dir / "manifest.json").write_text(serialized_manifest, encoding="utf-8")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def write_extracted_text_artifact(*, run_dir: Path, item: CatalogItem, text: str) -> str:
    """
    Write an extracted text artifact for an item into the run directory.

    The text is stored as ``text/<item id>.txt`` below the run directory, encoded as UTF-8.
    The ``text`` directory is created when it does not exist yet.

    :param run_dir: Extraction run directory.
    :type run_dir: Path
    :param item: Catalog item being extracted; only its ``id`` is used.
    :type item: CatalogItem
    :param text: Extracted text.
    :type text: str
    :return: Path to the stored text artifact, relative to the run directory, with forward slashes.
    :rtype: str
    :raises OSError: If the directory or file cannot be written.
    """

    text_dir = run_dir / "text"
    text_dir.mkdir(parents=True, exist_ok=True)

    relative_path = Path("text") / f"{item.id}.txt"
    (run_dir / relative_path).write_text(text, encoding="utf-8")

    # The returned path is recorded in the run manifest. as_posix() keeps it identical on
    # every platform; str() would produce backslashes on Windows.
    return relative_path.as_posix()
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def build_extraction_run(
    corpus: Corpus,
    *,
    extractor_id: str,
    recipe_name: str,
    config: Dict[str, Any],
) -> ExtractionRunManifest:
    """
    Run one extractor over every catalog item and persist the results as a new extraction run.

    For each item the extractor either returns text, which is written as an artifact in the
    run directory, or ``None``, which is recorded as a skipped item. The finished manifest,
    including per-item results and aggregate statistics, is written to the run directory.

    :param corpus: Corpus to extract from.
    :type corpus: Corpus
    :param extractor_id: Extractor plugin identifier.
    :type extractor_id: str
    :param recipe_name: Human-readable recipe name.
    :type recipe_name: str
    :param config: Extractor configuration mapping.
    :type config: dict[str, Any]
    :return: Extraction run manifest describing the build.
    :rtype: ExtractionRunManifest
    :raises KeyError: If the extractor identifier is unknown.
    :raises ValueError: If the extractor configuration is invalid.
    :raises OSError: If the run directory or artifacts cannot be written.
    """

    extractor = get_extractor(extractor_id)
    parsed_config = extractor.validate_config(config)
    recipe = create_extraction_recipe_manifest(
        extractor_id=extractor_id,
        name=recipe_name,
        config=parsed_config.model_dump(),
    )
    manifest = create_extraction_run_manifest(corpus, recipe=recipe)

    run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
    # exist_ok=False: an already existing directory for this run identifier is an error.
    run_dir.mkdir(parents=True, exist_ok=False)

    catalog = corpus.load_catalog()
    results: List[ExtractionItemResult] = []
    text_item_total = 0
    non_text_item_total = 0
    skipped_total = 0
    nonempty_total = 0
    empty_total = 0
    converted_total = 0

    for item in catalog.items.values():
        # "text/markdown" also starts with "text/", so the prefix test covers it.
        item_is_text = item.media_type.startswith("text/")
        if item_is_text:
            text_item_total += 1
        else:
            non_text_item_total += 1

        extracted_text = extractor.extract_text(corpus=corpus, item=item, config=parsed_config)
        if extracted_text is None:
            skipped_total += 1
            results.append(
                ExtractionItemResult(
                    item_id=item.id,
                    status="skipped",
                    text_relpath=None,
                    producer_extractor_id=None,
                )
            )
            continue

        if extracted_text.text.strip():
            nonempty_total += 1
            # A non-text item that yielded usable text counts as converted.
            if not item_is_text:
                converted_total += 1
        else:
            empty_total += 1

        # The artifact is written even when the text is empty or whitespace only.
        relpath = write_extracted_text_artifact(run_dir=run_dir, item=item, text=extracted_text.text)
        results.append(
            ExtractionItemResult(
                item_id=item.id,
                status="extracted",
                text_relpath=relpath,
                producer_extractor_id=extracted_text.producer_extractor_id,
            )
        )

    stats = {
        "total_items": len(catalog.items),
        "already_text_items": text_item_total,
        "needs_extraction_items": non_text_item_total,
        # Every extracted item is counted as exactly one of non-empty or empty.
        "extracted_items": nonempty_total + empty_total,
        "extracted_nonempty_items": nonempty_total,
        "extracted_empty_items": empty_total,
        "skipped_items": skipped_total,
        "converted_items": converted_total,
    }
    manifest = manifest.model_copy(update={"items": results, "stats": stats})
    write_extraction_run_manifest(run_dir=run_dir, manifest=manifest)
    return manifest
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text extraction plugins for Biblicus.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Dict
|
|
8
|
+
|
|
9
|
+
from .base import TextExtractor
|
|
10
|
+
from .cascade import CascadeExtractor
|
|
11
|
+
from .metadata_text import MetadataTextExtractor
|
|
12
|
+
from .pass_through_text import PassThroughTextExtractor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_extractor(extractor_id: str) -> TextExtractor:
    """
    Resolve a built-in text extractor by identifier.

    A new extractor instance is created on every call. Only the requested extractor is
    instantiated.

    :param extractor_id: Extractor identifier.
    :type extractor_id: str
    :return: Extractor plugin instance.
    :rtype: TextExtractor
    :raises KeyError: If the extractor identifier is not known.
    """

    # Map identifiers to classes, not instances, so a lookup does not construct every plugin.
    extractor_classes: Dict[str, type] = {
        CascadeExtractor.extractor_id: CascadeExtractor,
        MetadataTextExtractor.extractor_id: MetadataTextExtractor,
        PassThroughTextExtractor.extractor_id: PassThroughTextExtractor,
    }
    extractor_class = extractor_classes.get(extractor_id)
    if extractor_class is None:
        raise KeyError(f"Unknown extractor: {extractor_id!r}")
    return extractor_class()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base interfaces for text extraction plugins.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any, Dict, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from ..corpus import Corpus
|
|
13
|
+
from ..models import CatalogItem, ExtractedText
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TextExtractor(ABC):
    """
    Contract every text extraction plugin implements.

    An extractor turns a catalog item into text. It is deliberately decoupled from
    retrieval backends so that extraction strategies can be exchanged and compared
    while the corpus and the retrieval backend stay the same.

    :ivar extractor_id: Identifier string for the extractor plugin.
    :vartype extractor_id: str
    """

    extractor_id: str

    @abstractmethod
    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Check a raw configuration mapping and convert it into a parsed model.

        :param config: Extractor configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed configuration model.
        :rtype: pydantic.BaseModel
        :raises ValueError: If the configuration is invalid.
        """

        raise NotImplementedError

    @abstractmethod
    def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Produce text for one catalog item.

        A return value of ``None`` means the extractor deliberately skipped the item.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item to process.
        :type item: CatalogItem
        :param config: Parsed extractor configuration.
        :type config: pydantic.BaseModel
        :return: Extracted text payload or None when skipped.
        :rtype: ExtractedText or None
        """

        raise NotImplementedError
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cascade extractor plugin that composes multiple extractors.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..models import CatalogItem, ExtractedText
|
|
13
|
+
from .base import TextExtractor
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CascadeStepSpec(BaseModel):
    """
    One entry of a cascade: which extractor to run and with what configuration.

    Unknown fields are rejected.

    :ivar extractor_id: Identifier of the extractor plugin for this step; must be non-empty.
    :vartype extractor_id: str
    :ivar config: Raw configuration mapping handed to that extractor; empty by default.
    :vartype config: dict[str, Any]
    """

    model_config = ConfigDict(extra="forbid")

    extractor_id: str = Field(min_length=1)
    config: Dict[str, Any] = Field(default_factory=dict)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CascadeExtractorConfig(BaseModel):
    """
    Settings of the cascade extractor.

    At least one step is required, unknown fields are rejected, and no step may name
    the cascade extractor itself.

    :ivar steps: Extractor steps in the order they are tried.
    :vartype steps: list[CascadeStepSpec]
    """

    model_config = ConfigDict(extra="forbid")

    steps: List[CascadeStepSpec] = Field(min_length=1)

    @model_validator(mode="after")
    def _forbid_self_reference(self) -> "CascadeExtractorConfig":
        # Reject a step that names the cascade extractor itself.
        for step in self.steps:
            if step.extractor_id == "cascade":
                raise ValueError("Cascade extractor cannot include itself as a step")
        return self
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class CascadeExtractor(TextExtractor):
    """
    Extractor that tries a sequence of extractors and uses the first usable text result.

    A result is considered usable when its text is non-empty after stripping whitespace.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "cascade"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Validate cascade extractor configuration.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: CascadeExtractorConfig
        """

        return CascadeExtractorConfig.model_validate(config)

    def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Run each configured extractor step until usable text is produced.

        Steps are tried in order. A step that returns ``None`` or text that is empty after
        stripping whitespace is passed over and the next step is tried.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: CascadeExtractorConfig
        :return: Result of the first step that yields usable text, or None if no step does.
        :rtype: ExtractedText or None
        :raises KeyError: If a step names an unknown extractor.
        """

        # Imported inside the function because the package __init__ imports this module,
        # so a module-level import would be circular. Done once, outside the loop.
        from . import get_extractor

        cascade_config = config if isinstance(config, CascadeExtractorConfig) else CascadeExtractorConfig.model_validate(config)
        for step in cascade_config.steps:
            extractor = get_extractor(step.extractor_id)
            parsed_step_config = extractor.validate_config(step.config)
            result = extractor.extract_text(corpus=corpus, item=item, config=parsed_step_config)
            if result is None:
                continue
            if not result.text.strip():
                continue
            return result
        return None
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Metadata-based text extractor plugin.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
|
|
11
|
+
from ..models import CatalogItem, ExtractedText
|
|
12
|
+
from .base import TextExtractor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MetadataTextExtractorConfig(BaseModel):
    """
    Settings of the metadata text extractor.

    The extractor builds its plain text output from an item's title and tags, and these
    switches select which of the two are emitted. Unknown fields are rejected.

    :ivar include_title: Emit the item title as the first line when a title is present. Defaults to True.
    :vartype include_title: bool
    :ivar include_tags: Emit a ``tags: ...`` line when tags are present. Defaults to True.
    :vartype include_tags: bool
    """

    model_config = ConfigDict(extra="forbid")

    include_title: bool = Field(default=True)
    include_tags: bool = Field(default=True)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MetadataTextExtractor(TextExtractor):
    """
    Extractor plugin that turns item metadata into a short block of searchable text.

    The output has at most two lines:

    - The title, when the item has one and titles are enabled.
    - ``tags: <comma separated tags>``, when the item has tags and tags are enabled.

    Typical uses:

    - Retrieval over non-text items that carry meaningful metadata.
    - Comparing downstream retrieval backends while holding extraction stable.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "metadata-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and check a raw configuration mapping.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: MetadataTextExtractorConfig
        """

        return MetadataTextExtractorConfig.model_validate(config)

    def extract_text(self, *, corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Build the metadata text for one item.

        Title and tags are stripped of surrounding whitespace; blank values and
        non-string values are ignored.

        :param corpus: Corpus containing the item bytes; not used by this extractor.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model.
        :type config: MetadataTextExtractorConfig
        :return: Extracted text payload, or ``None`` if no line could be produced.
        :rtype: ExtractedText or None
        """

        if isinstance(config, MetadataTextExtractorConfig):
            settings = config
        else:
            settings = MetadataTextExtractorConfig.model_validate(config)
        _ = corpus

        output_lines: list[str] = []

        if settings.include_title and isinstance(item.title, str):
            cleaned_title = item.title.strip()
            if cleaned_title:
                output_lines.append(cleaned_title)

        cleaned_tags = []
        for tag in item.tags:
            if not isinstance(tag, str):
                continue
            cleaned_tag = tag.strip()
            if cleaned_tag:
                cleaned_tags.append(cleaned_tag)
        if settings.include_tags and cleaned_tags:
            joined_tags = ", ".join(cleaned_tags)
            output_lines.append(f"tags: {joined_tags}")

        if output_lines:
            return ExtractedText(text="\n".join(output_lines), producer_extractor_id=self.extractor_id)
        return None
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pass-through extractor for text items.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus
|
|
12
|
+
from ..frontmatter import parse_front_matter
|
|
13
|
+
from ..models import CatalogItem, ExtractedText
|
|
14
|
+
from .base import TextExtractor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PassThroughTextExtractorConfig(BaseModel):
    """
    Settings of the pass-through text extractor.

    The model declares no fields and rejects unknown ones, so only an empty
    configuration is valid.
    """

    model_config = ConfigDict(extra="forbid")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class PassThroughTextExtractor(TextExtractor):
    """
    Extractor plugin that hands back the content of text items as stored in the corpus.

    Items whose media type does not start with ``text/`` are skipped.

    :ivar extractor_id: Extractor identifier.
    :vartype extractor_id: str
    """

    extractor_id = "pass-through-text"

    def validate_config(self, config: Dict[str, Any]) -> BaseModel:
        """
        Parse and check a raw configuration mapping.

        :param config: Configuration mapping.
        :type config: dict[str, Any]
        :return: Parsed config.
        :rtype: PassThroughTextExtractorConfig
        """

        return PassThroughTextExtractorConfig.model_validate(config)

    def extract_text(self, *, corpus: Corpus, item: CatalogItem, config: BaseModel) -> Optional[ExtractedText]:
        """
        Read the item file from the corpus and return its content decoded as UTF-8.

        For ``text/markdown`` items the front matter is parsed off and only the
        document body is returned.

        :param corpus: Corpus containing the item bytes.
        :type corpus: Corpus
        :param item: Catalog item being processed.
        :type item: CatalogItem
        :param config: Parsed configuration model; not used by this extractor.
        :type config: PassThroughTextExtractorConfig
        :return: Extracted text payload, or None if the item is not text.
        :rtype: ExtractedText or None
        """

        _ = config
        media_type = item.media_type
        # "text/markdown" also starts with "text/", so one prefix test decides whether to skip.
        if not media_type.startswith("text/"):
            return None

        decoded_text = (corpus.root / item.relpath).read_bytes().decode("utf-8")
        if media_type == "text/markdown":
            text = parse_front_matter(decoded_text).body
        else:
            text = decoded_text
        return ExtractedText(text=text, producer_extractor_id=self.extractor_id)
|