biblicus-0.14.0-py3-none-any.whl → biblicus-0.15.0-py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +2 -2
- biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus/ai/__init__.py +39 -0
- biblicus/ai/embeddings.py +114 -0
- biblicus/ai/llm.py +138 -0
- biblicus/ai/models.py +226 -0
- biblicus/analysis/__init__.py +5 -2
- biblicus/analysis/markov.py +1624 -0
- biblicus/analysis/models.py +754 -1
- biblicus/analysis/topic_modeling.py +98 -19
- biblicus/backends/sqlite_full_text_search.py +4 -2
- biblicus/cli.py +118 -23
- biblicus/recipes.py +136 -0
- biblicus/text/__init__.py +43 -0
- biblicus/text/annotate.py +222 -0
- biblicus/text/extract.py +210 -0
- biblicus/text/link.py +519 -0
- biblicus/text/markup.py +200 -0
- biblicus/text/models.py +319 -0
- biblicus/text/prompts.py +113 -0
- biblicus/text/redact.py +229 -0
- biblicus/text/slice.py +155 -0
- biblicus/text/tool_loop.py +334 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/METADATA +88 -25
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/RECORD +30 -15
- biblicus/analysis/llm.py +0 -106
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/WHEEL +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/top_level.txt +0 -0
biblicus/analysis/topic_modeling.py
CHANGED

@@ -13,12 +13,12 @@ from typing import Any, Dict, List, Tuple
 
 from pydantic import BaseModel
 
+from ..ai.llm import generate_completion
 from ..corpus import Corpus
 from ..models import ExtractionRunReference
 from ..retrieval import hash_text
 from ..time import utc_now_iso
 from .base import CorpusAnalysisBackend
-from .llm import generate_completion
 from .models import (
     AnalysisRecipeManifest,
     AnalysisRunInput,
@@ -45,7 +45,18 @@ from .models import (
 
 
 @dataclass
-class
+class TopicModelingDocument:
+    """
+    Text document input for topic modeling.
+
+    :ivar document_id: Stable identifier for this document in the topic modeling stage.
+    :vartype document_id: str
+    :ivar source_item_id: Corpus item identifier the text was derived from.
+    :vartype source_item_id: str
+    :ivar text: Document text content.
+    :vartype text: str
+    """
+
     document_id: str
     source_item_id: str
     text: str
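
The new TopicModelingDocument dataclass is the input unit for every stage below. A minimal construction sketch (the field values are illustrative, not taken from the package):

    from biblicus.analysis.topic_modeling import TopicModelingDocument

    # Hypothetical identifiers; any stable strings work.
    document = TopicModelingDocument(
        document_id="item-001",
        source_item_id="item-001",
        text="Extracted text for this corpus item.",
    )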
@@ -190,6 +201,74 @@ def _run_topic_modeling(
     return output
 
 
+def run_topic_modeling_for_documents(
+    *,
+    documents: List[TopicModelingDocument],
+    config: TopicModelingRecipeConfig,
+) -> TopicModelingReport:
+    """
+    Run topic modeling using caller-provided documents.
+
+    :param documents: Pre-collected documents to model.
+    :type documents: list[TopicModelingDocument]
+    :param config: Topic modeling recipe configuration.
+    :type config: TopicModelingRecipeConfig
+    :return: Topic modeling report with topic assignments.
+    :rtype: TopicModelingReport
+    """
+    text_report = TopicModelingTextCollectionReport(
+        status=TopicModelingStageStatus.COMPLETE,
+        source_items=len({doc.source_item_id for doc in documents}),
+        documents=len(documents),
+        sample_size=config.text_source.sample_size,
+        min_text_characters=config.text_source.min_text_characters,
+        empty_texts=len([doc for doc in documents if not doc.text.strip()]),
+        skipped_items=0,
+        warnings=[],
+        errors=[],
+    )
+
+    llm_extraction_report, extracted_documents = _apply_llm_extraction(
+        documents=documents,
+        config=config.llm_extraction,
+    )
+
+    lexical_report, lexical_documents = _apply_lexical_processing(
+        documents=extracted_documents,
+        config=config.lexical_processing,
+    )
+
+    bertopic_report, topics = _run_bertopic(
+        documents=lexical_documents,
+        config=config.bertopic_analysis,
+    )
+
+    fine_tuning_report, labeled_topics = _apply_llm_fine_tuning(
+        topics=topics,
+        documents=lexical_documents,
+        config=config.llm_fine_tuning,
+    )
+
+    return TopicModelingReport(
+        text_collection=text_report,
+        llm_extraction=llm_extraction_report,
+        lexical_processing=lexical_report,
+        bertopic_analysis=bertopic_report,
+        llm_fine_tuning=fine_tuning_report,
+        topics=labeled_topics,
+        warnings=(
+            text_report.warnings
+            + llm_extraction_report.warnings
+            + bertopic_report.warnings
+            + fine_tuning_report.warnings
+        ),
+        errors=text_report.errors
+        + llm_extraction_report.errors
+        + bertopic_report.errors
+        + fine_tuning_report.errors,
+    )
+
+
 def _create_recipe_manifest(
     *, name: str, config: TopicModelingRecipeConfig
 ) -> AnalysisRecipeManifest:
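
run_topic_modeling_for_documents is the new public entry point for callers who already hold documents and do not need corpus-backed text collection. A minimal sketch, assuming TopicModelingRecipeConfig lives in biblicus.analysis.models and can be constructed with defaults (neither assumption is shown in this diff):

    from biblicus.analysis.models import TopicModelingRecipeConfig
    from biblicus.analysis.topic_modeling import (
        TopicModelingDocument,
        run_topic_modeling_for_documents,
    )

    documents = [
        TopicModelingDocument(document_id="d1", source_item_id="item-1", text="First text."),
        TopicModelingDocument(document_id="d2", source_item_id="item-2", text="Second text."),
    ]
    report = run_topic_modeling_for_documents(
        documents=documents,
        config=TopicModelingRecipeConfig(),  # assumption: defaults are usable
    )
    print(report.topics)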
@@ -226,14 +305,14 @@ def _collect_documents(
     corpus: Corpus,
     extraction_run: ExtractionRunReference,
     config: TopicModelingTextSourceConfig,
-) -> Tuple[List[
+) -> Tuple[List[TopicModelingDocument], TopicModelingTextCollectionReport]:
     manifest = corpus.load_extraction_run_manifest(
         extractor_id=extraction_run.extractor_id,
         run_id=extraction_run.run_id,
     )
     warnings: List[str] = []
     errors: List[str] = []
-    documents: List[
+    documents: List[TopicModelingDocument] = []
     skipped_items = 0
     empty_texts = 0
 
@@ -256,7 +335,7 @@ def _collect_documents(
             skipped_items += 1
             continue
         documents.append(
-
+            TopicModelingDocument(
                 document_id=item_result.item_id,
                 source_item_id=item_result.item_id,
                 text=text_value,
@@ -286,9 +365,9 @@ def _collect_documents(
 
 def _apply_llm_extraction(
     *,
-    documents: List[
+    documents: List[TopicModelingDocument],
     config: TopicModelingLlmExtractionConfig,
-) -> Tuple[TopicModelingLlmExtractionReport, List[
+) -> Tuple[TopicModelingLlmExtractionReport, List[TopicModelingDocument]]:
     if not config.enabled:
         report = TopicModelingLlmExtractionReport(
             status=TopicModelingStageStatus.SKIPPED,
@@ -300,7 +379,7 @@ def _apply_llm_extraction(
         )
         return report, list(documents)
 
-    extracted_documents: List[
+    extracted_documents: List[TopicModelingDocument] = []
     errors: List[str] = []
 
     for document in documents:
@@ -315,7 +394,7 @@ def _apply_llm_extraction(
             errors.append(f"LLM extraction returned empty output for {document.document_id}")
             continue
         extracted_documents.append(
-
+            TopicModelingDocument(
                 document_id=document.document_id,
                 source_item_id=document.source_item_id,
                 text=response_text,
@@ -328,7 +407,7 @@ def _apply_llm_extraction(
             continue
         for index, item_text in enumerate(items, start=1):
             extracted_documents.append(
-
+                TopicModelingDocument(
                     document_id=f"{document.document_id}:{index}",
                     source_item_id=document.source_item_id,
                     text=item_text,
@@ -381,9 +460,9 @@ def _parse_itemized_response(response_text: str) -> List[str]:
 
 def _apply_lexical_processing(
     *,
-    documents: List[
+    documents: List[TopicModelingDocument],
     config: TopicModelingLexicalProcessingConfig,
-) -> Tuple[TopicModelingLexicalProcessingReport, List[
+) -> Tuple[TopicModelingLexicalProcessingReport, List[TopicModelingDocument]]:
     if not config.enabled:
         report = TopicModelingLexicalProcessingReport(
             status=TopicModelingStageStatus.SKIPPED,
@@ -395,7 +474,7 @@ def _apply_lexical_processing(
         )
         return report, list(documents)
 
-    processed: List[
+    processed: List[TopicModelingDocument] = []
     for document in documents:
         text_value = document.text
         if config.lowercase:
@@ -405,7 +484,7 @@ def _apply_lexical_processing(
         if config.collapse_whitespace:
             text_value = re.sub(r"\s+", " ", text_value).strip()
         processed.append(
-
+            TopicModelingDocument(
                 document_id=document.document_id,
                 source_item_id=document.source_item_id,
                 text=text_value,
@@ -425,7 +504,7 @@ def _apply_lexical_processing(
 
 def _run_bertopic(
     *,
-    documents: List[
+    documents: List[TopicModelingDocument],
     config: TopicModelingBerTopicConfig,
 ) -> Tuple[TopicModelingBerTopicReport, List[TopicModelingTopic]]:
     try:
@@ -496,9 +575,9 @@ def _run_bertopic(
 
 
 def _group_documents_by_topic(
-    documents: List[
-) -> Dict[int, List[
-    grouped: Dict[int, List[
+    documents: List[TopicModelingDocument], assignments: List[int]
+) -> Dict[int, List[TopicModelingDocument]]:
+    grouped: Dict[int, List[TopicModelingDocument]] = {}
     for index, topic_id in enumerate(assignments):
         grouped.setdefault(int(topic_id), []).append(documents[index])
     return grouped
@@ -514,7 +593,7 @@ def _resolve_topic_keywords(*, topic_model: Any, topic_id: int) -> List[TopicMod
 def _apply_llm_fine_tuning(
     *,
     topics: List[TopicModelingTopic],
-    documents: List[
+    documents: List[TopicModelingDocument],
     config: TopicModelingLlmFineTuningConfig,
 ) -> Tuple[TopicModelingLlmFineTuningReport, List[TopicModelingTopic]]:
     if not config.enabled:
biblicus/backends/sqlite_full_text_search.py
CHANGED

@@ -459,7 +459,8 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
     :return: None.
     :rtype: None
     """
-    conn.execute(
+    conn.execute(
+        """
         CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
             content,
             item_id UNINDEXED,
@@ -470,7 +471,8 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
             start_offset UNINDEXED,
             end_offset UNINDEXED
         )
-    """
+        """
+    )
 
 
 def _build_full_text_search_index(
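
The change only splits the SQL string literal onto its own lines; the fts5 schema itself is unchanged. For reference, a sketch of querying the resulting table with sqlite3 (the database path is hypothetical; the table and column names come from the schema above):

    import sqlite3

    conn = sqlite3.connect("index.sqlite")  # hypothetical path
    rows = conn.execute(
        "SELECT item_id, content FROM chunks_full_text_search "
        "WHERE chunks_full_text_search MATCH ?",
        ("search terms",),
    ).fetchall()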
biblicus/cli.py
CHANGED

@@ -394,7 +394,7 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    import
+    from .recipes import load_recipe_view
 
     corpus = (
         Corpus.open(arguments.corpus)
@@ -404,11 +404,11 @@ def cmd_extract_build(arguments: argparse.Namespace) -> int:
 
     # Load recipe from file if --recipe is provided
     if getattr(arguments, "recipe", None):
-
-
-
-
-
+        recipe_data = load_recipe_view(
+            arguments.recipe,
+            recipe_label="Recipe file",
+            mapping_error_message="Extraction recipe must be a mapping/object",
+        )
        loaded_extractor_id = recipe_data.get("extractor_id", "pipeline")
        loaded_config = recipe_data.get("config", {})
 
@@ -713,19 +713,20 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    import
+    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
 
     corpus = (
         Corpus.open(arguments.corpus)
         if getattr(arguments, "corpus", None)
         else Corpus.find(Path.cwd())
     )
-
-
-
-
-
-
+    recipe_data = load_recipe_view(
+        arguments.recipe,
+        recipe_label="Recipe file",
+        mapping_error_message="Topic modeling recipe must be a mapping/object",
+    )
+    overrides = parse_dotted_overrides(arguments.config)
+    recipe_data = apply_dotted_overrides(recipe_data, overrides)
 
     if arguments.extraction_run:
         extraction_run = parse_extraction_run_reference(arguments.extraction_run)
@@ -761,7 +762,7 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     :return: Exit code.
     :rtype: int
     """
-    import
+    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
 
     corpus = (
         Corpus.open(arguments.corpus)
@@ -771,13 +772,17 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
 
     recipe_data: dict[str, object] = {}
     if arguments.recipe is not None:
-
-
-
-
-
-
-        recipe_data =
+        recipe_data = load_recipe_view(
+            arguments.recipe,
+            recipe_label="Recipe file",
+            mapping_error_message="Profiling recipe must be a mapping/object",
+        )
+        overrides = parse_dotted_overrides(arguments.config)
+        recipe_data = apply_dotted_overrides(recipe_data, overrides)
+    else:
+        overrides = parse_dotted_overrides(arguments.config)
+        if overrides:
+            recipe_data = apply_dotted_overrides(recipe_data, overrides)
 
     if arguments.extraction_run:
         extraction_run = parse_extraction_run_reference(arguments.extraction_run)
@@ -804,6 +809,55 @@ def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_analyze_markov(arguments: argparse.Namespace) -> int:
+    """
+    Run Markov analysis for a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    from .recipes import apply_dotted_overrides, load_recipe_view, parse_dotted_overrides
+
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+    recipe_data = load_recipe_view(
+        arguments.recipe,
+        recipe_label="Recipe file",
+        mapping_error_message="Markov analysis recipe must be a mapping/object",
+    )
+    overrides = parse_dotted_overrides(arguments.config)
+    recipe_data = apply_dotted_overrides(recipe_data, overrides)
+
+    if arguments.extraction_run:
+        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    else:
+        extraction_run = corpus.latest_extraction_run_reference()
+        if extraction_run is None:
+            raise ValueError("Markov analysis requires an extraction run to supply text inputs")
+        print(
+            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            file=sys.stderr,
+        )
+
+    backend = get_analysis_backend("markov")
+    try:
+        output = backend.run_analysis(
+            corpus,
+            recipe_name=arguments.recipe_name,
+            config=recipe_data,
+            extraction_run=extraction_run,
+        )
+    except ValidationError as exc:
+        raise ValueError(f"Invalid Markov analysis recipe: {exc}") from exc
+    print(output.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -912,6 +966,7 @@ def build_parser() -> argparse.ArgumentParser:
     p_extract_build.add_argument(
         "--recipe",
         default=None,
+        action="append",
         help="Path to YAML recipe file. If provided, --step arguments are ignored.",
     )
     p_extract_build.add_argument(
@@ -1067,7 +1122,14 @@ def build_parser() -> argparse.ArgumentParser:
     p_analyze_topics.add_argument(
         "--recipe",
         required=True,
-
+        action="append",
+        help="Path to topic modeling recipe YAML. Repeatable; later recipes override earlier recipes.",
+    )
+    p_analyze_topics.add_argument(
+        "--config",
+        action="append",
+        default=[],
+        help="Override key=value pairs applied after composing recipes (supports dotted keys).",
     )
     p_analyze_topics.add_argument(
         "--recipe-name",
@@ -1086,7 +1148,14 @@ def build_parser() -> argparse.ArgumentParser:
     p_analyze_profile.add_argument(
         "--recipe",
         default=None,
-
+        action="append",
+        help="Optional profiling recipe YAML file. Repeatable; later recipes override earlier recipes.",
+    )
+    p_analyze_profile.add_argument(
+        "--config",
+        action="append",
+        default=[],
+        help="Override key=value pairs applied after composing recipes (supports dotted keys).",
     )
     p_analyze_profile.add_argument(
         "--recipe-name",
@@ -1100,6 +1169,32 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_analyze_profile.set_defaults(func=cmd_analyze_profile)
 
+    p_analyze_markov = analyze_sub.add_parser("markov", help="Run Markov analysis.")
+    _add_common_corpus_arg(p_analyze_markov)
+    p_analyze_markov.add_argument(
+        "--recipe",
+        required=True,
+        action="append",
+        help="Path to Markov analysis recipe YAML. Repeatable; later recipes override earlier recipes.",
+    )
+    p_analyze_markov.add_argument(
+        "--config",
+        action="append",
+        default=[],
+        help="Override key=value pairs applied after composing recipes (supports dotted keys).",
+    )
+    p_analyze_markov.add_argument(
+        "--recipe-name",
+        default="default",
+        help="Human-readable recipe name.",
+    )
+    p_analyze_markov.add_argument(
+        "--extraction-run",
+        default=None,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_analyze_markov.set_defaults(func=cmd_analyze_markov)
+
     return parser
 
 
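
Taken together, the new markov subcommand composes repeatable --recipe files, applies --config overrides, then dispatches to the backend. A sketch of driving it through the parser built above (the "analyze markov" path follows from analyze_sub; the recipe paths and dotted override key are hypothetical; the extractor_id:run_id form comes from the help text):

    from biblicus.cli import build_parser

    parser = build_parser()
    arguments = parser.parse_args([
        "analyze", "markov",
        "--recipe", "base.yaml",
        "--recipe", "overrides.yaml",
        "--config", "some.dotted.key=value",
        "--extraction-run", "pipeline:run-2024-01-01",
    ])
    exit_code = arguments.func(arguments)  # dispatches to cmd_analyze_markov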
biblicus/recipes.py
ADDED

@@ -0,0 +1,136 @@
+"""
+Recipe loading utilities for Biblicus.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Dict, Iterable, List, Mapping, MutableMapping, Optional
+
+
+def _parse_scalar(value: str) -> object:
+    lowered = value.lower()
+    if lowered == "true":
+        return True
+    if lowered == "false":
+        return False
+    if lowered in {"null", "none"}:
+        return None
+    if value.isdigit():
+        return int(value)
+    try:
+        return float(value)
+    except ValueError:
+        return value
+
+
+def parse_override_value(raw: str) -> object:
+    """
+    Parse a command-line override string into a Python value.
+
+    :param raw: Raw override string.
+    :type raw: str
+    :return: Parsed value.
+    :rtype: object
+    """
+    raw = str(raw)
+    stripped = raw.strip()
+    if not stripped:
+        return ""
+    if stripped[0] in {"{", "["}:
+        try:
+            return json.loads(stripped)
+        except json.JSONDecodeError:
+            return raw
+    return _parse_scalar(stripped)
+
+
+def parse_dotted_overrides(pairs: Optional[List[str]]) -> Dict[str, object]:
+    """
+    Parse repeated key=value pairs into a dotted override mapping.
+
+    :param pairs: Repeated command-line pairs.
+    :type pairs: list[str] or None
+    :return: Override mapping.
+    :rtype: dict[str, object]
+    :raises ValueError: If a pair is not key=value.
+    """
+    overrides: Dict[str, object] = {}
+    for item in pairs or []:
+        if "=" not in item:
+            raise ValueError(f"Config values must be key=value (got {item!r})")
+        key, raw = item.split("=", 1)
+        key = key.strip()
+        if not key:
+            raise ValueError("Config keys must be non-empty")
+        overrides[key] = parse_override_value(raw)
+    return overrides
+
+
+def _set_dotted_key(target: MutableMapping[str, object], dotted_key: str, value: object) -> None:
+    parts = [part.strip() for part in dotted_key.split(".") if part.strip()]
+    if not parts:
+        raise ValueError("Override keys must be non-empty")
+    current: MutableMapping[str, object] = target
+    for part in parts[:-1]:
+        existing = current.get(part)
+        if not isinstance(existing, dict):
+            nested: Dict[str, object] = {}
+            current[part] = nested
+            current = nested
+        else:
+            current = existing
+    current[parts[-1]] = value
+
+
+def apply_dotted_overrides(
+    config: Dict[str, object], overrides: Mapping[str, object]
+) -> Dict[str, object]:
+    """
+    Apply dotted key overrides to a nested configuration mapping.
+
+    :param config: Base configuration mapping.
+    :type config: dict[str, object]
+    :param overrides: Dotted key override mapping.
+    :type overrides: Mapping[str, object]
+    :return: New configuration mapping with overrides applied.
+    :rtype: dict[str, object]
+    """
+    updated: Dict[str, object] = json.loads(json.dumps(config))
+    for key, value in overrides.items():
+        _set_dotted_key(updated, key, value)
+    return updated
+
+
+def load_recipe_view(
+    recipe_paths: Iterable[str],
+    *,
+    recipe_label: str = "Recipe",
+    mapping_error_message: Optional[str] = None,
+) -> Dict[str, object]:
+    """
+    Load a composed recipe view from one or more YAML files.
+
+    :param recipe_paths: Iterable of recipe file paths in precedence order.
+    :type recipe_paths: Iterable[str]
+    :param recipe_label: Label used in error messages (for example: "Recipe file").
+    :type recipe_label: str
+    :return: Composed configuration view.
+    :rtype: dict[str, object]
+    :raises FileNotFoundError: If any recipe file is missing.
+    :raises ValueError: If any recipe file is not a mapping/object.
+    """
+    from biblicus._vendor.dotyaml import load_yaml_view
+
+    paths: List[str] = [str(path) for path in recipe_paths]
+    for raw in paths:
+        candidate = Path(raw)
+        if not candidate.is_file():
+            raise FileNotFoundError(f"{recipe_label} not found: {candidate}")
+    try:
+        view = load_yaml_view(paths)
+    except ValueError as exc:
+        message = mapping_error_message or f"{recipe_label} must be a mapping/object"
+        raise ValueError(message) from exc
+    return view
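
A short sketch of how these helpers compose, using only the functions defined above (the dotted keys and file names are hypothetical):

    from biblicus.recipes import (
        apply_dotted_overrides,
        load_recipe_view,
        parse_dotted_overrides,
    )

    # --config pairs from the CLI become a dotted-key mapping.
    overrides = parse_dotted_overrides(["bertopic.min_topic_size=10", "llm.enabled=false"])
    # {"bertopic.min_topic_size": 10, "llm.enabled": False}

    # Later recipe files override earlier ones; CLI overrides apply last.
    recipe = load_recipe_view(["base.yaml", "site.yaml"], recipe_label="Recipe file")
    recipe = apply_dotted_overrides(recipe, overrides)
    # e.g. {"bertopic": {"min_topic_size": 5}} becomes
    #      {"bertopic": {"min_topic_size": 10}, "llm": {"enabled": False}}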
biblicus/text/__init__.py
ADDED

@@ -0,0 +1,43 @@
+"""
+Agentic text utilities.
+"""
+
+from .annotate import apply_text_annotate
+from .extract import apply_text_extract
+from .link import apply_text_link
+from .models import (
+    TextAnnotateRequest,
+    TextAnnotateResult,
+    TextExtractRequest,
+    TextExtractResult,
+    TextExtractSpan,
+    TextLinkRequest,
+    TextLinkResult,
+    TextRedactRequest,
+    TextRedactResult,
+    TextSliceRequest,
+    TextSliceResult,
+    TextSliceSegment,
+)
+from .redact import apply_text_redact
+from .slice import apply_text_slice
+
+__all__ = [
+    "TextAnnotateRequest",
+    "TextAnnotateResult",
+    "TextExtractRequest",
+    "TextExtractResult",
+    "TextExtractSpan",
+    "TextLinkRequest",
+    "TextLinkResult",
+    "TextRedactRequest",
+    "TextRedactResult",
+    "TextSliceRequest",
+    "TextSliceResult",
+    "TextSliceSegment",
+    "apply_text_annotate",
+    "apply_text_extract",
+    "apply_text_link",
+    "apply_text_redact",
+    "apply_text_slice",
+]
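
The new package re-exports its request/result models and apply functions at the top level, so callers can import from biblicus.text directly; the model fields themselves are defined in biblicus/text/models.py and are not shown in this diff:

    from biblicus.text import TextSliceRequest, apply_text_slice

    # Construct a TextSliceRequest (fields defined in biblicus/text/models.py)
    # and pass it to apply_text_slice to segment a text.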