biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
  43. biblicus-1.1.1.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
@@ -15,16 +15,17 @@ from pydantic import BaseModel
15
15
 
16
16
  from ..ai.llm import generate_completion
17
17
  from ..corpus import Corpus
18
- from ..models import ExtractionRunReference
18
+ from ..models import ExtractionSnapshotReference
19
19
  from ..retrieval import hash_text
20
20
  from ..time import utc_now_iso
21
21
  from .base import CorpusAnalysisBackend
22
22
  from .models import (
23
- AnalysisRecipeManifest,
23
+ AnalysisConfigurationManifest,
24
24
  AnalysisRunInput,
25
25
  AnalysisRunManifest,
26
26
  TopicModelingBerTopicConfig,
27
27
  TopicModelingBerTopicReport,
28
+ TopicModelingConfiguration,
28
29
  TopicModelingKeyword,
29
30
  TopicModelingLabelSource,
30
31
  TopicModelingLexicalProcessingConfig,
@@ -35,7 +36,6 @@ from .models import (
35
36
  TopicModelingLlmFineTuningConfig,
36
37
  TopicModelingLlmFineTuningReport,
37
38
  TopicModelingOutput,
38
- TopicModelingRecipeConfig,
39
39
  TopicModelingReport,
40
40
  TopicModelingStageStatus,
41
41
  TopicModelingTextCollectionReport,
@@ -76,69 +76,74 @@ class TopicModelingBackend(CorpusAnalysisBackend):
76
76
  self,
77
77
  corpus: Corpus,
78
78
  *,
79
- recipe_name: str,
80
- config: Dict[str, object],
81
- extraction_run: ExtractionRunReference,
79
+ configuration_name: str,
80
+ configuration: Dict[str, object],
81
+ extraction_snapshot: ExtractionSnapshotReference,
82
82
  ) -> BaseModel:
83
83
  """
84
84
  Run the topic modeling analysis pipeline.
85
85
 
86
86
  :param corpus: Corpus to analyze.
87
87
  :type corpus: Corpus
88
- :param recipe_name: Human-readable recipe name.
89
- :type recipe_name: str
90
- :param config: Analysis configuration values.
91
- :type config: dict[str, object]
92
- :param extraction_run: Extraction run reference for text inputs.
93
- :type extraction_run: biblicus.models.ExtractionRunReference
88
+ :param configuration_name: Human-readable configuration name.
89
+ :type configuration_name: str
90
+ :param configuration: Analysis configuration values.
91
+ :type configuration: dict[str, object]
92
+ :param extraction_snapshot: Extraction snapshot reference for text inputs.
93
+ :type extraction_snapshot: biblicus.models.ExtractionSnapshotReference
94
94
  :return: Topic modeling output model.
95
95
  :rtype: pydantic.BaseModel
96
96
  """
97
97
  parsed_config = (
98
- config
99
- if isinstance(config, TopicModelingRecipeConfig)
100
- else TopicModelingRecipeConfig.model_validate(config)
98
+ configuration
99
+ if isinstance(configuration, TopicModelingConfiguration)
100
+ else TopicModelingConfiguration.model_validate(configuration)
101
101
  )
102
102
  return _run_topic_modeling(
103
103
  corpus=corpus,
104
- recipe_name=recipe_name,
104
+ configuration_name=configuration_name,
105
105
  config=parsed_config,
106
- extraction_run=extraction_run,
106
+ extraction_snapshot=extraction_snapshot,
107
107
  )
108
108
 
109
109
 
110
110
  def _run_topic_modeling(
111
111
  *,
112
112
  corpus: Corpus,
113
- recipe_name: str,
114
- config: TopicModelingRecipeConfig,
115
- extraction_run: ExtractionRunReference,
113
+ configuration_name: str,
114
+ config: TopicModelingConfiguration,
115
+ extraction_snapshot: ExtractionSnapshotReference,
116
116
  ) -> TopicModelingOutput:
117
- recipe = _create_recipe_manifest(name=recipe_name, config=config)
117
+ configuration_manifest = _create_configuration_manifest(
118
+ name=configuration_name,
119
+ config=config,
120
+ )
118
121
  catalog = corpus.load_catalog()
119
- run_id = _analysis_run_id(
120
- recipe_id=recipe.recipe_id,
121
- extraction_run=extraction_run,
122
+ snapshot_id = _analysis_snapshot_id(
123
+ configuration_id=configuration_manifest.configuration_id,
124
+ extraction_snapshot=extraction_snapshot,
122
125
  catalog_generated_at=catalog.generated_at,
123
126
  )
124
127
  run_manifest = AnalysisRunManifest(
125
- run_id=run_id,
126
- recipe=recipe,
128
+ snapshot_id=snapshot_id,
129
+ configuration=configuration_manifest,
127
130
  corpus_uri=catalog.corpus_uri,
128
131
  catalog_generated_at=catalog.generated_at,
129
132
  created_at=utc_now_iso(),
130
- input=AnalysisRunInput(extraction_run=extraction_run),
133
+ input=AnalysisRunInput(extraction_snapshot=extraction_snapshot),
131
134
  artifact_paths=[],
132
135
  stats={},
133
136
  )
134
- run_dir = corpus.analysis_run_dir(analysis_id=TopicModelingBackend.analysis_id, run_id=run_id)
137
+ run_dir = corpus.analysis_run_dir(
138
+ analysis_id=TopicModelingBackend.analysis_id, snapshot_id=snapshot_id
139
+ )
135
140
  output_path = run_dir / "output.json"
136
141
 
137
142
  run_dir.mkdir(parents=True, exist_ok=True)
138
143
 
139
144
  documents, text_report = _collect_documents(
140
145
  corpus=corpus,
141
- extraction_run=extraction_run,
146
+ extraction_snapshot=extraction_snapshot,
142
147
  config=config.text_source,
143
148
  )
144
149
 
@@ -194,7 +199,7 @@ def _run_topic_modeling(
194
199
  output = TopicModelingOutput(
195
200
  analysis_id=TopicModelingBackend.analysis_id,
196
201
  generated_at=utc_now_iso(),
197
- run=run_manifest,
202
+ snapshot=run_manifest,
198
203
  report=report,
199
204
  )
200
205
  _write_topic_modeling_output(path=output_path, output=output)
@@ -204,15 +209,15 @@ def _run_topic_modeling(
204
209
  def run_topic_modeling_for_documents(
205
210
  *,
206
211
  documents: List[TopicModelingDocument],
207
- config: TopicModelingRecipeConfig,
212
+ config: TopicModelingConfiguration,
208
213
  ) -> TopicModelingReport:
209
214
  """
210
215
  Run topic modeling using caller-provided documents.
211
216
 
212
217
  :param documents: Pre-collected documents to model.
213
218
  :type documents: list[TopicModelingDocument]
214
- :param config: Topic modeling recipe configuration.
215
- :type config: TopicModelingRecipeConfig
219
+ :param config: Topic modeling configuration.
220
+ :type config: TopicModelingConfiguration
216
221
  :return: Topic modeling report with topic assignments.
217
222
  :rtype: TopicModelingReport
218
223
  """
@@ -269,10 +274,10 @@ def run_topic_modeling_for_documents(
269
274
  )
270
275
 
271
276
 
272
- def _create_recipe_manifest(
273
- *, name: str, config: TopicModelingRecipeConfig
274
- ) -> AnalysisRecipeManifest:
275
- recipe_payload = json.dumps(
277
+ def _create_configuration_manifest(
278
+ *, name: str, config: TopicModelingConfiguration
279
+ ) -> AnalysisConfigurationManifest:
280
+ configuration_payload = json.dumps(
276
281
  {
277
282
  "analysis_id": TopicModelingBackend.analysis_id,
278
283
  "name": name,
@@ -280,9 +285,9 @@ def _create_recipe_manifest(
280
285
  },
281
286
  sort_keys=True,
282
287
  )
283
- recipe_id = hash_text(recipe_payload)
284
- return AnalysisRecipeManifest(
285
- recipe_id=recipe_id,
288
+ configuration_id = hash_text(configuration_payload)
289
+ return AnalysisConfigurationManifest(
290
+ configuration_id=configuration_id,
286
291
  analysis_id=TopicModelingBackend.analysis_id,
287
292
  name=name,
288
293
  created_at=utc_now_iso(),
@@ -290,25 +295,25 @@ def _create_recipe_manifest(
290
295
  )
291
296
 
292
297
 
293
- def _analysis_run_id(
298
+ def _analysis_snapshot_id(
294
299
  *,
295
- recipe_id: str,
296
- extraction_run: ExtractionRunReference,
300
+ configuration_id: str,
301
+ extraction_snapshot: ExtractionSnapshotReference,
297
302
  catalog_generated_at: str,
298
303
  ) -> str:
299
- run_seed = f"{recipe_id}:{extraction_run.as_string()}:{catalog_generated_at}"
304
+ run_seed = f"{configuration_id}:{extraction_snapshot.as_string()}:{catalog_generated_at}"
300
305
  return hash_text(run_seed)
301
306
 
302
307
 
303
308
  def _collect_documents(
304
309
  *,
305
310
  corpus: Corpus,
306
- extraction_run: ExtractionRunReference,
311
+ extraction_snapshot: ExtractionSnapshotReference,
307
312
  config: TopicModelingTextSourceConfig,
308
313
  ) -> Tuple[List[TopicModelingDocument], TopicModelingTextCollectionReport]:
309
- manifest = corpus.load_extraction_run_manifest(
310
- extractor_id=extraction_run.extractor_id,
311
- run_id=extraction_run.run_id,
314
+ manifest = corpus.load_extraction_snapshot_manifest(
315
+ extractor_id=extraction_snapshot.extractor_id,
316
+ snapshot_id=extraction_snapshot.snapshot_id,
312
317
  )
313
318
  warnings: List[str] = []
314
319
  errors: List[str] = []
@@ -321,9 +326,9 @@ def _collect_documents(
321
326
  skipped_items += 1
322
327
  continue
323
328
  text_path = (
324
- corpus.extraction_run_dir(
325
- extractor_id=extraction_run.extractor_id,
326
- run_id=extraction_run.run_id,
329
+ corpus.extraction_snapshot_dir(
330
+ extractor_id=extraction_snapshot.extractor_id,
331
+ snapshot_id=extraction_snapshot.snapshot_id,
327
332
  )
328
333
  / item_result.final_text_relpath
329
334
  )