biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
- biblicus-1.1.1.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
|
@@ -15,16 +15,17 @@ from pydantic import BaseModel
|
|
|
15
15
|
|
|
16
16
|
from ..ai.llm import generate_completion
|
|
17
17
|
from ..corpus import Corpus
|
|
18
|
-
from ..models import
|
|
18
|
+
from ..models import ExtractionSnapshotReference
|
|
19
19
|
from ..retrieval import hash_text
|
|
20
20
|
from ..time import utc_now_iso
|
|
21
21
|
from .base import CorpusAnalysisBackend
|
|
22
22
|
from .models import (
|
|
23
|
-
|
|
23
|
+
AnalysisConfigurationManifest,
|
|
24
24
|
AnalysisRunInput,
|
|
25
25
|
AnalysisRunManifest,
|
|
26
26
|
TopicModelingBerTopicConfig,
|
|
27
27
|
TopicModelingBerTopicReport,
|
|
28
|
+
TopicModelingConfiguration,
|
|
28
29
|
TopicModelingKeyword,
|
|
29
30
|
TopicModelingLabelSource,
|
|
30
31
|
TopicModelingLexicalProcessingConfig,
|
|
@@ -35,7 +36,6 @@ from .models import (
|
|
|
35
36
|
TopicModelingLlmFineTuningConfig,
|
|
36
37
|
TopicModelingLlmFineTuningReport,
|
|
37
38
|
TopicModelingOutput,
|
|
38
|
-
TopicModelingRecipeConfig,
|
|
39
39
|
TopicModelingReport,
|
|
40
40
|
TopicModelingStageStatus,
|
|
41
41
|
TopicModelingTextCollectionReport,
|
|
@@ -76,69 +76,74 @@ class TopicModelingBackend(CorpusAnalysisBackend):
|
|
|
76
76
|
self,
|
|
77
77
|
corpus: Corpus,
|
|
78
78
|
*,
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
configuration_name: str,
|
|
80
|
+
configuration: Dict[str, object],
|
|
81
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
82
82
|
) -> BaseModel:
|
|
83
83
|
"""
|
|
84
84
|
Run the topic modeling analysis pipeline.
|
|
85
85
|
|
|
86
86
|
:param corpus: Corpus to analyze.
|
|
87
87
|
:type corpus: Corpus
|
|
88
|
-
:param
|
|
89
|
-
:type
|
|
90
|
-
:param
|
|
91
|
-
:type
|
|
92
|
-
:param
|
|
93
|
-
:type
|
|
88
|
+
:param configuration_name: Human-readable configuration name.
|
|
89
|
+
:type configuration_name: str
|
|
90
|
+
:param configuration: Analysis configuration values.
|
|
91
|
+
:type configuration: dict[str, object]
|
|
92
|
+
:param extraction_snapshot: Extraction snapshot reference for text inputs.
|
|
93
|
+
:type extraction_snapshot: biblicus.models.ExtractionSnapshotReference
|
|
94
94
|
:return: Topic modeling output model.
|
|
95
95
|
:rtype: pydantic.BaseModel
|
|
96
96
|
"""
|
|
97
97
|
parsed_config = (
|
|
98
|
-
|
|
99
|
-
if isinstance(
|
|
100
|
-
else
|
|
98
|
+
configuration
|
|
99
|
+
if isinstance(configuration, TopicModelingConfiguration)
|
|
100
|
+
else TopicModelingConfiguration.model_validate(configuration)
|
|
101
101
|
)
|
|
102
102
|
return _run_topic_modeling(
|
|
103
103
|
corpus=corpus,
|
|
104
|
-
|
|
104
|
+
configuration_name=configuration_name,
|
|
105
105
|
config=parsed_config,
|
|
106
|
-
|
|
106
|
+
extraction_snapshot=extraction_snapshot,
|
|
107
107
|
)
|
|
108
108
|
|
|
109
109
|
|
|
110
110
|
def _run_topic_modeling(
|
|
111
111
|
*,
|
|
112
112
|
corpus: Corpus,
|
|
113
|
-
|
|
114
|
-
config:
|
|
115
|
-
|
|
113
|
+
configuration_name: str,
|
|
114
|
+
config: TopicModelingConfiguration,
|
|
115
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
116
116
|
) -> TopicModelingOutput:
|
|
117
|
-
|
|
117
|
+
configuration_manifest = _create_configuration_manifest(
|
|
118
|
+
name=configuration_name,
|
|
119
|
+
config=config,
|
|
120
|
+
)
|
|
118
121
|
catalog = corpus.load_catalog()
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
+
snapshot_id = _analysis_snapshot_id(
|
|
123
|
+
configuration_id=configuration_manifest.configuration_id,
|
|
124
|
+
extraction_snapshot=extraction_snapshot,
|
|
122
125
|
catalog_generated_at=catalog.generated_at,
|
|
123
126
|
)
|
|
124
127
|
run_manifest = AnalysisRunManifest(
|
|
125
|
-
|
|
126
|
-
|
|
128
|
+
snapshot_id=snapshot_id,
|
|
129
|
+
configuration=configuration_manifest,
|
|
127
130
|
corpus_uri=catalog.corpus_uri,
|
|
128
131
|
catalog_generated_at=catalog.generated_at,
|
|
129
132
|
created_at=utc_now_iso(),
|
|
130
|
-
input=AnalysisRunInput(
|
|
133
|
+
input=AnalysisRunInput(extraction_snapshot=extraction_snapshot),
|
|
131
134
|
artifact_paths=[],
|
|
132
135
|
stats={},
|
|
133
136
|
)
|
|
134
|
-
run_dir = corpus.analysis_run_dir(
|
|
137
|
+
run_dir = corpus.analysis_run_dir(
|
|
138
|
+
analysis_id=TopicModelingBackend.analysis_id, snapshot_id=snapshot_id
|
|
139
|
+
)
|
|
135
140
|
output_path = run_dir / "output.json"
|
|
136
141
|
|
|
137
142
|
run_dir.mkdir(parents=True, exist_ok=True)
|
|
138
143
|
|
|
139
144
|
documents, text_report = _collect_documents(
|
|
140
145
|
corpus=corpus,
|
|
141
|
-
|
|
146
|
+
extraction_snapshot=extraction_snapshot,
|
|
142
147
|
config=config.text_source,
|
|
143
148
|
)
|
|
144
149
|
|
|
@@ -194,7 +199,7 @@ def _run_topic_modeling(
|
|
|
194
199
|
output = TopicModelingOutput(
|
|
195
200
|
analysis_id=TopicModelingBackend.analysis_id,
|
|
196
201
|
generated_at=utc_now_iso(),
|
|
197
|
-
|
|
202
|
+
snapshot=run_manifest,
|
|
198
203
|
report=report,
|
|
199
204
|
)
|
|
200
205
|
_write_topic_modeling_output(path=output_path, output=output)
|
|
@@ -204,15 +209,15 @@ def _run_topic_modeling(
|
|
|
204
209
|
def run_topic_modeling_for_documents(
|
|
205
210
|
*,
|
|
206
211
|
documents: List[TopicModelingDocument],
|
|
207
|
-
config:
|
|
212
|
+
config: TopicModelingConfiguration,
|
|
208
213
|
) -> TopicModelingReport:
|
|
209
214
|
"""
|
|
210
215
|
Run topic modeling using caller-provided documents.
|
|
211
216
|
|
|
212
217
|
:param documents: Pre-collected documents to model.
|
|
213
218
|
:type documents: list[TopicModelingDocument]
|
|
214
|
-
:param config: Topic modeling
|
|
215
|
-
:type config:
|
|
219
|
+
:param config: Topic modeling configuration.
|
|
220
|
+
:type config: TopicModelingConfiguration
|
|
216
221
|
:return: Topic modeling report with topic assignments.
|
|
217
222
|
:rtype: TopicModelingReport
|
|
218
223
|
"""
|
|
@@ -269,10 +274,10 @@ def run_topic_modeling_for_documents(
|
|
|
269
274
|
)
|
|
270
275
|
|
|
271
276
|
|
|
272
|
-
def
|
|
273
|
-
*, name: str, config:
|
|
274
|
-
) ->
|
|
275
|
-
|
|
277
|
+
def _create_configuration_manifest(
|
|
278
|
+
*, name: str, config: TopicModelingConfiguration
|
|
279
|
+
) -> AnalysisConfigurationManifest:
|
|
280
|
+
configuration_payload = json.dumps(
|
|
276
281
|
{
|
|
277
282
|
"analysis_id": TopicModelingBackend.analysis_id,
|
|
278
283
|
"name": name,
|
|
@@ -280,9 +285,9 @@ def _create_recipe_manifest(
|
|
|
280
285
|
},
|
|
281
286
|
sort_keys=True,
|
|
282
287
|
)
|
|
283
|
-
|
|
284
|
-
return
|
|
285
|
-
|
|
288
|
+
configuration_id = hash_text(configuration_payload)
|
|
289
|
+
return AnalysisConfigurationManifest(
|
|
290
|
+
configuration_id=configuration_id,
|
|
286
291
|
analysis_id=TopicModelingBackend.analysis_id,
|
|
287
292
|
name=name,
|
|
288
293
|
created_at=utc_now_iso(),
|
|
@@ -290,25 +295,25 @@ def _create_recipe_manifest(
|
|
|
290
295
|
)
|
|
291
296
|
|
|
292
297
|
|
|
293
|
-
def
|
|
298
|
+
def _analysis_snapshot_id(
|
|
294
299
|
*,
|
|
295
|
-
|
|
296
|
-
|
|
300
|
+
configuration_id: str,
|
|
301
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
297
302
|
catalog_generated_at: str,
|
|
298
303
|
) -> str:
|
|
299
|
-
run_seed = f"{
|
|
304
|
+
run_seed = f"{configuration_id}:{extraction_snapshot.as_string()}:{catalog_generated_at}"
|
|
300
305
|
return hash_text(run_seed)
|
|
301
306
|
|
|
302
307
|
|
|
303
308
|
def _collect_documents(
|
|
304
309
|
*,
|
|
305
310
|
corpus: Corpus,
|
|
306
|
-
|
|
311
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
307
312
|
config: TopicModelingTextSourceConfig,
|
|
308
313
|
) -> Tuple[List[TopicModelingDocument], TopicModelingTextCollectionReport]:
|
|
309
|
-
manifest = corpus.
|
|
310
|
-
extractor_id=
|
|
311
|
-
|
|
314
|
+
manifest = corpus.load_extraction_snapshot_manifest(
|
|
315
|
+
extractor_id=extraction_snapshot.extractor_id,
|
|
316
|
+
snapshot_id=extraction_snapshot.snapshot_id,
|
|
312
317
|
)
|
|
313
318
|
warnings: List[str] = []
|
|
314
319
|
errors: List[str] = []
|
|
@@ -321,9 +326,9 @@ def _collect_documents(
|
|
|
321
326
|
skipped_items += 1
|
|
322
327
|
continue
|
|
323
328
|
text_path = (
|
|
324
|
-
corpus.
|
|
325
|
-
extractor_id=
|
|
326
|
-
|
|
329
|
+
corpus.extraction_snapshot_dir(
|
|
330
|
+
extractor_id=extraction_snapshot.extractor_id,
|
|
331
|
+
snapshot_id=extraction_snapshot.snapshot_id,
|
|
327
332
|
)
|
|
328
333
|
/ item_result.final_text_relpath
|
|
329
334
|
)
|