biblicus 0.9.0-py3-none-any.whl → 0.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +0 -1
- biblicus/_vendor/dotyaml/interpolation.py +0 -1
- biblicus/_vendor/dotyaml/loader.py +0 -1
- biblicus/_vendor/dotyaml/transformer.py +0 -1
- biblicus/analysis/__init__.py +2 -0
- biblicus/analysis/models.py +228 -5
- biblicus/analysis/profiling.py +337 -0
- biblicus/analysis/topic_modeling.py +3 -6
- biblicus/backends/sqlite_full_text_search.py +2 -4
- biblicus/cli.py +83 -4
- biblicus/corpus.py +9 -3
- biblicus/evidence_processing.py +4 -2
- biblicus/extraction.py +3 -1
- biblicus/extractors/markitdown_text.py +1 -0
- biblicus/extractors/paddleocr_vl_text.py +1 -3
- biblicus/user_config.py +2 -6
- {biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/METADATA +7 -6
- {biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/RECORD +23 -22
- {biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/WHEEL +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/analysis/__init__.py
CHANGED
@@ -7,6 +7,7 @@ from __future__ import annotations
 from typing import Dict, Type
 
 from .base import CorpusAnalysisBackend
+from .profiling import ProfilingBackend
 from .topic_modeling import TopicModelingBackend
 
 
@@ -18,6 +19,7 @@ def available_analysis_backends() -> Dict[str, Type[CorpusAnalysisBackend]]:
     :rtype: dict[str, Type[CorpusAnalysisBackend]]
     """
     return {
+        ProfilingBackend.analysis_id: ProfilingBackend,
        TopicModelingBackend.analysis_id: TopicModelingBackend,
     }
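The registry above is the single lookup point for analysis backends. A minimal sketch of using it (the import path and function come straight from this diff; the surrounding usage is illustrative):

    from biblicus.analysis import available_analysis_backends

    backends = available_analysis_backends()
    # The new profiling backend is registered under its analysis_id, "profiling".
    profiling_backend_cls = backends["profiling"]
    print("profiling" in backends)  # True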
biblicus/analysis/models.py
CHANGED
@@ -84,6 +84,221 @@ class AnalysisRunManifest(AnalysisSchemaModel):
     stats: Dict[str, Any] = Field(default_factory=dict)
 
 
+class ProfilingRecipeConfig(AnalysisSchemaModel):
+    """
+    Recipe configuration for profiling analysis.
+
+    :ivar schema_version: Analysis schema version.
+    :vartype schema_version: int
+    :ivar sample_size: Optional sample size for distribution metrics.
+    :vartype sample_size: int or None
+    :ivar min_text_characters: Optional minimum character count for extracted text inclusion.
+    :vartype min_text_characters: int or None
+    :ivar percentiles: Percentiles to compute for distributions.
+    :vartype percentiles: list[int]
+    :ivar top_tag_count: Maximum number of tags to include in top tag output.
+    :vartype top_tag_count: int
+    :ivar tag_filters: Optional tag filters to limit tag coverage metrics.
+    :vartype tag_filters: list[str] or None
+    """
+
+    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
+    sample_size: Optional[int] = Field(default=None, ge=1)
+    min_text_characters: Optional[int] = Field(default=None, ge=1)
+    percentiles: List[int] = Field(default_factory=lambda: [50, 90, 99])
+    top_tag_count: int = Field(default=10, ge=1)
+    tag_filters: Optional[List[str]] = None
+
+    @model_validator(mode="after")
+    def _validate_schema_version(self) -> "ProfilingRecipeConfig":
+        if self.schema_version != ANALYSIS_SCHEMA_VERSION:
+            raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
+        return self
+
+    @field_validator("percentiles", mode="after")
+    @classmethod
+    def _validate_percentiles(cls, value: List[int]) -> List[int]:
+        if not value:
+            raise ValueError("percentiles must include at least one value")
+        if any(percentile < 1 or percentile > 100 for percentile in value):
+            raise ValueError("percentiles must be between 1 and 100")
+        if value != sorted(value):
+            raise ValueError("percentiles must be sorted in ascending order")
+        return value
+
+    @field_validator("tag_filters", mode="before")
+    @classmethod
+    def _validate_tag_filters(cls, value: object) -> object:
+        if value is None:
+            return None
+        if not isinstance(value, list):
+            raise ValueError("tag_filters must be a list of strings")
+        cleaned = [str(tag).strip() for tag in value]
+        if not cleaned or any(not tag for tag in cleaned):
+            raise ValueError("tag_filters must be a list of non-empty strings")
+        return cleaned
+
+
+class ProfilingPercentileValue(AnalysisSchemaModel):
+    """
+    Percentile entry for a distribution.
+
+    :ivar percentile: Percentile value between 1 and 100.
+    :vartype percentile: int
+    :ivar value: Percentile value.
+    :vartype value: float
+    """
+
+    percentile: int = Field(ge=1, le=100)
+    value: float
+
+
+class ProfilingDistributionReport(AnalysisSchemaModel):
+    """
+    Distribution summary for numeric values.
+
+    :ivar count: Count of values included.
+    :vartype count: int
+    :ivar min_value: Minimum value observed.
+    :vartype min_value: float
+    :ivar max_value: Maximum value observed.
+    :vartype max_value: float
+    :ivar mean_value: Mean value observed.
+    :vartype mean_value: float
+    :ivar percentiles: Percentile values.
+    :vartype percentiles: list[ProfilingPercentileValue]
+    """
+
+    count: int = Field(ge=0)
+    min_value: float
+    max_value: float
+    mean_value: float
+    percentiles: List[ProfilingPercentileValue] = Field(default_factory=list)
+
+
+class ProfilingTagCount(AnalysisSchemaModel):
+    """
+    Tag count entry for profiling output.
+
+    :ivar tag: Tag name.
+    :vartype tag: str
+    :ivar count: Number of items with this tag.
+    :vartype count: int
+    """
+
+    tag: str
+    count: int = Field(ge=0)
+
+
+class ProfilingTagReport(AnalysisSchemaModel):
+    """
+    Tag coverage summary for raw items.
+
+    :ivar tagged_items: Count of items with tags.
+    :vartype tagged_items: int
+    :ivar untagged_items: Count of items without tags.
+    :vartype untagged_items: int
+    :ivar total_unique_tags: Count of unique tags.
+    :vartype total_unique_tags: int
+    :ivar top_tags: Most frequent tags.
+    :vartype top_tags: list[ProfilingTagCount]
+    :ivar tag_filters: Optional tag filters applied.
+    :vartype tag_filters: list[str] or None
+    """
+
+    tagged_items: int = Field(ge=0)
+    untagged_items: int = Field(ge=0)
+    total_unique_tags: int = Field(ge=0)
+    top_tags: List[ProfilingTagCount] = Field(default_factory=list)
+    tag_filters: Optional[List[str]] = None
+
+
+class ProfilingRawItemsReport(AnalysisSchemaModel):
+    """
+    Summary of raw corpus items.
+
+    :ivar total_items: Total number of catalog items.
+    :vartype total_items: int
+    :ivar media_type_counts: Count of items per media type.
+    :vartype media_type_counts: dict[str, int]
+    :ivar bytes_distribution: Distribution of raw item sizes in bytes.
+    :vartype bytes_distribution: ProfilingDistributionReport
+    :ivar tags: Tag coverage summary.
+    :vartype tags: ProfilingTagReport
+    """
+
+    total_items: int = Field(ge=0)
+    media_type_counts: Dict[str, int] = Field(default_factory=dict)
+    bytes_distribution: ProfilingDistributionReport
+    tags: ProfilingTagReport
+
+
+class ProfilingExtractedTextReport(AnalysisSchemaModel):
+    """
+    Summary of extracted text coverage.
+
+    :ivar source_items: Count of source items in the extraction run.
+    :vartype source_items: int
+    :ivar extracted_nonempty_items: Count of extracted items with non-empty text.
+    :vartype extracted_nonempty_items: int
+    :ivar extracted_empty_items: Count of extracted items with empty text.
+    :vartype extracted_empty_items: int
+    :ivar extracted_missing_items: Count of items with no extracted text artifact.
+    :vartype extracted_missing_items: int
+    :ivar characters_distribution: Distribution of extracted text lengths.
+    :vartype characters_distribution: ProfilingDistributionReport
+    """
+
+    source_items: int = Field(ge=0)
+    extracted_nonempty_items: int = Field(ge=0)
+    extracted_empty_items: int = Field(ge=0)
+    extracted_missing_items: int = Field(ge=0)
+    characters_distribution: ProfilingDistributionReport
+
+
+class ProfilingReport(AnalysisSchemaModel):
+    """
+    Report for profiling analysis.
+
+    :ivar raw_items: Raw corpus item summary.
+    :vartype raw_items: ProfilingRawItemsReport
+    :ivar extracted_text: Extracted text coverage summary.
+    :vartype extracted_text: ProfilingExtractedTextReport
+    :ivar warnings: Warning messages.
+    :vartype warnings: list[str]
+    :ivar errors: Error messages.
+    :vartype errors: list[str]
+    """
+
+    raw_items: ProfilingRawItemsReport
+    extracted_text: ProfilingExtractedTextReport
+    warnings: List[str] = Field(default_factory=list)
+    errors: List[str] = Field(default_factory=list)
+
+
+class ProfilingOutput(AnalysisSchemaModel):
+    """
+    Output bundle for profiling analysis.
+
+    :ivar schema_version: Analysis schema version.
+    :vartype schema_version: int
+    :ivar analysis_id: Analysis backend identifier.
+    :vartype analysis_id: str
+    :ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
+    :vartype generated_at: str
+    :ivar run: Analysis run manifest.
+    :vartype run: AnalysisRunManifest
+    :ivar report: Profiling report data.
+    :vartype report: ProfilingReport
+    """
+
+    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
+    analysis_id: str
+    generated_at: str
+    run: AnalysisRunManifest
+    report: ProfilingReport
+
+
 class TopicModelingTextSourceConfig(AnalysisSchemaModel):
     """
     Configuration for text collection within topic modeling.
@@ -124,7 +339,9 @@ class TopicModelingLlmExtractionConfig(AnalysisSchemaModel):
     """
 
     enabled: bool = Field(default=False)
-    method: TopicModelingLlmExtractionMethod = Field(
+    method: TopicModelingLlmExtractionMethod = Field(
+        default=TopicModelingLlmExtractionMethod.SINGLE
+    )
     client: Optional[LlmClientConfig] = None
     prompt_template: Optional[str] = None
     system_prompt: Optional[str] = None
@@ -136,7 +353,9 @@ class TopicModelingLlmExtractionConfig(AnalysisSchemaModel):
             return value
         if isinstance(value, str):
             return TopicModelingLlmExtractionMethod(value)
-        raise ValueError(
+        raise ValueError(
+            "llm_extraction.method must be a string or TopicModelingLlmExtractionMethod"
+        )
 
     @model_validator(mode="after")
     def _validate_requirements(self) -> "TopicModelingLlmExtractionConfig":
@@ -188,7 +407,9 @@ class TopicModelingVectorizerConfig(AnalysisSchemaModel):
     def _validate_ngram_range(self) -> "TopicModelingVectorizerConfig":
         start, end = self.ngram_range
         if start < 1 or end < start:
-            raise ValueError(
+            raise ValueError(
+                "vectorizer.ngram_range must include two integers with start >= 1 and end >= start"
+            )
         return self
 
     @field_validator("stop_words", mode="before")
@@ -201,7 +422,7 @@ class TopicModelingVectorizerConfig(AnalysisSchemaModel):
                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
            return value
        if isinstance(value, list):
-            if not all(isinstance(entry, str) and entry for entry in value):
+            if not value or not all(isinstance(entry, str) and entry for entry in value):
                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
            return value
        raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
@@ -280,7 +501,9 @@ class TopicModelingRecipeConfig(AnalysisSchemaModel):
     """
 
     schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
-    text_source: TopicModelingTextSourceConfig = Field(
+    text_source: TopicModelingTextSourceConfig = Field(
+        default_factory=TopicModelingTextSourceConfig
+    )
     llm_extraction: TopicModelingLlmExtractionConfig = Field(
         default_factory=TopicModelingLlmExtractionConfig
     )
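A short sketch of how ProfilingRecipeConfig behaves, using the model and defaults shown above. Note that pydantic wraps validator failures in a ValidationError, which subclasses ValueError in pydantic v2 (the model_validate/model_validator usage in this diff implies v2):

    from biblicus.analysis.models import ProfilingRecipeConfig

    # Defaults from the model: percentiles [50, 90, 99], top_tag_count 10.
    config = ProfilingRecipeConfig.model_validate({"top_tag_count": 5})
    assert config.percentiles == [50, 90, 99]
    assert config.sample_size is None

    try:
        ProfilingRecipeConfig.model_validate({"percentiles": [90, 50]})
    except ValueError as error:
        print(error)  # reports: percentiles must be sorted in ascending order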
biblicus/analysis/profiling.py
ADDED
@@ -0,0 +1,337 @@
+"""
+Profiling analysis backend for Biblicus.
+"""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+from typing import Dict, Iterable, List, Sequence
+
+from pydantic import BaseModel
+
+from ..corpus import Corpus
+from ..models import CatalogItem, ExtractionRunReference
+from ..retrieval import hash_text
+from ..time import utc_now_iso
+from .base import CorpusAnalysisBackend
+from .models import (
+    AnalysisRecipeManifest,
+    AnalysisRunInput,
+    AnalysisRunManifest,
+    ProfilingDistributionReport,
+    ProfilingExtractedTextReport,
+    ProfilingOutput,
+    ProfilingPercentileValue,
+    ProfilingRawItemsReport,
+    ProfilingRecipeConfig,
+    ProfilingReport,
+    ProfilingTagCount,
+    ProfilingTagReport,
+)
+
+
+class ProfilingBackend(CorpusAnalysisBackend):
+    """
+    Profiling analysis backend for corpus composition and coverage.
+
+    :ivar analysis_id: Backend identifier.
+    :vartype analysis_id: str
+    """
+
+    analysis_id = "profiling"
+
+    def run_analysis(
+        self,
+        corpus: Corpus,
+        *,
+        recipe_name: str,
+        config: Dict[str, object],
+        extraction_run: ExtractionRunReference,
+    ) -> BaseModel:
+        """
+        Run the profiling analysis pipeline.
+
+        :param corpus: Corpus to analyze.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Analysis configuration values.
+        :type config: dict[str, object]
+        :param extraction_run: Extraction run reference for text inputs.
+        :type extraction_run: biblicus.models.ExtractionRunReference
+        :return: Profiling output model.
+        :rtype: pydantic.BaseModel
+        """
+        parsed_config = (
+            config
+            if isinstance(config, ProfilingRecipeConfig)
+            else ProfilingRecipeConfig.model_validate(config)
+        )
+        return _run_profiling(
+            corpus=corpus,
+            recipe_name=recipe_name,
+            config=parsed_config,
+            extraction_run=extraction_run,
+        )
+
+
+def _run_profiling(
+    *,
+    corpus: Corpus,
+    recipe_name: str,
+    config: ProfilingRecipeConfig,
+    extraction_run: ExtractionRunReference,
+) -> ProfilingOutput:
+    recipe = _create_recipe_manifest(name=recipe_name, config=config)
+    catalog = corpus.load_catalog()
+    run_id = _analysis_run_id(
+        recipe_id=recipe.recipe_id,
+        extraction_run=extraction_run,
+        catalog_generated_at=catalog.generated_at,
+    )
+    run_manifest = AnalysisRunManifest(
+        run_id=run_id,
+        recipe=recipe,
+        corpus_uri=catalog.corpus_uri,
+        catalog_generated_at=catalog.generated_at,
+        created_at=utc_now_iso(),
+        input=AnalysisRunInput(extraction_run=extraction_run),
+        artifact_paths=[],
+        stats={},
+    )
+    run_dir = corpus.analysis_run_dir(analysis_id=ProfilingBackend.analysis_id, run_id=run_id)
+    output_path = run_dir / "output.json"
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    ordered_items = _ordered_catalog_items(catalog.items, catalog.order)
+    raw_report = _build_raw_items_report(items=ordered_items, config=config)
+    extracted_report = _build_extracted_text_report(
+        corpus=corpus,
+        extraction_run=extraction_run,
+        config=config,
+    )
+
+    report = ProfilingReport(
+        raw_items=raw_report,
+        extracted_text=extracted_report,
+        warnings=[],
+        errors=[],
+    )
+
+    run_stats = {
+        "raw_items": raw_report.total_items,
+        "extracted_nonempty_items": extracted_report.extracted_nonempty_items,
+        "extracted_missing_items": extracted_report.extracted_missing_items,
+    }
+    run_manifest = run_manifest.model_copy(
+        update={"artifact_paths": ["output.json"], "stats": run_stats}
+    )
+    _write_analysis_run_manifest(run_dir=run_dir, manifest=run_manifest)
+
+    output = ProfilingOutput(
+        analysis_id=ProfilingBackend.analysis_id,
+        generated_at=utc_now_iso(),
+        run=run_manifest,
+        report=report,
+    )
+    _write_profiling_output(path=output_path, output=output)
+    return output
+
+
+def _create_recipe_manifest(*, name: str, config: ProfilingRecipeConfig) -> AnalysisRecipeManifest:
+    recipe_payload = json.dumps(
+        {
+            "analysis_id": ProfilingBackend.analysis_id,
+            "name": name,
+            "config": config.model_dump(),
+        },
+        sort_keys=True,
+    )
+    recipe_id = hash_text(recipe_payload)
+    return AnalysisRecipeManifest(
+        recipe_id=recipe_id,
+        analysis_id=ProfilingBackend.analysis_id,
+        name=name,
+        created_at=utc_now_iso(),
+        config=config.model_dump(),
+    )
+
+
+def _analysis_run_id(
+    *, recipe_id: str, extraction_run: ExtractionRunReference, catalog_generated_at: str
+) -> str:
+    run_seed = f"{recipe_id}:{extraction_run.as_string()}:{catalog_generated_at}"
+    return hash_text(run_seed)
+
+
+def _ordered_catalog_items(
+    items: Dict[str, CatalogItem],
+    order: Sequence[str],
+) -> List[CatalogItem]:
+    ordered: List[CatalogItem] = []
+    seen = set()
+    for item_id in order:
+        item = items.get(item_id)
+        if item is None:
+            continue
+        ordered.append(item)
+        seen.add(item_id)
+    for item_id in sorted(items):
+        if item_id in seen:
+            continue
+        ordered.append(items[item_id])
+    return ordered
+
+
+def _build_raw_items_report(
+    *, items: Sequence[CatalogItem], config: ProfilingRecipeConfig
+) -> ProfilingRawItemsReport:
+    media_type_counts: Dict[str, int] = {}
+    for item in items:
+        media_type_counts[item.media_type] = media_type_counts.get(item.media_type, 0) + 1
+
+    bytes_values = [item.bytes for item in _apply_sample(items, config.sample_size)]
+    bytes_distribution = _build_distribution(bytes_values, config.percentiles)
+    tag_report = _build_tag_report(items=items, config=config)
+
+    return ProfilingRawItemsReport(
+        total_items=len(items),
+        media_type_counts=media_type_counts,
+        bytes_distribution=bytes_distribution,
+        tags=tag_report,
+    )
+
+
+def _build_tag_report(
+    *, items: Sequence[CatalogItem], config: ProfilingRecipeConfig
+) -> ProfilingTagReport:
+    tag_filters = config.tag_filters
+    tag_filter_set = set(tag_filters or [])
+    tag_counts: Dict[str, int] = {}
+    tagged_items = 0
+
+    for item in items:
+        tags = list(item.tags)
+        if tag_filters is not None:
+            tags = [tag for tag in tags if tag in tag_filter_set]
+        if tags:
+            tagged_items += 1
+        for tag in tags:
+            tag_counts[tag] = tag_counts.get(tag, 0) + 1
+
+    untagged_items = len(items) - tagged_items
+    top_tags = sorted(tag_counts.items(), key=lambda entry: (-entry[1], entry[0]))
+    top_tags = top_tags[: config.top_tag_count]
+    return ProfilingTagReport(
+        tagged_items=tagged_items,
+        untagged_items=untagged_items,
+        total_unique_tags=len(tag_counts),
+        top_tags=[ProfilingTagCount(tag=tag, count=count) for tag, count in top_tags],
+        tag_filters=tag_filters,
+    )
+
+
+def _build_extracted_text_report(
+    *,
+    corpus: Corpus,
+    extraction_run: ExtractionRunReference,
+    config: ProfilingRecipeConfig,
+) -> ProfilingExtractedTextReport:
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=extraction_run.extractor_id,
+        run_id=extraction_run.run_id,
+    )
+    nonempty_items = 0
+    empty_items = 0
+    missing_items = 0
+    text_lengths: List[int] = []
+    text_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_run.extractor_id,
+        run_id=extraction_run.run_id,
+    )
+
+    for item_result in manifest.items:
+        if item_result.status != "extracted" or item_result.final_text_relpath is None:
+            missing_items += 1
+            continue
+        text_path = text_dir / item_result.final_text_relpath
+        text_value = text_path.read_text(encoding="utf-8")
+        stripped = text_value.strip()
+        if not stripped:
+            empty_items += 1
+            continue
+        if config.min_text_characters is not None and len(stripped) < config.min_text_characters:
+            empty_items += 1
+            continue
+        nonempty_items += 1
+        text_lengths.append(len(text_value))
+
+    sampled_lengths = _apply_sample(text_lengths, config.sample_size)
+    characters_distribution = _build_distribution(sampled_lengths, config.percentiles)
+    return ProfilingExtractedTextReport(
+        source_items=len(manifest.items),
+        extracted_nonempty_items=nonempty_items,
+        extracted_empty_items=empty_items,
+        extracted_missing_items=missing_items,
+        characters_distribution=characters_distribution,
+    )
+
+
+def _apply_sample(values: Sequence, sample_size: int | None) -> List:
+    if sample_size is None:
+        return list(values)
+    return list(values[:sample_size])
+
+
+def _build_distribution(
+    values: Sequence[int], percentiles: Iterable[int]
+) -> ProfilingDistributionReport:
+    if not values:
+        percentile_values = [
+            ProfilingPercentileValue(percentile=percentile, value=0.0) for percentile in percentiles
+        ]
+        return ProfilingDistributionReport(
+            count=0,
+            min_value=0.0,
+            max_value=0.0,
+            mean_value=0.0,
+            percentiles=percentile_values,
+        )
+    sorted_values = sorted(values)
+    count = len(sorted_values)
+    min_value = float(sorted_values[0])
+    max_value = float(sorted_values[-1])
+    mean_value = float(sum(sorted_values)) / count
+    percentile_values = [
+        ProfilingPercentileValue(
+            percentile=percentile,
+            value=float(_percentile_value(sorted_values, percentile)),
+        )
+        for percentile in percentiles
+    ]
+    return ProfilingDistributionReport(
+        count=count,
+        min_value=min_value,
+        max_value=max_value,
+        mean_value=mean_value,
+        percentiles=percentile_values,
+    )
+
+
+def _percentile_value(sorted_values: Sequence[int], percentile: int) -> int:
+    if not sorted_values:
+        return 0
+    index = max(0, math.ceil((percentile / 100) * len(sorted_values)) - 1)
+    index = min(index, len(sorted_values) - 1)
+    return int(sorted_values[index])
+
+
+def _write_analysis_run_manifest(*, run_dir: Path, manifest: AnalysisRunManifest) -> None:
+    manifest_path = run_dir / "manifest.json"
+    manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
+
+
+def _write_profiling_output(*, path: Path, output: ProfilingOutput) -> None:
+    path.write_text(output.model_dump_json(indent=2) + "\n", encoding="utf-8")
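The distribution summary uses a nearest-rank percentile. A small worked example with invented values, mirroring the _percentile_value arithmetic above:

    import math

    def percentile_value(sorted_values, percentile):
        # Nearest-rank index, clamped to the list bounds (same arithmetic as above).
        index = max(0, math.ceil((percentile / 100) * len(sorted_values)) - 1)
        return sorted_values[min(index, len(sorted_values) - 1)]

    values = [50, 80, 120, 170, 200]  # already sorted
    print(percentile_value(values, 50))  # 120: ceil(0.50 * 5) - 1 = index 2
    print(percentile_value(values, 99))  # 200: ceil(0.99 * 5) - 1 = index 4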
biblicus/analysis/topic_modeling.py
CHANGED
@@ -452,7 +452,7 @@ def _run_bertopic(
     except ImportError as import_error:
         raise ValueError(
             "Vectorizer configuration requires scikit-learn. "
-
+            'Install with pip install "biblicus[topic-modeling]".'
         ) from import_error
     bertopic_kwargs["vectorizer_model"] = CountVectorizer(
         ngram_range=tuple(config.vectorizer.ngram_range),
@@ -504,13 +504,10 @@ def _group_documents_by_topic(
     return grouped
 
 
-def _resolve_topic_keywords(
-    *, topic_model: Any, topic_id: int
-) -> List[TopicModelingKeyword]:
+def _resolve_topic_keywords(*, topic_model: Any, topic_id: int) -> List[TopicModelingKeyword]:
     raw_keywords = topic_model.get_topic(topic_id) or []
     return [
-        TopicModelingKeyword(keyword=str(entry[0]), score=float(entry[1]))
-        for entry in raw_keywords
+        TopicModelingKeyword(keyword=str(entry[0]), score=float(entry[1])) for entry in raw_keywords
     ]
biblicus/backends/sqlite_full_text_search.py
CHANGED
@@ -213,8 +213,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
     :return: None.
     :rtype: None
     """
-    conn.execute(
-        """
+    conn.execute("""
         CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
             content,
             item_id UNINDEXED,
@@ -225,8 +224,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
             start_offset UNINDEXED,
             end_offset UNINDEXED
        )
-    """
-    )
+    """)
 
 
 def _build_full_text_search_index(
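For readers unfamiliar with SQLite FTS5, a self-contained sketch of how a virtual table like the one above answers MATCH queries. The table and column names follow the hunk; the data and query are invented for illustration, and FTS5 must be available in the SQLite build (it is in standard CPython distributions):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("""
        CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
            content,
            item_id UNINDEXED
        )
    """)
    conn.execute(
        "INSERT INTO chunks_full_text_search (content, item_id) VALUES (?, ?)",
        ("profiling summarizes corpus composition", "item-1"),
    )
    # MATCH performs the full-text query; UNINDEXED columns are stored, not searched.
    rows = conn.execute(
        "SELECT item_id FROM chunks_full_text_search WHERE chunks_full_text_search MATCH ?",
        ("corpus",),
    ).fetchall()
    print(rows)  # [('item-1',)]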
biblicus/cli.py
CHANGED
@@ -563,7 +563,9 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
     """
     input_text = sys.stdin.read()
     if not input_text.strip():
-        raise ValueError(
+        raise ValueError(
+            "Context pack build requires a retrieval result JavaScript Object Notation on standard input"
+        )
     retrieval_result = RetrievalResult.model_validate_json(input_text)
     join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
     policy = ContextPackPolicy(join_with=join_with)
@@ -685,6 +687,58 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
+    """
+    Run profiling analysis for a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    import yaml
+
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+
+    recipe_data: dict[str, object] = {}
+    if arguments.recipe is not None:
+        recipe_path = Path(arguments.recipe)
+        if not recipe_path.is_file():
+            raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
+        recipe_raw = yaml.safe_load(recipe_path.read_text(encoding="utf-8")) or {}
+        if not isinstance(recipe_raw, dict):
+            raise ValueError("Profiling recipe must be a mapping/object")
+        recipe_data = recipe_raw
+
+    if arguments.extraction_run:
+        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    else:
+        extraction_run = corpus.latest_extraction_run_reference()
+        if extraction_run is None:
+            raise ValueError("Profiling analysis requires an extraction run to supply text inputs")
+        print(
+            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            file=sys.stderr,
+        )
+
+    backend = get_analysis_backend("profiling")
+    try:
+        output = backend.run_analysis(
+            corpus,
+            recipe_name=arguments.recipe_name,
+            config=recipe_data,
+            extraction_run=extraction_run,
+        )
+    except ValidationError as exc:
+        raise ValueError(f"Invalid profiling recipe: {exc}") from exc
+    print(output.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -890,14 +944,20 @@ def build_parser() -> argparse.ArgumentParser:
 
     p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
     _add_common_corpus_arg(p_crawl)
-    p_crawl.add_argument(
+    p_crawl.add_argument(
+        "--root-url", required=True, help="Root uniform resource locator to fetch."
+    )
     p_crawl.add_argument(
         "--allowed-prefix",
         required=True,
         help="Uniform resource locator prefix that limits which links are eligible for crawl.",
     )
-    p_crawl.add_argument(
-
+    p_crawl.add_argument(
+        "--max-items", type=int, default=50, help="Maximum number of items to store."
+    )
+    p_crawl.add_argument(
+        "--tags", default=None, help="Comma-separated tags to apply to stored items."
+    )
     p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
     p_crawl.set_defaults(func=cmd_crawl)
 
@@ -923,6 +983,25 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_analyze_topics.set_defaults(func=cmd_analyze_topics)
 
+    p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
+    _add_common_corpus_arg(p_analyze_profile)
+    p_analyze_profile.add_argument(
+        "--recipe",
+        default=None,
+        help="Optional profiling recipe YAML file.",
+    )
+    p_analyze_profile.add_argument(
+        "--recipe-name",
+        default="default",
+        help="Human-readable recipe name.",
+    )
+    p_analyze_profile.add_argument(
+        "--extraction-run",
+        default=None,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_analyze_profile.set_defaults(func=cmd_analyze_profile)
+
     return parser
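Putting the new wiring together, the subcommand can be invoked as shown below (assuming the console script installed from entry_points.txt is named biblicus; the recipe path is illustrative, and extractor_id:run_id is the placeholder form from the help text):

    biblicus analyze profile --recipe profiling.yaml --extraction-run extractor_id:run_id

When --extraction-run is omitted, cmd_analyze_profile falls back to the latest extraction run and prints a reproducibility warning to standard error; the profiling output is printed to standard output as JSON.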
biblicus/corpus.py
CHANGED
@@ -622,7 +622,9 @@ class Corpus:
         data = json.loads(manifest_path.read_text(encoding="utf-8"))
         return ExtractionRunManifest.model_validate(data)
 
-    def list_extraction_runs(
+    def list_extraction_runs(
+        self, *, extractor_id: Optional[str] = None
+    ) -> List[ExtractionRunListEntry]:
         """
         List extraction runs stored under the corpus.
 
@@ -669,7 +671,9 @@ class Corpus:
             )
         )
 
-        entries.sort(
+        entries.sort(
+            key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True
+        )
         return entries
 
     def latest_extraction_run_reference(
@@ -1366,7 +1370,9 @@ class Corpus:
         """
         _ = filename
         item_id = str(uuid.uuid4())
-        destination_relpath = str(
+        destination_relpath = str(
+            Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path
+        )
         destination_path = (self.root / destination_relpath).resolve()
         destination_path.parent.mkdir(parents=True, exist_ok=True)
         destination_path.write_bytes(data)
biblicus/evidence_processing.py
CHANGED
@@ -99,7 +99,10 @@ class EvidenceRerankLongestText(EvidenceReranker):
         """
         return sorted(
             evidence,
-            key=lambda evidence_item: (
+            key=lambda evidence_item: (
+                -len((evidence_item.text or "").strip()),
+                evidence_item.item_id,
+            ),
         )
 
 
@@ -198,4 +201,3 @@ def apply_evidence_filter(
     """
     evidence_filter = _EVIDENCE_FILTERS[filter_id]
     return evidence_filter.filter(query_text=query_text, evidence=evidence, config=config)
-
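The rerank change adds item_id as a tie-breaker so equal-length texts sort deterministically. A minimal illustration with stand-in objects rather than the package's evidence type:

    from types import SimpleNamespace

    evidence = [
        SimpleNamespace(text="bb", item_id="b"),
        SimpleNamespace(text="cccc", item_id="c"),
        SimpleNamespace(text="aa", item_id="a"),
    ]
    ranked = sorted(
        evidence,
        key=lambda e: (-len((e.text or "").strip()), e.item_id),
    )
    print([e.item_id for e in ranked])  # ['c', 'a', 'b']: longest first, ties by item_id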
biblicus/extraction.py
CHANGED
@@ -345,7 +345,9 @@ def build_extraction_run(
     manifest = create_extraction_run_manifest(corpus, recipe=recipe)
     run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
     if run_dir.exists():
-        return corpus.load_extraction_run_manifest(
+        return corpus.load_extraction_run_manifest(
+            extractor_id=extractor_id, run_id=manifest.run_id
+        )
     run_dir.mkdir(parents=True, exist_ok=False)
 
     catalog = corpus.load_catalog()
biblicus/extractors/paddleocr_vl_text.py
CHANGED
@@ -152,9 +152,7 @@ class PaddleOcrVlExtractor(TextExtractor):
             parsed_config.backend.api_provider,
             config_override=parsed_config.backend.api_key,
         )
-        text, confidence = self._extract_via_api(
-            source_path, parsed_config, api_key
-        )
+        text, confidence = self._extract_via_api(source_path, parsed_config, api_key)
 
         return ExtractedText(
             text=text,
biblicus/user_config.py
CHANGED
@@ -170,9 +170,7 @@ def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Op
     return loaded.openai.api_key
 
 
-def resolve_huggingface_api_key(
-    *, config: Optional[BiblicusUserConfig] = None
-) -> Optional[str]:
+def resolve_huggingface_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
     """
     Resolve a HuggingFace API key from environment or user configuration.
 
@@ -192,9 +190,7 @@ def resolve_huggingface_api_key(
     return loaded.huggingface.api_key
 
 
-def resolve_deepgram_api_key(
-    *, config: Optional[BiblicusUserConfig] = None
-) -> Optional[str]:
+def resolve_deepgram_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
     """
     Resolve a Deepgram API key from environment or user configuration.
 
{biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.
+Version: 0.10.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -531,12 +531,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
 
 ## Topic modeling analysis
 
-Biblicus can run analysis pipelines on extracted text without changing the raw corpus.
-
-
-JavaScript Object Notation.
+Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
+are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
+an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
+optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
 
-See `docs/ANALYSIS.md` for the analysis pipeline overview
+See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
+`docs/TOPIC_MODELING.md` for topic modeling details.
 
 Run a topic analysis using a recipe file:
 
{biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
-biblicus/__init__.py,sha256=
+biblicus/__init__.py,sha256=BejOPHIlCnT74pu9fNuLm14HsmWjGqCIwpfD9hDOqSo,496
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
-biblicus/cli.py,sha256=
+biblicus/cli.py,sha256=aH3plnednnYgcPnSoYQf200nboKc6N-tuc3FuLPQEcU,35132
 biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
 biblicus/context.py,sha256=qnT9CH7_ldoPcg-rxnUOtRhheOmpDAbF8uqhf8OdjC4,5832
-biblicus/corpus.py,sha256=
+biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
 biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
 biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
 biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
-biblicus/evidence_processing.py,sha256=
-biblicus/extraction.py,sha256=
+biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
+biblicus/extraction.py,sha256=qvrsq6zSz2Kg-cap-18HPHC9pQlqEGo7pyID2uKCyBo,19760
 biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
 biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
 biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
@@ -21,30 +21,31 @@ biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
 biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
 biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
 biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
-biblicus/user_config.py,sha256=
-biblicus/_vendor/dotyaml/__init__.py,sha256=
-biblicus/_vendor/dotyaml/interpolation.py,sha256=
-biblicus/_vendor/dotyaml/loader.py,sha256=
-biblicus/_vendor/dotyaml/transformer.py,sha256=
-biblicus/analysis/__init__.py,sha256=
+biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
+biblicus/_vendor/dotyaml/__init__.py,sha256=WAWdbFNFqO5cJPthxA8Kx-L76Bh07sKMosUxC_3o9qA,375
+biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
+biblicus/_vendor/dotyaml/loader.py,sha256=zy_zinR5fiatmRyZSiELHv1vVz1Y2eRSboSf_x3kfi4,5623
+biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
+biblicus/analysis/__init__.py,sha256=Z4Wb4d-EoUuGHkcfRm9ILuZ8vr9FBqRxC0u1i6Fp_0w,1288
 biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
 biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
-biblicus/analysis/models.py,sha256=
+biblicus/analysis/models.py,sha256=LuR52w27JRzV-Mr-WAOduZrBOCTrp5uYkMc46QHTRrI,27300
+biblicus/analysis/profiling.py,sha256=z4w14LVJrTEXcQ3PBNwwb_61KuuwQgXw4-EiAaxOQ4Y,10672
 biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
-biblicus/analysis/topic_modeling.py,sha256=
+biblicus/analysis/topic_modeling.py,sha256=ZGXvm2MyU6plxz2FE1RQU-3bra6QZ-t8EJj8kG1TW0M,19438
 biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
-biblicus/backends/sqlite_full_text_search.py,sha256=
+biblicus/backends/sqlite_full_text_search.py,sha256=XFuIbEHYWMD9JkjgRZcgYH3kP3b4hRnJ3PwP8rSFjUU,16502
 biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
 biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
 biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
 biblicus/extractors/docling_granite_text.py,sha256=aFNx-HubvaMmVJHbNqk3CR_ilSwN96-phkaENT6E2B0,6879
 biblicus/extractors/docling_smol_text.py,sha256=cSbQcT4O47MMcM6_pmQCvqgC5ferLvaxJnm3v9EQd0A,6811
-biblicus/extractors/markitdown_text.py,sha256
+biblicus/extractors/markitdown_text.py,sha256=ZvN2TFh65icTTdzCe7L-ZB8zTPP2mxQ4MhOOqSc81Z0,4547
 biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
 biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
-biblicus/extractors/paddleocr_vl_text.py,sha256=
+biblicus/extractors/paddleocr_vl_text.py,sha256=59csxihkqK0lELpAtK2YLcfbSUvNGiuOw7CwPa_0l_c,11692
 biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
 biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
 biblicus/extractors/pipeline.py,sha256=LY6eM3ypw50MDB2cPEQqZrjxkhVvIc6sv4UEhHdNDrE,3208
@@ -54,9 +55,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
+biblicus-0.10.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.10.0.dist-info/METADATA,sha256=xZ7scJLdlKHRtm0EU5Ravq5ih2mS2KNfMbbLXNqZ8Ek,27455
+biblicus-0.10.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.10.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.10.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.10.0.dist-info/RECORD,,
{biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/WHEEL
File without changes
{biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/entry_points.txt
File without changes
{biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/licenses/LICENSE
File without changes
{biblicus-0.9.0.dist-info → biblicus-0.10.0.dist-info}/top_level.txt
File without changes