biblicus 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
     "RetrievalRun",
 ]
 
-__version__ = "0.9.0"
+__version__ = "0.10.0"
biblicus/_vendor/dotyaml/__init__.py CHANGED
@@ -11,4 +11,3 @@ from .interpolation import interpolate_env_vars
 from .loader import ConfigLoader, load_config
 
 __all__ = ["ConfigLoader", "interpolate_env_vars", "load_config"]
-
biblicus/_vendor/dotyaml/interpolation.py CHANGED
@@ -60,4 +60,3 @@ def _interpolate_string(text: str) -> str:
         raise ValueError(f"Required environment variable '{env_var}' not found")
 
     return re.sub(pattern, replace_match, text)
-
biblicus/_vendor/dotyaml/loader.py CHANGED
@@ -178,4 +178,3 @@ class ConfigLoader:
         for key, value in flat_config.items():
             if override or key not in os.environ:
                 os.environ[key] = value
-
biblicus/_vendor/dotyaml/transformer.py CHANGED
@@ -132,4 +132,3 @@ def convert_string_to_value(value: str) -> Any:
         return json.loads(value)
     except (json.JSONDecodeError, ValueError):
         return value
-
biblicus/analysis/__init__.py CHANGED
@@ -7,6 +7,7 @@ from __future__ import annotations
 from typing import Dict, Type
 
 from .base import CorpusAnalysisBackend
+from .profiling import ProfilingBackend
 from .topic_modeling import TopicModelingBackend
 
 
@@ -18,6 +19,7 @@ def available_analysis_backends() -> Dict[str, Type[CorpusAnalysisBackend]]:
     :rtype: dict[str, Type[CorpusAnalysisBackend]]
     """
     return {
+        ProfilingBackend.analysis_id: ProfilingBackend,
         TopicModelingBackend.analysis_id: TopicModelingBackend,
     }
 
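With this change the profiling backend becomes discoverable through the same registry as topic modeling. A minimal lookup sketch, assuming `available_analysis_backends` is importable from `biblicus.analysis` as the hunk header suggests:

```python
from biblicus.analysis import available_analysis_backends

# The registry maps analysis_id strings to backend classes;
# "profiling" is the identifier added in 0.10.0.
backend_cls = available_analysis_backends()["profiling"]
print(backend_cls.analysis_id)  # -> "profiling"
```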
biblicus/analysis/models.py CHANGED
@@ -84,6 +84,221 @@ class AnalysisRunManifest(AnalysisSchemaModel):
     stats: Dict[str, Any] = Field(default_factory=dict)
 
 
+class ProfilingRecipeConfig(AnalysisSchemaModel):
+    """
+    Recipe configuration for profiling analysis.
+
+    :ivar schema_version: Analysis schema version.
+    :vartype schema_version: int
+    :ivar sample_size: Optional sample size for distribution metrics.
+    :vartype sample_size: int or None
+    :ivar min_text_characters: Optional minimum character count for extracted text inclusion.
+    :vartype min_text_characters: int or None
+    :ivar percentiles: Percentiles to compute for distributions.
+    :vartype percentiles: list[int]
+    :ivar top_tag_count: Maximum number of tags to include in top tag output.
+    :vartype top_tag_count: int
+    :ivar tag_filters: Optional tag filters to limit tag coverage metrics.
+    :vartype tag_filters: list[str] or None
+    """
+
+    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
+    sample_size: Optional[int] = Field(default=None, ge=1)
+    min_text_characters: Optional[int] = Field(default=None, ge=1)
+    percentiles: List[int] = Field(default_factory=lambda: [50, 90, 99])
+    top_tag_count: int = Field(default=10, ge=1)
+    tag_filters: Optional[List[str]] = None
+
+    @model_validator(mode="after")
+    def _validate_schema_version(self) -> "ProfilingRecipeConfig":
+        if self.schema_version != ANALYSIS_SCHEMA_VERSION:
+            raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
+        return self
+
+    @field_validator("percentiles", mode="after")
+    @classmethod
+    def _validate_percentiles(cls, value: List[int]) -> List[int]:
+        if not value:
+            raise ValueError("percentiles must include at least one value")
+        if any(percentile < 1 or percentile > 100 for percentile in value):
+            raise ValueError("percentiles must be between 1 and 100")
+        if value != sorted(value):
+            raise ValueError("percentiles must be sorted in ascending order")
+        return value
+
+    @field_validator("tag_filters", mode="before")
+    @classmethod
+    def _validate_tag_filters(cls, value: object) -> object:
+        if value is None:
+            return None
+        if not isinstance(value, list):
+            raise ValueError("tag_filters must be a list of strings")
+        cleaned = [str(tag).strip() for tag in value]
+        if not cleaned or any(not tag for tag in cleaned):
+            raise ValueError("tag_filters must be a list of non-empty strings")
+        return cleaned
+
+
+class ProfilingPercentileValue(AnalysisSchemaModel):
+    """
+    Percentile entry for a distribution.
+
+    :ivar percentile: Percentile value between 1 and 100.
+    :vartype percentile: int
+    :ivar value: Percentile value.
+    :vartype value: float
+    """
+
+    percentile: int = Field(ge=1, le=100)
+    value: float
+
+
+class ProfilingDistributionReport(AnalysisSchemaModel):
+    """
+    Distribution summary for numeric values.
+
+    :ivar count: Count of values included.
+    :vartype count: int
+    :ivar min_value: Minimum value observed.
+    :vartype min_value: float
+    :ivar max_value: Maximum value observed.
+    :vartype max_value: float
+    :ivar mean_value: Mean value observed.
+    :vartype mean_value: float
+    :ivar percentiles: Percentile values.
+    :vartype percentiles: list[ProfilingPercentileValue]
+    """
+
+    count: int = Field(ge=0)
+    min_value: float
+    max_value: float
+    mean_value: float
+    percentiles: List[ProfilingPercentileValue] = Field(default_factory=list)
+
+
+class ProfilingTagCount(AnalysisSchemaModel):
+    """
+    Tag count entry for profiling output.
+
+    :ivar tag: Tag name.
+    :vartype tag: str
+    :ivar count: Number of items with this tag.
+    :vartype count: int
+    """
+
+    tag: str
+    count: int = Field(ge=0)
+
+
+class ProfilingTagReport(AnalysisSchemaModel):
+    """
+    Tag coverage summary for raw items.
+
+    :ivar tagged_items: Count of items with tags.
+    :vartype tagged_items: int
+    :ivar untagged_items: Count of items without tags.
+    :vartype untagged_items: int
+    :ivar total_unique_tags: Count of unique tags.
+    :vartype total_unique_tags: int
+    :ivar top_tags: Most frequent tags.
+    :vartype top_tags: list[ProfilingTagCount]
+    :ivar tag_filters: Optional tag filters applied.
+    :vartype tag_filters: list[str] or None
+    """
+
+    tagged_items: int = Field(ge=0)
+    untagged_items: int = Field(ge=0)
+    total_unique_tags: int = Field(ge=0)
+    top_tags: List[ProfilingTagCount] = Field(default_factory=list)
+    tag_filters: Optional[List[str]] = None
+
+
+class ProfilingRawItemsReport(AnalysisSchemaModel):
+    """
+    Summary of raw corpus items.
+
+    :ivar total_items: Total number of catalog items.
+    :vartype total_items: int
+    :ivar media_type_counts: Count of items per media type.
+    :vartype media_type_counts: dict[str, int]
+    :ivar bytes_distribution: Distribution of raw item sizes in bytes.
+    :vartype bytes_distribution: ProfilingDistributionReport
+    :ivar tags: Tag coverage summary.
+    :vartype tags: ProfilingTagReport
+    """
+
+    total_items: int = Field(ge=0)
+    media_type_counts: Dict[str, int] = Field(default_factory=dict)
+    bytes_distribution: ProfilingDistributionReport
+    tags: ProfilingTagReport
+
+
+class ProfilingExtractedTextReport(AnalysisSchemaModel):
+    """
+    Summary of extracted text coverage.
+
+    :ivar source_items: Count of source items in the extraction run.
+    :vartype source_items: int
+    :ivar extracted_nonempty_items: Count of extracted items with non-empty text.
+    :vartype extracted_nonempty_items: int
+    :ivar extracted_empty_items: Count of extracted items with empty text.
+    :vartype extracted_empty_items: int
+    :ivar extracted_missing_items: Count of items with no extracted text artifact.
+    :vartype extracted_missing_items: int
+    :ivar characters_distribution: Distribution of extracted text lengths.
+    :vartype characters_distribution: ProfilingDistributionReport
+    """
+
+    source_items: int = Field(ge=0)
+    extracted_nonempty_items: int = Field(ge=0)
+    extracted_empty_items: int = Field(ge=0)
+    extracted_missing_items: int = Field(ge=0)
+    characters_distribution: ProfilingDistributionReport
+
+
+class ProfilingReport(AnalysisSchemaModel):
+    """
+    Report for profiling analysis.
+
+    :ivar raw_items: Raw corpus item summary.
+    :vartype raw_items: ProfilingRawItemsReport
+    :ivar extracted_text: Extracted text coverage summary.
+    :vartype extracted_text: ProfilingExtractedTextReport
+    :ivar warnings: Warning messages.
+    :vartype warnings: list[str]
+    :ivar errors: Error messages.
+    :vartype errors: list[str]
+    """
+
+    raw_items: ProfilingRawItemsReport
+    extracted_text: ProfilingExtractedTextReport
+    warnings: List[str] = Field(default_factory=list)
+    errors: List[str] = Field(default_factory=list)
+
+
+class ProfilingOutput(AnalysisSchemaModel):
+    """
+    Output bundle for profiling analysis.
+
+    :ivar schema_version: Analysis schema version.
+    :vartype schema_version: int
+    :ivar analysis_id: Analysis backend identifier.
+    :vartype analysis_id: str
+    :ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
+    :vartype generated_at: str
+    :ivar run: Analysis run manifest.
+    :vartype run: AnalysisRunManifest
+    :ivar report: Profiling report data.
+    :vartype report: ProfilingReport
+    """
+
+    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
+    analysis_id: str
+    generated_at: str
+    run: AnalysisRunManifest
+    report: ProfilingReport
+
+
 class TopicModelingTextSourceConfig(AnalysisSchemaModel):
     """
     Configuration for text collection within topic modeling.
@@ -124,7 +339,9 @@ class TopicModelingLlmExtractionConfig(AnalysisSchemaModel):
     """
 
     enabled: bool = Field(default=False)
-    method: TopicModelingLlmExtractionMethod = Field(default=TopicModelingLlmExtractionMethod.SINGLE)
+    method: TopicModelingLlmExtractionMethod = Field(
+        default=TopicModelingLlmExtractionMethod.SINGLE
+    )
     client: Optional[LlmClientConfig] = None
     prompt_template: Optional[str] = None
     system_prompt: Optional[str] = None
@@ -136,7 +353,9 @@ class TopicModelingLlmExtractionConfig(AnalysisSchemaModel):
             return value
         if isinstance(value, str):
             return TopicModelingLlmExtractionMethod(value)
-        raise ValueError("llm_extraction.method must be a string or TopicModelingLlmExtractionMethod")
+        raise ValueError(
+            "llm_extraction.method must be a string or TopicModelingLlmExtractionMethod"
+        )
 
     @model_validator(mode="after")
     def _validate_requirements(self) -> "TopicModelingLlmExtractionConfig":
@@ -188,7 +407,9 @@ class TopicModelingVectorizerConfig(AnalysisSchemaModel):
     def _validate_ngram_range(self) -> "TopicModelingVectorizerConfig":
         start, end = self.ngram_range
         if start < 1 or end < start:
-            raise ValueError("vectorizer.ngram_range must include two integers with start >= 1 and end >= start")
+            raise ValueError(
+                "vectorizer.ngram_range must include two integers with start >= 1 and end >= start"
+            )
         return self
 
     @field_validator("stop_words", mode="before")
@@ -201,7 +422,7 @@ class TopicModelingVectorizerConfig(AnalysisSchemaModel):
                 raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
             return value
         if isinstance(value, list):
-            if not all(isinstance(entry, str) and entry for entry in value):
+            if not value or not all(isinstance(entry, str) and entry for entry in value):
                 raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
             return value
         raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
@@ -280,7 +501,9 @@ class TopicModelingRecipeConfig(AnalysisSchemaModel):
     """
 
     schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
-    text_source: TopicModelingTextSourceConfig = Field(default_factory=TopicModelingTextSourceConfig)
+    text_source: TopicModelingTextSourceConfig = Field(
+        default_factory=TopicModelingTextSourceConfig
+    )
     llm_extraction: TopicModelingLlmExtractionConfig = Field(
         default_factory=TopicModelingLlmExtractionConfig
     )
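For orientation, here is a minimal sketch of validating a profiling recipe against the new `ProfilingRecipeConfig` model. The field names and constraints come from the hunk above; the values are hypothetical:

```python
from biblicus.analysis.models import ProfilingRecipeConfig

# Hypothetical recipe values. The validators above require percentiles to be
# ascending integers in 1-100 and tag_filters entries to be non-empty strings.
config = ProfilingRecipeConfig.model_validate(
    {
        "sample_size": 1000,
        "min_text_characters": 20,
        "percentiles": [50, 90, 99],
        "top_tag_count": 5,
        "tag_filters": ["notes", "letters"],
    }
)
```

An empty mapping also validates: every field has a default, with `percentiles` defaulting to `[50, 90, 99]` and `top_tag_count` to 10.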
biblicus/analysis/profiling.py ADDED
@@ -0,0 +1,337 @@
+"""
+Profiling analysis backend for Biblicus.
+"""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+from typing import Dict, Iterable, List, Sequence
+
+from pydantic import BaseModel
+
+from ..corpus import Corpus
+from ..models import CatalogItem, ExtractionRunReference
+from ..retrieval import hash_text
+from ..time import utc_now_iso
+from .base import CorpusAnalysisBackend
+from .models import (
+    AnalysisRecipeManifest,
+    AnalysisRunInput,
+    AnalysisRunManifest,
+    ProfilingDistributionReport,
+    ProfilingExtractedTextReport,
+    ProfilingOutput,
+    ProfilingPercentileValue,
+    ProfilingRawItemsReport,
+    ProfilingRecipeConfig,
+    ProfilingReport,
+    ProfilingTagCount,
+    ProfilingTagReport,
+)
+
+
+class ProfilingBackend(CorpusAnalysisBackend):
+    """
+    Profiling analysis backend for corpus composition and coverage.
+
+    :ivar analysis_id: Backend identifier.
+    :vartype analysis_id: str
+    """
+
+    analysis_id = "profiling"
+
+    def run_analysis(
+        self,
+        corpus: Corpus,
+        *,
+        recipe_name: str,
+        config: Dict[str, object],
+        extraction_run: ExtractionRunReference,
+    ) -> BaseModel:
+        """
+        Run the profiling analysis pipeline.
+
+        :param corpus: Corpus to analyze.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Analysis configuration values.
+        :type config: dict[str, object]
+        :param extraction_run: Extraction run reference for text inputs.
+        :type extraction_run: biblicus.models.ExtractionRunReference
+        :return: Profiling output model.
+        :rtype: pydantic.BaseModel
+        """
+        parsed_config = (
+            config
+            if isinstance(config, ProfilingRecipeConfig)
+            else ProfilingRecipeConfig.model_validate(config)
+        )
+        return _run_profiling(
+            corpus=corpus,
+            recipe_name=recipe_name,
+            config=parsed_config,
+            extraction_run=extraction_run,
+        )
+
+
+def _run_profiling(
+    *,
+    corpus: Corpus,
+    recipe_name: str,
+    config: ProfilingRecipeConfig,
+    extraction_run: ExtractionRunReference,
+) -> ProfilingOutput:
+    recipe = _create_recipe_manifest(name=recipe_name, config=config)
+    catalog = corpus.load_catalog()
+    run_id = _analysis_run_id(
+        recipe_id=recipe.recipe_id,
+        extraction_run=extraction_run,
+        catalog_generated_at=catalog.generated_at,
+    )
+    run_manifest = AnalysisRunManifest(
+        run_id=run_id,
+        recipe=recipe,
+        corpus_uri=catalog.corpus_uri,
+        catalog_generated_at=catalog.generated_at,
+        created_at=utc_now_iso(),
+        input=AnalysisRunInput(extraction_run=extraction_run),
+        artifact_paths=[],
+        stats={},
+    )
+    run_dir = corpus.analysis_run_dir(analysis_id=ProfilingBackend.analysis_id, run_id=run_id)
+    output_path = run_dir / "output.json"
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    ordered_items = _ordered_catalog_items(catalog.items, catalog.order)
+    raw_report = _build_raw_items_report(items=ordered_items, config=config)
+    extracted_report = _build_extracted_text_report(
+        corpus=corpus,
+        extraction_run=extraction_run,
+        config=config,
+    )
+
+    report = ProfilingReport(
+        raw_items=raw_report,
+        extracted_text=extracted_report,
+        warnings=[],
+        errors=[],
+    )
+
+    run_stats = {
+        "raw_items": raw_report.total_items,
+        "extracted_nonempty_items": extracted_report.extracted_nonempty_items,
+        "extracted_missing_items": extracted_report.extracted_missing_items,
+    }
+    run_manifest = run_manifest.model_copy(
+        update={"artifact_paths": ["output.json"], "stats": run_stats}
+    )
+    _write_analysis_run_manifest(run_dir=run_dir, manifest=run_manifest)
+
+    output = ProfilingOutput(
+        analysis_id=ProfilingBackend.analysis_id,
+        generated_at=utc_now_iso(),
+        run=run_manifest,
+        report=report,
+    )
+    _write_profiling_output(path=output_path, output=output)
+    return output
+
+
+def _create_recipe_manifest(*, name: str, config: ProfilingRecipeConfig) -> AnalysisRecipeManifest:
+    recipe_payload = json.dumps(
+        {
+            "analysis_id": ProfilingBackend.analysis_id,
+            "name": name,
+            "config": config.model_dump(),
+        },
+        sort_keys=True,
+    )
+    recipe_id = hash_text(recipe_payload)
+    return AnalysisRecipeManifest(
+        recipe_id=recipe_id,
+        analysis_id=ProfilingBackend.analysis_id,
+        name=name,
+        created_at=utc_now_iso(),
+        config=config.model_dump(),
+    )
+
+
+def _analysis_run_id(
+    *, recipe_id: str, extraction_run: ExtractionRunReference, catalog_generated_at: str
+) -> str:
+    run_seed = f"{recipe_id}:{extraction_run.as_string()}:{catalog_generated_at}"
+    return hash_text(run_seed)
+
+
+def _ordered_catalog_items(
+    items: Dict[str, CatalogItem],
+    order: Sequence[str],
+) -> List[CatalogItem]:
+    ordered: List[CatalogItem] = []
+    seen = set()
+    for item_id in order:
+        item = items.get(item_id)
+        if item is None:
+            continue
+        ordered.append(item)
+        seen.add(item_id)
+    for item_id in sorted(items):
+        if item_id in seen:
+            continue
+        ordered.append(items[item_id])
+    return ordered
+
+
+def _build_raw_items_report(
+    *, items: Sequence[CatalogItem], config: ProfilingRecipeConfig
+) -> ProfilingRawItemsReport:
+    media_type_counts: Dict[str, int] = {}
+    for item in items:
+        media_type_counts[item.media_type] = media_type_counts.get(item.media_type, 0) + 1
+
+    bytes_values = [item.bytes for item in _apply_sample(items, config.sample_size)]
+    bytes_distribution = _build_distribution(bytes_values, config.percentiles)
+    tag_report = _build_tag_report(items=items, config=config)
+
+    return ProfilingRawItemsReport(
+        total_items=len(items),
+        media_type_counts=media_type_counts,
+        bytes_distribution=bytes_distribution,
+        tags=tag_report,
+    )
+
+
+def _build_tag_report(
+    *, items: Sequence[CatalogItem], config: ProfilingRecipeConfig
+) -> ProfilingTagReport:
+    tag_filters = config.tag_filters
+    tag_filter_set = set(tag_filters or [])
+    tag_counts: Dict[str, int] = {}
+    tagged_items = 0
+
+    for item in items:
+        tags = list(item.tags)
+        if tag_filters is not None:
+            tags = [tag for tag in tags if tag in tag_filter_set]
+        if tags:
+            tagged_items += 1
+        for tag in tags:
+            tag_counts[tag] = tag_counts.get(tag, 0) + 1
+
+    untagged_items = len(items) - tagged_items
+    top_tags = sorted(tag_counts.items(), key=lambda entry: (-entry[1], entry[0]))
+    top_tags = top_tags[: config.top_tag_count]
+    return ProfilingTagReport(
+        tagged_items=tagged_items,
+        untagged_items=untagged_items,
+        total_unique_tags=len(tag_counts),
+        top_tags=[ProfilingTagCount(tag=tag, count=count) for tag, count in top_tags],
+        tag_filters=tag_filters,
+    )
+
+
+def _build_extracted_text_report(
+    *,
+    corpus: Corpus,
+    extraction_run: ExtractionRunReference,
+    config: ProfilingRecipeConfig,
+) -> ProfilingExtractedTextReport:
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=extraction_run.extractor_id,
+        run_id=extraction_run.run_id,
+    )
+    nonempty_items = 0
+    empty_items = 0
+    missing_items = 0
+    text_lengths: List[int] = []
+    text_dir = corpus.extraction_run_dir(
+        extractor_id=extraction_run.extractor_id,
+        run_id=extraction_run.run_id,
+    )
+
+    for item_result in manifest.items:
+        if item_result.status != "extracted" or item_result.final_text_relpath is None:
+            missing_items += 1
+            continue
+        text_path = text_dir / item_result.final_text_relpath
+        text_value = text_path.read_text(encoding="utf-8")
+        stripped = text_value.strip()
+        if not stripped:
+            empty_items += 1
+            continue
+        if config.min_text_characters is not None and len(stripped) < config.min_text_characters:
+            empty_items += 1
+            continue
+        nonempty_items += 1
+        text_lengths.append(len(text_value))
+
+    sampled_lengths = _apply_sample(text_lengths, config.sample_size)
+    characters_distribution = _build_distribution(sampled_lengths, config.percentiles)
+    return ProfilingExtractedTextReport(
+        source_items=len(manifest.items),
+        extracted_nonempty_items=nonempty_items,
+        extracted_empty_items=empty_items,
+        extracted_missing_items=missing_items,
+        characters_distribution=characters_distribution,
+    )
+
+
+def _apply_sample(values: Sequence, sample_size: int | None) -> List:
+    if sample_size is None:
+        return list(values)
+    return list(values[:sample_size])
+
+
+def _build_distribution(
+    values: Sequence[int], percentiles: Iterable[int]
+) -> ProfilingDistributionReport:
+    if not values:
+        percentile_values = [
+            ProfilingPercentileValue(percentile=percentile, value=0.0) for percentile in percentiles
+        ]
+        return ProfilingDistributionReport(
+            count=0,
+            min_value=0.0,
+            max_value=0.0,
+            mean_value=0.0,
+            percentiles=percentile_values,
+        )
+    sorted_values = sorted(values)
+    count = len(sorted_values)
+    min_value = float(sorted_values[0])
+    max_value = float(sorted_values[-1])
+    mean_value = float(sum(sorted_values)) / count
+    percentile_values = [
+        ProfilingPercentileValue(
+            percentile=percentile,
+            value=float(_percentile_value(sorted_values, percentile)),
+        )
+        for percentile in percentiles
+    ]
+    return ProfilingDistributionReport(
+        count=count,
+        min_value=min_value,
+        max_value=max_value,
+        mean_value=mean_value,
+        percentiles=percentile_values,
+    )
+
+
+def _percentile_value(sorted_values: Sequence[int], percentile: int) -> int:
+    if not sorted_values:
+        return 0
+    index = max(0, math.ceil((percentile / 100) * len(sorted_values)) - 1)
+    index = min(index, len(sorted_values) - 1)
+    return int(sorted_values[index])
+
+
+def _write_analysis_run_manifest(*, run_dir: Path, manifest: AnalysisRunManifest) -> None:
+    manifest_path = run_dir / "manifest.json"
+    manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
+
+
+def _write_profiling_output(*, path: Path, output: ProfilingOutput) -> None:
+    path.write_text(output.model_dump_json(indent=2) + "\n", encoding="utf-8")
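Worth noting: `_percentile_value` implements nearest-rank percentile selection over the sorted values. A small worked example of the same arithmetic, assuming a non-empty input (the helper itself returns 0 for an empty list):

```python
import math

def nearest_rank(sorted_values, percentile):
    # Same arithmetic as _percentile_value: nearest-rank index, clamped to bounds.
    index = max(0, math.ceil((percentile / 100) * len(sorted_values)) - 1)
    return sorted_values[min(index, len(sorted_values) - 1)]

values = [10, 20, 30, 40]
assert nearest_rank(values, 50) == 20  # ceil(0.5 * 4) - 1 = index 1
assert nearest_rank(values, 90) == 40  # ceil(0.9 * 4) - 1 = index 3
```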
biblicus/analysis/topic_modeling.py CHANGED
@@ -452,7 +452,7 @@ def _run_bertopic(
     except ImportError as import_error:
         raise ValueError(
             "Vectorizer configuration requires scikit-learn. "
-            "Install with pip install \"biblicus[topic-modeling]\"."
+            'Install with pip install "biblicus[topic-modeling]".'
         ) from import_error
     bertopic_kwargs["vectorizer_model"] = CountVectorizer(
         ngram_range=tuple(config.vectorizer.ngram_range),
@@ -504,13 +504,10 @@ def _group_documents_by_topic(
     return grouped
 
 
-def _resolve_topic_keywords(
-    *, topic_model: Any, topic_id: int
-) -> List[TopicModelingKeyword]:
+def _resolve_topic_keywords(*, topic_model: Any, topic_id: int) -> List[TopicModelingKeyword]:
     raw_keywords = topic_model.get_topic(topic_id) or []
     return [
-        TopicModelingKeyword(keyword=str(entry[0]), score=float(entry[1]))
-        for entry in raw_keywords
+        TopicModelingKeyword(keyword=str(entry[0]), score=float(entry[1])) for entry in raw_keywords
     ]
 
 
biblicus/backends/sqlite_full_text_search.py CHANGED
@@ -213,8 +213,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
     :return: None.
     :rtype: None
     """
-    conn.execute(
-        """
+    conn.execute("""
         CREATE VIRTUAL TABLE chunks_full_text_search USING fts5(
             content,
             item_id UNINDEXED,
@@ -225,8 +224,7 @@ def _create_full_text_search_schema(conn: sqlite3.Connection) -> None:
             start_offset UNINDEXED,
             end_offset UNINDEXED
         )
-    """
-    )
+    """)
 
 
 def _build_full_text_search_index(
biblicus/cli.py CHANGED
@@ -563,7 +563,9 @@ def cmd_context_pack_build(arguments: argparse.Namespace) -> int:
     """
     input_text = sys.stdin.read()
     if not input_text.strip():
-        raise ValueError("Context pack build requires a retrieval result JavaScript Object Notation on standard input")
+        raise ValueError(
+            "Context pack build requires a retrieval result JavaScript Object Notation on standard input"
+        )
     retrieval_result = RetrievalResult.model_validate_json(input_text)
     join_with = bytes(arguments.join_with, "utf-8").decode("unicode_escape")
     policy = ContextPackPolicy(join_with=join_with)
@@ -685,6 +687,58 @@ def cmd_analyze_topics(arguments: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_analyze_profile(arguments: argparse.Namespace) -> int:
+    """
+    Run profiling analysis for a corpus.
+
+    :param arguments: Parsed command-line interface arguments.
+    :type arguments: argparse.Namespace
+    :return: Exit code.
+    :rtype: int
+    """
+    import yaml
+
+    corpus = (
+        Corpus.open(arguments.corpus)
+        if getattr(arguments, "corpus", None)
+        else Corpus.find(Path.cwd())
+    )
+
+    recipe_data: dict[str, object] = {}
+    if arguments.recipe is not None:
+        recipe_path = Path(arguments.recipe)
+        if not recipe_path.is_file():
+            raise FileNotFoundError(f"Recipe file not found: {recipe_path}")
+        recipe_raw = yaml.safe_load(recipe_path.read_text(encoding="utf-8")) or {}
+        if not isinstance(recipe_raw, dict):
+            raise ValueError("Profiling recipe must be a mapping/object")
+        recipe_data = recipe_raw
+
+    if arguments.extraction_run:
+        extraction_run = parse_extraction_run_reference(arguments.extraction_run)
+    else:
+        extraction_run = corpus.latest_extraction_run_reference()
+        if extraction_run is None:
+            raise ValueError("Profiling analysis requires an extraction run to supply text inputs")
+        print(
+            "Warning: using latest extraction run; pass --extraction-run for reproducibility.",
+            file=sys.stderr,
+        )
+
+    backend = get_analysis_backend("profiling")
+    try:
+        output = backend.run_analysis(
+            corpus,
+            recipe_name=arguments.recipe_name,
+            config=recipe_data,
+            extraction_run=extraction_run,
+        )
+    except ValidationError as exc:
+        raise ValueError(f"Invalid profiling recipe: {exc}") from exc
+    print(output.model_dump_json(indent=2))
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     """
     Build the command-line interface argument parser.
@@ -890,14 +944,20 @@ def build_parser() -> argparse.ArgumentParser:
 
     p_crawl = sub.add_parser("crawl", help="Crawl a website prefix into the corpus.")
     _add_common_corpus_arg(p_crawl)
-    p_crawl.add_argument("--root-url", required=True, help="Root uniform resource locator to fetch.")
+    p_crawl.add_argument(
+        "--root-url", required=True, help="Root uniform resource locator to fetch."
+    )
     p_crawl.add_argument(
         "--allowed-prefix",
         required=True,
         help="Uniform resource locator prefix that limits which links are eligible for crawl.",
    )
-    p_crawl.add_argument("--max-items", type=int, default=50, help="Maximum number of items to store.")
-    p_crawl.add_argument("--tags", default=None, help="Comma-separated tags to apply to stored items.")
+    p_crawl.add_argument(
+        "--max-items", type=int, default=50, help="Maximum number of items to store."
+    )
+    p_crawl.add_argument(
+        "--tags", default=None, help="Comma-separated tags to apply to stored items."
+    )
     p_crawl.add_argument("--tag", action="append", help="Repeatable tag to apply to stored items.")
     p_crawl.set_defaults(func=cmd_crawl)
 
@@ -923,6 +983,25 @@ def build_parser() -> argparse.ArgumentParser:
     )
     p_analyze_topics.set_defaults(func=cmd_analyze_topics)
 
+    p_analyze_profile = analyze_sub.add_parser("profile", help="Run profiling analysis.")
+    _add_common_corpus_arg(p_analyze_profile)
+    p_analyze_profile.add_argument(
+        "--recipe",
+        default=None,
+        help="Optional profiling recipe YAML file.",
+    )
+    p_analyze_profile.add_argument(
+        "--recipe-name",
+        default="default",
+        help="Human-readable recipe name.",
+    )
+    p_analyze_profile.add_argument(
+        "--extraction-run",
+        default=None,
+        help="Extraction run reference in the form extractor_id:run_id.",
+    )
+    p_analyze_profile.set_defaults(func=cmd_analyze_profile)
+
     return parser
 
 
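The new subcommand wires the parser to `cmd_analyze_profile` above. A sketch of the equivalent programmatic call, assuming `get_analysis_backend` is exported from `biblicus.analysis` (the CLI refers to it by that name):

```python
from pathlib import Path

from biblicus.analysis import get_analysis_backend
from biblicus.corpus import Corpus

corpus = Corpus.find(Path.cwd())  # locate the enclosing corpus, as the CLI does
extraction_run = corpus.latest_extraction_run_reference()
if extraction_run is not None:
    output = get_analysis_backend("profiling").run_analysis(
        corpus,
        recipe_name="default",
        config={},  # empty config uses ProfilingRecipeConfig defaults
        extraction_run=extraction_run,
    )
    print(output.model_dump_json(indent=2))
```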
biblicus/corpus.py CHANGED
@@ -622,7 +622,9 @@ class Corpus:
         data = json.loads(manifest_path.read_text(encoding="utf-8"))
         return ExtractionRunManifest.model_validate(data)
 
-    def list_extraction_runs(self, *, extractor_id: Optional[str] = None) -> List[ExtractionRunListEntry]:
+    def list_extraction_runs(
+        self, *, extractor_id: Optional[str] = None
+    ) -> List[ExtractionRunListEntry]:
         """
         List extraction runs stored under the corpus.
 
@@ -669,7 +671,9 @@ class Corpus:
             )
         )
 
-        entries.sort(key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True)
+        entries.sort(
+            key=lambda entry: (entry.created_at, entry.extractor_id, entry.run_id), reverse=True
+        )
         return entries
 
     def latest_extraction_run_reference(
@@ -1366,7 +1370,9 @@ class Corpus:
         """
         _ = filename
         item_id = str(uuid.uuid4())
-        destination_relpath = str(Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path)
+        destination_relpath = str(
+            Path(DEFAULT_RAW_DIR) / "imports" / "crawl" / crawl_id / relative_path
+        )
         destination_path = (self.root / destination_relpath).resolve()
         destination_path.parent.mkdir(parents=True, exist_ok=True)
         destination_path.write_bytes(data)
biblicus/evidence_processing.py CHANGED
@@ -99,7 +99,10 @@ class EvidenceRerankLongestText(EvidenceReranker):
         """
         return sorted(
             evidence,
-            key=lambda evidence_item: (-len((evidence_item.text or "").strip()), evidence_item.item_id),
+            key=lambda evidence_item: (
+                -len((evidence_item.text or "").strip()),
+                evidence_item.item_id,
+            ),
         )
 
 
@@ -198,4 +201,3 @@ def apply_evidence_filter(
     """
     evidence_filter = _EVIDENCE_FILTERS[filter_id]
     return evidence_filter.filter(query_text=query_text, evidence=evidence, config=config)
-
biblicus/extraction.py CHANGED
@@ -345,7 +345,9 @@ def build_extraction_run(
     manifest = create_extraction_run_manifest(corpus, recipe=recipe)
     run_dir = corpus.extraction_run_dir(extractor_id=extractor_id, run_id=manifest.run_id)
     if run_dir.exists():
-        return corpus.load_extraction_run_manifest(extractor_id=extractor_id, run_id=manifest.run_id)
+        return corpus.load_extraction_run_manifest(
+            extractor_id=extractor_id, run_id=manifest.run_id
+        )
     run_dir.mkdir(parents=True, exist_ok=False)
 
     catalog = corpus.load_catalog()
biblicus/extractors/markitdown_text.py CHANGED
@@ -29,6 +29,7 @@ class MarkItDownExtractorConfig(BaseModel):
 
     enable_plugins: bool = Field(default=False)
 
+
 class MarkItDownExtractor(TextExtractor):
     """
     Extractor plugin backed by the `markitdown` library.
biblicus/extractors/paddleocr_vl_text.py CHANGED
@@ -152,9 +152,7 @@ class PaddleOcrVlExtractor(TextExtractor):
             parsed_config.backend.api_provider,
             config_override=parsed_config.backend.api_key,
         )
-        text, confidence = self._extract_via_api(
-            source_path, parsed_config, api_key
-        )
+        text, confidence = self._extract_via_api(source_path, parsed_config, api_key)
 
         return ExtractedText(
             text=text,
biblicus/user_config.py CHANGED
@@ -170,9 +170,7 @@ def resolve_openai_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Op
     return loaded.openai.api_key
 
 
-def resolve_huggingface_api_key(
-    *, config: Optional[BiblicusUserConfig] = None
-) -> Optional[str]:
+def resolve_huggingface_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
     """
     Resolve a HuggingFace API key from environment or user configuration.
 
@@ -192,9 +190,7 @@ def resolve_huggingface_api_key(
     return loaded.huggingface.api_key
 
 
-def resolve_deepgram_api_key(
-    *, config: Optional[BiblicusUserConfig] = None
-) -> Optional[str]:
+def resolve_deepgram_api_key(*, config: Optional[BiblicusUserConfig] = None) -> Optional[str]:
     """
     Resolve a Deepgram API key from environment or user configuration.
 
biblicus-0.9.0.dist-info/METADATA → biblicus-0.10.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.9.0
+Version: 0.10.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -531,12 +531,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
 
 ## Topic modeling analysis
 
-Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
-analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
-processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
-JavaScript Object Notation.
+Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
+are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
+an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
+optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
 
-See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
+See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
+`docs/TOPIC_MODELING.md` for topic modeling details.
 
 Run a topic analysis using a recipe file:
 
biblicus-0.9.0.dist-info/RECORD → biblicus-0.10.0.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
-biblicus/__init__.py,sha256=x14R9a_6nu3qTg2F-sUOaS_ZepXNBPpa3nsEgp4PZhg,495
+biblicus/__init__.py,sha256=BejOPHIlCnT74pu9fNuLm14HsmWjGqCIwpfD9hDOqSo,496
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
-biblicus/cli.py,sha256=GVmZlCSZPUMBbq69yjN16f4xNw71edlFbGPHX3300oI,32643
+biblicus/cli.py,sha256=aH3plnednnYgcPnSoYQf200nboKc6N-tuc3FuLPQEcU,35132
 biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
 biblicus/context.py,sha256=qnT9CH7_ldoPcg-rxnUOtRhheOmpDAbF8uqhf8OdjC4,5832
-biblicus/corpus.py,sha256=Pq2OvXom7giwD1tuWoM3RhFnak5YFx5bCh6JTd6JYtI,55554
+biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
 biblicus/crawl.py,sha256=n8rXBMnziBK9vtKQQCXYOpBzqsPCswj2PzVJUb370KY,6250
 biblicus/errors.py,sha256=uMajd5DvgnJ_-jq5sbeom1GV8DPUc-kojBaECFi6CsY,467
 biblicus/evaluation.py,sha256=5xWpb-8f49Osh9aHzo1ab3AXOmls3Imc5rdnEC0pN-8,8143
-biblicus/evidence_processing.py,sha256=EMv1AkV_Eufk-poBz9nRR1dZgC-QewvI-NrULBUGVGA,6074
-biblicus/extraction.py,sha256=20lRxz6Te6IcA4d-rfT4qjJtgRG_c4YvrqfXNA7EYfs,19738
+biblicus/evidence_processing.py,sha256=sJe6T1nLxvU0xs9yMH8JZZS19zHXMR-Fpr5lWi5ndUM,6120
+biblicus/extraction.py,sha256=qvrsq6zSz2Kg-cap-18HPHC9pQlqEGo7pyID2uKCyBo,19760
 biblicus/frontmatter.py,sha256=JOGjIDzbbOkebQw2RzA-3WDVMAMtJta2INjS4e7-LMg,2463
 biblicus/hook_logging.py,sha256=IMvde-JhVWrx9tNz3eDJ1CY_rr5Sj7DZ2YNomYCZbz0,5366
 biblicus/hook_manager.py,sha256=ZCAkE5wLvn4lnQz8jho_o0HGEC9KdQd9qitkAEUQRcw,6997
@@ -21,30 +21,31 @@ biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
 biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
 biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
 biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
-biblicus/user_config.py,sha256=okK57CRmT0W_yrc45tMPRl_abT7-D96IOrCBZtKtumM,6507
-biblicus/_vendor/dotyaml/__init__.py,sha256=e4zbejeJRwlD4I0q3YvotMypO19lXqmT8iyU1q6SvhY,376
-biblicus/_vendor/dotyaml/interpolation.py,sha256=PfUAEEOTFobv7Ox0E6nAxht6BqhHIDe4hP32fZn5TOs,1992
-biblicus/_vendor/dotyaml/loader.py,sha256=KePkjyhKZSvQZphmlmlzTYZJBQsqL5qhtGV1y7G6wzM,5624
-biblicus/_vendor/dotyaml/transformer.py,sha256=2AKPS8DMOPuYtzmM-dlwIqVbARfbBH5jYV1m5qpR49E,3725
-biblicus/analysis/__init__.py,sha256=TrKsE2GmdZDr3OARo2poa9H0powo0bjiEEWVx0tZmEg,1192
+biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
+biblicus/_vendor/dotyaml/__init__.py,sha256=WAWdbFNFqO5cJPthxA8Kx-L76Bh07sKMosUxC_3o9qA,375
+biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
+biblicus/_vendor/dotyaml/loader.py,sha256=zy_zinR5fiatmRyZSiELHv1vVz1Y2eRSboSf_x3kfi4,5623
+biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
+biblicus/analysis/__init__.py,sha256=Z4Wb4d-EoUuGHkcfRm9ILuZ8vr9FBqRxC0u1i6Fp_0w,1288
 biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
 biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
-biblicus/analysis/models.py,sha256=4N8abx2kSMYYfckbq_QHl5YUnups3FFx5atepYR9cu4,19705
+biblicus/analysis/models.py,sha256=LuR52w27JRzV-Mr-WAOduZrBOCTrp5uYkMc46QHTRrI,27300
+biblicus/analysis/profiling.py,sha256=z4w14LVJrTEXcQ3PBNwwb_61KuuwQgXw4-EiAaxOQ4Y,10672
 biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
-biblicus/analysis/topic_modeling.py,sha256=9jSZrlpPK44H4UMfig7YNs3pPc0pNAqu-i4OlXzHET8,19454
+biblicus/analysis/topic_modeling.py,sha256=ZGXvm2MyU6plxz2FE1RQU-3bra6QZ-t8EJj8kG1TW0M,19438
 biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
-biblicus/backends/sqlite_full_text_search.py,sha256=KgmwOiKvkA0pv7vD0V7bcOdDx_nZIOfuIN6Z4Ij7I68,16516
+biblicus/backends/sqlite_full_text_search.py,sha256=XFuIbEHYWMD9JkjgRZcgYH3kP3b4hRnJ3PwP8rSFjUU,16502
 biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
 biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
 biblicus/extractors/deepgram_stt.py,sha256=VI71i4lbE-EFHcvpNcCPRpT8z7A5IuaSrT1UaPyZ8UY,6323
 biblicus/extractors/docling_granite_text.py,sha256=aFNx-HubvaMmVJHbNqk3CR_ilSwN96-phkaENT6E2B0,6879
 biblicus/extractors/docling_smol_text.py,sha256=cSbQcT4O47MMcM6_pmQCvqgC5ferLvaxJnm3v9EQd0A,6811
-biblicus/extractors/markitdown_text.py,sha256=-7N8ebi3pYfNPnplccyy3qvsKi6uImC1xyo_dSDiD10,4546
+biblicus/extractors/markitdown_text.py,sha256=ZvN2TFh65icTTdzCe7L-ZB8zTPP2mxQ4MhOOqSc81Z0,4547
 biblicus/extractors/metadata_text.py,sha256=7FbEPp0K1mXc7FH1_c0KhPhPexF9U6eLd3TVY1vTp1s,3537
 biblicus/extractors/openai_stt.py,sha256=fggErIu6YN6tXbleNTuROhfYi7zDgMd2vD_ecXZ7eXs,7162
-biblicus/extractors/paddleocr_vl_text.py,sha256=augbxZ-kx22yHvFR1b6CUAS2I6ktXFsJx8nLWRfvdOA,11722
+biblicus/extractors/paddleocr_vl_text.py,sha256=59csxihkqK0lELpAtK2YLcfbSUvNGiuOw7CwPa_0l_c,11692
 biblicus/extractors/pass_through_text.py,sha256=DNxkCwpH2bbXjPGPEQwsx8kfqXi6rIxXNY_n3TU2-WI,2777
 biblicus/extractors/pdf_text.py,sha256=YtUphgLVxyWJXew6ZsJ8wBRh67Y5ri4ZTRlMmq3g1Bk,3255
 biblicus/extractors/pipeline.py,sha256=LY6eM3ypw50MDB2cPEQqZrjxkhVvIc6sv4UEhHdNDrE,3208
@@ -54,9 +55,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.9.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
-biblicus-0.9.0.dist-info/METADATA,sha256=7NBBKWloUkQ2mx_CuPqAQzQJWHEwM7aJT7XQHGL2VwU,27325
-biblicus-0.9.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-biblicus-0.9.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
-biblicus-0.9.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
-biblicus-0.9.0.dist-info/RECORD,,
+biblicus-0.10.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.10.0.dist-info/METADATA,sha256=xZ7scJLdlKHRtm0EU5Ravq5ih2mS2KNfMbbLXNqZ8Ek,27455
+biblicus-0.10.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.10.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.10.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.10.0.dist-info/RECORD,,