biblicus 0.8.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/analysis/models.py +42 -0
- biblicus/analysis/topic_modeling.py +26 -2
- {biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/METADATA +12 -6
- {biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/RECORD +9 -9
- {biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/WHEEL +0 -0
- {biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/top_level.txt +0 -0
biblicus/__init__.py
CHANGED
biblicus/analysis/models.py
CHANGED
@@ -171,15 +171,54 @@ class TopicModelingLexicalProcessingConfig(AnalysisSchemaModel):
     collapse_whitespace: bool = Field(default=True)
 
 
+class TopicModelingVectorizerConfig(AnalysisSchemaModel):
+    """
+    Vectorizer configuration for BERTopic tokenization.
+
+    :ivar ngram_range: Inclusive n-gram range as a two-item list.
+    :vartype ngram_range: list[int]
+    :ivar stop_words: Stop word configuration for tokenization.
+    :vartype stop_words: str or list[str] or None
+    """
+
+    ngram_range: List[int] = Field(default_factory=lambda: [1, 1], min_length=2, max_length=2)
+    stop_words: Optional[object] = None
+
+    @model_validator(mode="after")
+    def _validate_ngram_range(self) -> "TopicModelingVectorizerConfig":
+        start, end = self.ngram_range
+        if start < 1 or end < start:
+            raise ValueError("vectorizer.ngram_range must include two integers with start >= 1 and end >= start")
+        return self
+
+    @field_validator("stop_words", mode="before")
+    @classmethod
+    def _validate_stop_words(cls, value: object) -> object:
+        if value is None:
+            return None
+        if isinstance(value, str):
+            if value != "english":
+                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+            return value
+        if isinstance(value, list):
+            if not all(isinstance(entry, str) and entry for entry in value):
+                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+            return value
+        raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+
+
 class TopicModelingBerTopicConfig(AnalysisSchemaModel):
     """
     Configuration for BERTopic analysis.
 
     :ivar parameters: Parameters forwarded to the BERTopic constructor.
     :vartype parameters: dict[str, Any]
+    :ivar vectorizer: Vectorizer configuration for tokenization.
+    :vartype vectorizer: TopicModelingVectorizerConfig or None
     """
 
     parameters: Dict[str, Any] = Field(default_factory=dict)
+    vectorizer: Optional[TopicModelingVectorizerConfig] = None
 
 
 class TopicModelingLlmFineTuningConfig(AnalysisSchemaModel):
@@ -371,6 +410,8 @@ class TopicModelingBerTopicReport(AnalysisSchemaModel):
     :vartype document_count: int
     :ivar parameters: BERTopic configuration parameters.
     :vartype parameters: dict[str, Any]
+    :ivar vectorizer: Vectorizer configuration applied to BERTopic.
+    :vartype vectorizer: TopicModelingVectorizerConfig or None
     :ivar warnings: Warning messages.
     :vartype warnings: list[str]
     :ivar errors: Error messages.
@@ -381,6 +422,7 @@ class TopicModelingBerTopicReport(AnalysisSchemaModel):
     topic_count: int = Field(ge=0)
     document_count: int = Field(ge=0)
     parameters: Dict[str, Any] = Field(default_factory=dict)
+    vectorizer: Optional[TopicModelingVectorizerConfig] = None
     warnings: List[str] = Field(default_factory=list)
     errors: List[str] = Field(default_factory=list)
 
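The classes added above are ordinary Pydantic models, so the new validation rules can be exercised directly. A minimal sketch, assuming Pydantic v2 (implied by the `model_validator`/`field_validator` usage) and an installed `biblicus`; the values are illustrative and not taken from the package's tests:

```
# Illustrative sketch (not from the package): exercising the validators added in models.py.
from pydantic import ValidationError

from biblicus.analysis.models import TopicModelingVectorizerConfig

# Defaults: unigrams only, no stop-word filtering.
config = TopicModelingVectorizerConfig()
print(config.ngram_range, config.stop_words)  # [1, 1] None

# Accepted: an inclusive bigram range with the built-in English stop word list.
config = TopicModelingVectorizerConfig(ngram_range=[1, 2], stop_words="english")

# Rejected: a reversed range and an unsupported stop-word language both raise.
for bad in ({"ngram_range": [2, 1]}, {"stop_words": "german"}):
    try:
        TopicModelingVectorizerConfig.model_validate(bad)
    except ValidationError as error:
        print(error)
```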
biblicus/analysis/topic_modeling.py
CHANGED
@@ -429,14 +429,37 @@ def _run_bertopic(
     config: TopicModelingBerTopicConfig,
 ) -> Tuple[TopicModelingBerTopicReport, List[TopicModelingTopic]]:
     try:
-
+        import importlib
+
+        bertopic_module = importlib.import_module("bertopic")
+        if not hasattr(bertopic_module, "BERTopic"):
+            raise ImportError("BERTopic class is unavailable")
+        BERTopic = bertopic_module.BERTopic
     except ImportError as import_error:
         raise ValueError(
             "BERTopic analysis requires an optional dependency. "
             'Install it with pip install "biblicus[topic-modeling]".'
         ) from import_error
 
-
+    bertopic_kwargs = dict(config.parameters)
+    is_fake = bool(getattr(bertopic_module, "__biblicus_fake__", False))
+    if config.vectorizer is not None and "vectorizer_model" not in bertopic_kwargs:
+        if is_fake:
+            bertopic_kwargs["vectorizer_model"] = None
+        else:
+            try:
+                from sklearn.feature_extraction.text import CountVectorizer
+            except ImportError as import_error:
+                raise ValueError(
+                    "Vectorizer configuration requires scikit-learn. "
+                    "Install with pip install \"biblicus[topic-modeling]\"."
+                ) from import_error
+            bertopic_kwargs["vectorizer_model"] = CountVectorizer(
+                ngram_range=tuple(config.vectorizer.ngram_range),
+                stop_words=config.vectorizer.stop_words,
+            )
+
+    topic_model = BERTopic(**bertopic_kwargs)
     texts = [document.text for document in documents]
     assignments, _ = topic_model.fit_transform(texts)
     assignment_list = list(assignments)
@@ -465,6 +488,7 @@ def _run_bertopic(
         topic_count=len(topics),
         document_count=len(documents),
         parameters=dict(config.parameters),
+        vectorizer=config.vectorizer,
         warnings=[],
         errors=[],
     )
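When a recipe sets `vectorizer` and no `vectorizer_model` is already present in `parameters`, the new branch above builds a scikit-learn `CountVectorizer` and forwards it to BERTopic. A rough sketch of that handoff in isolation, assuming scikit-learn is installed (pulled in via `biblicus[topic-modeling]`); the sample sentence is illustrative:

```
# Illustrative only: the CountVectorizer that _run_bertopic builds from
# a vectorizer config with ngram_range [1, 2] and stop_words "english".
from sklearn.feature_extraction.text import CountVectorizer

# The two-item list from the config becomes the (min_n, max_n) tuple
# that CountVectorizer expects.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

analyze = vectorizer_model.build_analyzer()
print(analyze("The quick brown fox jumps over the lazy dog"))
# Unigrams and bigrams with English stop words removed, e.g. 'quick brown'.

# BERTopic then receives it as a keyword argument, roughly:
#   BERTopic(vectorizer_model=vectorizer_model, **config.parameters)
```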
{biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.8.0
+Version: 0.9.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -40,6 +40,8 @@ Provides-Extra: docling-mlx
 Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
 Provides-Extra: topic-modeling
 Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
+Provides-Extra: datasets
+Requires-Dist: datasets>=2.18.0; extra == "datasets"
 Dynamic: license-file
 
 # Biblicus
@@ -534,6 +536,8 @@ analysis backend. It reads an extraction run, optionally applies an LLM-driven e
 processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
 JavaScript Object Notation.
 
+See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
+
 Run a topic analysis using a recipe file:
 
 ```
@@ -564,26 +568,28 @@ bertopic_analysis:
   parameters:
     min_topic_size: 8
     nr_topics: 10
+  vectorizer:
+    ngram_range: [1, 2]
+    stop_words: english
 llm_fine_tuning:
   enabled: false
 ```
 
 LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
 Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
+AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
 
-For a repeatable, real-world integration run that downloads
+For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
 
 ```
-python3 scripts/topic_modeling_integration.py --corpus corpora/
+python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
 ```
 
 See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
 
 ## Integration corpus and evaluation dataset
 
-Use `scripts/
-
-The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
+Use `scripts/download_ag_news.py` to download the AG News dataset when running topic modeling demos. The repository does not include that content.
 
 Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
 
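The recipe fragment documented above maps directly onto the new models in `biblicus/analysis/models.py`. A small sketch of that mapping, assuming PyYAML is available for parsing; the package's own recipe loader is not shown in this diff, so this is not necessarily how biblicus reads recipe files:

```
# Illustrative only: validating the README's bertopic_analysis block
# against the new configuration models.
import yaml  # assumes PyYAML is installed; not necessarily a biblicus dependency

from biblicus.analysis.models import TopicModelingBerTopicConfig

recipe_fragment = """
bertopic_analysis:
  parameters:
    min_topic_size: 8
    nr_topics: 10
  vectorizer:
    ngram_range: [1, 2]
    stop_words: english
"""

data = yaml.safe_load(recipe_fragment)
config = TopicModelingBerTopicConfig.model_validate(data["bertopic_analysis"])
print(config.vectorizer.ngram_range, config.vectorizer.stop_words)  # [1, 2] english
```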
{biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-biblicus/__init__.py,sha256=
+biblicus/__init__.py,sha256=x14R9a_6nu3qTg2F-sUOaS_ZepXNBPpa3nsEgp4PZhg,495
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
 biblicus/cli.py,sha256=GVmZlCSZPUMBbq69yjN16f4xNw71edlFbGPHX3300oI,32643
 biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
@@ -29,9 +29,9 @@ biblicus/_vendor/dotyaml/transformer.py,sha256=2AKPS8DMOPuYtzmM-dlwIqVbARfbBH5jY
 biblicus/analysis/__init__.py,sha256=TrKsE2GmdZDr3OARo2poa9H0powo0bjiEEWVx0tZmEg,1192
 biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
 biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
-biblicus/analysis/models.py,sha256=
+biblicus/analysis/models.py,sha256=4N8abx2kSMYYfckbq_QHl5YUnups3FFx5atepYR9cu4,19705
 biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
-biblicus/analysis/topic_modeling.py,sha256=
+biblicus/analysis/topic_modeling.py,sha256=9jSZrlpPK44H4UMfig7YNs3pPc0pNAqu-i4OlXzHET8,19454
 biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
@@ -54,9 +54,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
-biblicus-0.
+biblicus-0.9.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.9.0.dist-info/METADATA,sha256=7NBBKWloUkQ2mx_CuPqAQzQJWHEwM7aJT7XQHGL2VwU,27325
+biblicus-0.9.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.9.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.9.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.9.0.dist-info/RECORD,,
{biblicus-0.8.0.dist-info → biblicus-0.9.0.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt
Files without changes