biblicus 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
     "RetrievalRun",
 ]
 
-__version__ = "0.8.0"
+__version__ = "0.9.0"
biblicus/analysis/models.py CHANGED
@@ -171,15 +171,54 @@ class TopicModelingLexicalProcessingConfig(AnalysisSchemaModel):
     collapse_whitespace: bool = Field(default=True)
 
 
+class TopicModelingVectorizerConfig(AnalysisSchemaModel):
+    """
+    Vectorizer configuration for BERTopic tokenization.
+
+    :ivar ngram_range: Inclusive n-gram range as a two-item list.
+    :vartype ngram_range: list[int]
+    :ivar stop_words: Stop word configuration for tokenization.
+    :vartype stop_words: str or list[str] or None
+    """
+
+    ngram_range: List[int] = Field(default_factory=lambda: [1, 1], min_length=2, max_length=2)
+    stop_words: Optional[object] = None
+
+    @model_validator(mode="after")
+    def _validate_ngram_range(self) -> "TopicModelingVectorizerConfig":
+        start, end = self.ngram_range
+        if start < 1 or end < start:
+            raise ValueError("vectorizer.ngram_range must include two integers with start >= 1 and end >= start")
+        return self
+
+    @field_validator("stop_words", mode="before")
+    @classmethod
+    def _validate_stop_words(cls, value: object) -> object:
+        if value is None:
+            return None
+        if isinstance(value, str):
+            if value != "english":
+                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+            return value
+        if isinstance(value, list):
+            if not all(isinstance(entry, str) and entry for entry in value):
+                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+            return value
+        raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+
+
 class TopicModelingBerTopicConfig(AnalysisSchemaModel):
     """
     Configuration for BERTopic analysis.
 
     :ivar parameters: Parameters forwarded to the BERTopic constructor.
     :vartype parameters: dict[str, Any]
+    :ivar vectorizer: Vectorizer configuration for tokenization.
+    :vartype vectorizer: TopicModelingVectorizerConfig or None
     """
 
     parameters: Dict[str, Any] = Field(default_factory=dict)
+    vectorizer: Optional[TopicModelingVectorizerConfig] = None
 
 
 class TopicModelingLlmFineTuningConfig(AnalysisSchemaModel):
@@ -371,6 +410,8 @@ class TopicModelingBerTopicReport(AnalysisSchemaModel):
     :vartype document_count: int
     :ivar parameters: BERTopic configuration parameters.
     :vartype parameters: dict[str, Any]
+    :ivar vectorizer: Vectorizer configuration applied to BERTopic.
+    :vartype vectorizer: TopicModelingVectorizerConfig or None
     :ivar warnings: Warning messages.
     :vartype warnings: list[str]
     :ivar errors: Error messages.
@@ -381,6 +422,7 @@ class TopicModelingBerTopicReport(AnalysisSchemaModel):
     topic_count: int = Field(ge=0)
     document_count: int = Field(ge=0)
     parameters: Dict[str, Any] = Field(default_factory=dict)
+    vectorizer: Optional[TopicModelingVectorizerConfig] = None
     warnings: List[str] = Field(default_factory=list)
     errors: List[str] = Field(default_factory=list)
 
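Taken together, the two hunks above add a validated `vectorizer` block to the BERTopic config and echo it on the report. A minimal sketch of the validation behavior, assuming the models are importable from `biblicus.analysis.models` (the module whose hash changes in the RECORD below) and that `AnalysisSchemaModel` is a pydantic v2 base, as the `model_validator`/`field_validator` usage implies:

```python
from pydantic import ValidationError

# Import path inferred from the RECORD entry for models.py below.
from biblicus.analysis.models import TopicModelingVectorizerConfig

# Well-formed: inclusive 1-2 grams with English stop words removed.
config = TopicModelingVectorizerConfig(ngram_range=[1, 2], stop_words="english")

# end < start trips the model_validator and surfaces as a ValidationError.
try:
    TopicModelingVectorizerConfig(ngram_range=[2, 1])
except ValidationError as error:
    print(error)

# Any string other than "english" (and any non-string list entry) is
# rejected by the field_validator on stop_words.
try:
    TopicModelingVectorizerConfig(stop_words="german")
except ValidationError as error:
    print(error)
```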
biblicus/analysis/topic_modeling.py CHANGED
@@ -429,14 +429,37 @@ def _run_bertopic(
     config: TopicModelingBerTopicConfig,
 ) -> Tuple[TopicModelingBerTopicReport, List[TopicModelingTopic]]:
     try:
-        from bertopic import BERTopic
+        import importlib
+
+        bertopic_module = importlib.import_module("bertopic")
+        if not hasattr(bertopic_module, "BERTopic"):
+            raise ImportError("BERTopic class is unavailable")
+        BERTopic = bertopic_module.BERTopic
     except ImportError as import_error:
         raise ValueError(
             "BERTopic analysis requires an optional dependency. "
             'Install it with pip install "biblicus[topic-modeling]".'
         ) from import_error
 
-    topic_model = BERTopic(**config.parameters)
+    bertopic_kwargs = dict(config.parameters)
+    is_fake = bool(getattr(bertopic_module, "__biblicus_fake__", False))
+    if config.vectorizer is not None and "vectorizer_model" not in bertopic_kwargs:
+        if is_fake:
+            bertopic_kwargs["vectorizer_model"] = None
+        else:
+            try:
+                from sklearn.feature_extraction.text import CountVectorizer
+            except ImportError as import_error:
+                raise ValueError(
+                    "Vectorizer configuration requires scikit-learn. "
+                    "Install with pip install \"biblicus[topic-modeling]\"."
+                ) from import_error
+            bertopic_kwargs["vectorizer_model"] = CountVectorizer(
+                ngram_range=tuple(config.vectorizer.ngram_range),
+                stop_words=config.vectorizer.stop_words,
+            )
+
+    topic_model = BERTopic(**bertopic_kwargs)
     texts = [document.text for document in documents]
     assignments, _ = topic_model.fit_transform(texts)
     assignment_list = list(assignments)
@@ -465,6 +488,7 @@ def _run_bertopic(
         topic_count=len(topics),
         document_count=len(documents),
         parameters=dict(config.parameters),
+        vectorizer=config.vectorizer,
         warnings=[],
         errors=[],
     )
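When a `vectorizer` block is configured and the caller has not already supplied a `vectorizer_model` in `parameters`, the hunk above builds a scikit-learn `CountVectorizer` from it; an explicit `parameters` entry still wins, because the key is only filled when absent. A standalone sketch of that mapping, using the real scikit-learn API with illustrative values:

```python
from sklearn.feature_extraction.text import CountVectorizer

# Values as they would arrive on TopicModelingVectorizerConfig.
ngram_range = [1, 2]
stop_words = "english"

# scikit-learn expects ngram_range as a tuple, hence the tuple(...) call above.
vectorizer_model = CountVectorizer(
    ngram_range=tuple(ngram_range),
    stop_words=stop_words,
)
print(vectorizer_model.get_params()["ngram_range"])  # (1, 2)
```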
biblicus-0.8.0.dist-info/METADATA → biblicus-0.9.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.8.0
+Version: 0.9.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -40,6 +40,8 @@ Provides-Extra: docling-mlx
 Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
 Provides-Extra: topic-modeling
 Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
+Provides-Extra: datasets
+Requires-Dist: datasets>=2.18.0; extra == "datasets"
 Dynamic: license-file
 
 # Biblicus
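The new `datasets` extra pairs with the existing `topic-modeling` extra for the AG News demo described further down; since combining extras is standard pip syntax, the joint install would presumably be `pip install "biblicus[topic-modeling,datasets]"`.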
@@ -534,6 +536,8 @@ analysis backend. It reads an extraction run, optionally applies an LLM-driven e
 processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
 JavaScript Object Notation.
 
+See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
+
 Run a topic analysis using a recipe file:
 
 ```
@@ -564,26 +568,28 @@ bertopic_analysis:
   parameters:
     min_topic_size: 8
     nr_topics: 10
+  vectorizer:
+    ngram_range: [1, 2]
+    stop_words: english
 llm_fine_tuning:
   enabled: false
 ```
 
 LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
 Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
+AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
 
-For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
+For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
 
 ```
-python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
+python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
 ```
 
 See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
 
 ## Integration corpus and evaluation dataset
 
-Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
-
-The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
+Use `scripts/download_ag_news.py` to download the AG News dataset when running topic modeling demos. The repository does not include that content.
 
 Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
 
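The recipe's `vectorizer` block maps one-to-one onto the `TopicModelingVectorizerConfig` fields from the models hunk above. A minimal round-trip sketch, assuming PyYAML is available and using the same inferred import path as earlier:

```python
import yaml  # PyYAML, assumed available

# Import path inferred from the RECORD entries, as above.
from biblicus.analysis.models import TopicModelingVectorizerConfig

# The vectorizer fragment of the recipe shown in the README diff.
recipe_fragment = yaml.safe_load(
    "vectorizer:\n"
    "  ngram_range: [1, 2]\n"
    "  stop_words: english\n"
)
config = TopicModelingVectorizerConfig(**recipe_fragment["vectorizer"])
print(config.ngram_range, config.stop_words)  # [1, 2] english
```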
biblicus-0.8.0.dist-info/RECORD → biblicus-0.9.0.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-biblicus/__init__.py,sha256=XhgZfXIpkQ5_SzHj-2Vqt_N3hvx6TSOv6KMdac6HfaI,495
+biblicus/__init__.py,sha256=x14R9a_6nu3qTg2F-sUOaS_ZepXNBPpa3nsEgp4PZhg,495
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
 biblicus/cli.py,sha256=GVmZlCSZPUMBbq69yjN16f4xNw71edlFbGPHX3300oI,32643
 biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
@@ -29,9 +29,9 @@ biblicus/_vendor/dotyaml/transformer.py,sha256=2AKPS8DMOPuYtzmM-dlwIqVbARfbBH5jY
 biblicus/analysis/__init__.py,sha256=TrKsE2GmdZDr3OARo2poa9H0powo0bjiEEWVx0tZmEg,1192
 biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
 biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
-biblicus/analysis/models.py,sha256=XocDiEVF7ud53hd9eCFTuMXS68U-eBthpe7a6J9j6uU,17824
+biblicus/analysis/models.py,sha256=4N8abx2kSMYYfckbq_QHl5YUnups3FFx5atepYR9cu4,19705
 biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
-biblicus/analysis/topic_modeling.py,sha256=Y_9Auh47_wRD4LXVZ_c-S7AYeO72wLu39CHHa_ZLunI,18352
+biblicus/analysis/topic_modeling.py,sha256=9jSZrlpPK44H4UMfig7YNs3pPc0pNAqu-i4OlXzHET8,19454
 biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
@@ -54,9 +54,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.8.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
-biblicus-0.8.0.dist-info/METADATA,sha256=I4zW3JWMOmyh4tBpR-D2MGAl9YCp9IqtFo8wxoNA1qQ,27116
-biblicus-0.8.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-biblicus-0.8.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
-biblicus-0.8.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
-biblicus-0.8.0.dist-info/RECORD,,
+biblicus-0.9.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.9.0.dist-info/METADATA,sha256=7NBBKWloUkQ2mx_CuPqAQzQJWHEwM7aJT7XQHGL2VwU,27325
+biblicus-0.9.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.9.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.9.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.9.0.dist-info/RECORD,,