biblicus 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biblicus/__init__.py CHANGED
@@ -27,4 +27,4 @@ __all__ = [
     "RetrievalRun",
 ]
 
-__version__ = "0.8.0"
+__version__ = "0.9.0"
biblicus/analysis/models.py CHANGED
@@ -171,15 +171,54 @@ class TopicModelingLexicalProcessingConfig(AnalysisSchemaModel):
     collapse_whitespace: bool = Field(default=True)
 
 
+class TopicModelingVectorizerConfig(AnalysisSchemaModel):
+    """
+    Vectorizer configuration for BERTopic tokenization.
+
+    :ivar ngram_range: Inclusive n-gram range as a two-item list.
+    :vartype ngram_range: list[int]
+    :ivar stop_words: Stop word configuration for tokenization.
+    :vartype stop_words: str or list[str] or None
+    """
+
+    ngram_range: List[int] = Field(default_factory=lambda: [1, 1], min_length=2, max_length=2)
+    stop_words: Optional[object] = None
+
+    @model_validator(mode="after")
+    def _validate_ngram_range(self) -> "TopicModelingVectorizerConfig":
+        start, end = self.ngram_range
+        if start < 1 or end < start:
+            raise ValueError("vectorizer.ngram_range must include two integers with start >= 1 and end >= start")
+        return self
+
+    @field_validator("stop_words", mode="before")
+    @classmethod
+    def _validate_stop_words(cls, value: object) -> object:
+        if value is None:
+            return None
+        if isinstance(value, str):
+            if value != "english":
+                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+            return value
+        if isinstance(value, list):
+            if not all(isinstance(entry, str) and entry for entry in value):
+                raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+            return value
+        raise ValueError("vectorizer.stop_words must be 'english' or a list of strings")
+
+
 class TopicModelingBerTopicConfig(AnalysisSchemaModel):
     """
     Configuration for BERTopic analysis.
 
     :ivar parameters: Parameters forwarded to the BERTopic constructor.
     :vartype parameters: dict[str, Any]
+    :ivar vectorizer: Vectorizer configuration for tokenization.
+    :vartype vectorizer: TopicModelingVectorizerConfig or None
     """
 
     parameters: Dict[str, Any] = Field(default_factory=dict)
+    vectorizer: Optional[TopicModelingVectorizerConfig] = None
 
 
 class TopicModelingLlmFineTuningConfig(AnalysisSchemaModel):
@@ -371,6 +410,8 @@ class TopicModelingBerTopicReport(AnalysisSchemaModel):
     :vartype document_count: int
     :ivar parameters: BERTopic configuration parameters.
     :vartype parameters: dict[str, Any]
+    :ivar vectorizer: Vectorizer configuration applied to BERTopic.
+    :vartype vectorizer: TopicModelingVectorizerConfig or None
     :ivar warnings: Warning messages.
     :vartype warnings: list[str]
     :ivar errors: Error messages.
@@ -381,6 +422,7 @@ class TopicModelingBerTopicReport(AnalysisSchemaModel):
     topic_count: int = Field(ge=0)
     document_count: int = Field(ge=0)
     parameters: Dict[str, Any] = Field(default_factory=dict)
+    vectorizer: Optional[TopicModelingVectorizerConfig] = None
     warnings: List[str] = Field(default_factory=list)
     errors: List[str] = Field(default_factory=list)
 
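Taken together, the two hunks above add a validated `vectorizer` block to the BERTopic config and echo it on the report. A minimal sketch of the validation behavior, assuming the models are importable from `biblicus.analysis.models` (the module whose hash changes in the RECORD below) and that `AnalysisSchemaModel` is a pydantic v2 base, as the `model_validator`/`field_validator` usage implies:

```python
from pydantic import ValidationError

# Import path inferred from the RECORD entry for models.py below.
from biblicus.analysis.models import TopicModelingVectorizerConfig

# Well-formed: inclusive 1-2 grams with English stop words removed.
config = TopicModelingVectorizerConfig(ngram_range=[1, 2], stop_words="english")

# end < start trips the model_validator and surfaces as a ValidationError.
try:
    TopicModelingVectorizerConfig(ngram_range=[2, 1])
except ValidationError as error:
    print(error)

# Any string other than "english" (and any non-string list entry) is
# rejected by the field_validator on stop_words.
try:
    TopicModelingVectorizerConfig(stop_words="german")
except ValidationError as error:
    print(error)
```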
biblicus/analysis/topic_modeling.py CHANGED
@@ -429,14 +429,37 @@ def _run_bertopic(
     config: TopicModelingBerTopicConfig,
 ) -> Tuple[TopicModelingBerTopicReport, List[TopicModelingTopic]]:
     try:
-        from bertopic import BERTopic
+        import importlib
+
+        bertopic_module = importlib.import_module("bertopic")
+        if not hasattr(bertopic_module, "BERTopic"):
+            raise ImportError("BERTopic class is unavailable")
+        BERTopic = bertopic_module.BERTopic
     except ImportError as import_error:
         raise ValueError(
             "BERTopic analysis requires an optional dependency. "
             'Install it with pip install "biblicus[topic-modeling]".'
         ) from import_error
 
-    topic_model = BERTopic(**config.parameters)
+    bertopic_kwargs = dict(config.parameters)
+    is_fake = bool(getattr(bertopic_module, "__biblicus_fake__", False))
+    if config.vectorizer is not None and "vectorizer_model" not in bertopic_kwargs:
+        if is_fake:
+            bertopic_kwargs["vectorizer_model"] = None
+        else:
+            try:
+                from sklearn.feature_extraction.text import CountVectorizer
+            except ImportError as import_error:
+                raise ValueError(
+                    "Vectorizer configuration requires scikit-learn. "
+                    "Install with pip install \"biblicus[topic-modeling]\"."
+                ) from import_error
+            bertopic_kwargs["vectorizer_model"] = CountVectorizer(
+                ngram_range=tuple(config.vectorizer.ngram_range),
+                stop_words=config.vectorizer.stop_words,
+            )
+
+    topic_model = BERTopic(**bertopic_kwargs)
     texts = [document.text for document in documents]
     assignments, _ = topic_model.fit_transform(texts)
     assignment_list = list(assignments)
@@ -465,6 +488,7 @@ def _run_bertopic(
         topic_count=len(topics),
         document_count=len(documents),
         parameters=dict(config.parameters),
+        vectorizer=config.vectorizer,
         warnings=[],
         errors=[],
     )
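When a `vectorizer` block is configured and the caller has not already supplied a `vectorizer_model` in `parameters`, the hunk above builds a scikit-learn `CountVectorizer` from it; an explicit `parameters` entry still wins, because the key is only filled when absent. A standalone sketch of that mapping, using the real scikit-learn API with illustrative values:

```python
from sklearn.feature_extraction.text import CountVectorizer

# Values as they would arrive on TopicModelingVectorizerConfig.
ngram_range = [1, 2]
stop_words = "english"

# scikit-learn expects ngram_range as a tuple, hence the tuple(...) call above.
vectorizer_model = CountVectorizer(
    ngram_range=tuple(ngram_range),
    stop_words=stop_words,
)
print(vectorizer_model.get_params()["ngram_range"])  # (1, 2)
```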
biblicus-0.8.0.dist-info/METADATA → biblicus-0.9.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.8.0
+Version: 0.9.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -40,6 +40,8 @@ Provides-Extra: docling-mlx
 Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
 Provides-Extra: topic-modeling
 Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
+Provides-Extra: datasets
+Requires-Dist: datasets>=2.18.0; extra == "datasets"
 Dynamic: license-file
 
 # Biblicus
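The new `datasets` extra pairs with the existing `topic-modeling` extra for the AG News demo described further down; since combining extras is standard pip syntax, the joint install would presumably be `pip install "biblicus[topic-modeling,datasets]"`.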
@@ -534,6 +536,8 @@ analysis backend. It reads an extraction run, optionally applies an LLM-driven e
 processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
 JavaScript Object Notation.
 
+See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
+
 Run a topic analysis using a recipe file:
 
 ```
@@ -564,26 +568,28 @@ bertopic_analysis:
   parameters:
     min_topic_size: 8
     nr_topics: 10
+  vectorizer:
+    ngram_range: [1, 2]
+    stop_words: english
 llm_fine_tuning:
   enabled: false
 ```
 
 LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
 Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
+AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
 
-For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
+For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
 
 ```
-python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
+python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
 ```
 
 See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
 
 ## Integration corpus and evaluation dataset
 
-Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
-
-The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
+Use `scripts/download_ag_news.py` to download the AG News dataset when running topic modeling demos. The repository does not include that content.
 
 Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
 
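The recipe's `vectorizer` block maps one-to-one onto the `TopicModelingVectorizerConfig` fields from the models hunk above. A minimal round-trip sketch, assuming PyYAML is available and using the same inferred import path as earlier:

```python
import yaml  # PyYAML, assumed available

# Import path inferred from the RECORD entries, as above.
from biblicus.analysis.models import TopicModelingVectorizerConfig

# The vectorizer fragment of the recipe shown in the README diff.
recipe_fragment = yaml.safe_load(
    "vectorizer:\n"
    "  ngram_range: [1, 2]\n"
    "  stop_words: english\n"
)
config = TopicModelingVectorizerConfig(**recipe_fragment["vectorizer"])
print(config.ngram_range, config.stop_words)  # [1, 2] english
```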
biblicus-0.8.0.dist-info/RECORD → biblicus-0.9.0.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-biblicus/__init__.py,sha256=XhgZfXIpkQ5_SzHj-2Vqt_N3hvx6TSOv6KMdac6HfaI,495
+biblicus/__init__.py,sha256=x14R9a_6nu3qTg2F-sUOaS_ZepXNBPpa3nsEgp4PZhg,495
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
 biblicus/cli.py,sha256=GVmZlCSZPUMBbq69yjN16f4xNw71edlFbGPHX3300oI,32643
 biblicus/constants.py,sha256=-JaHI3Dngte2drawx93cGWxFVobbgIuaVhmjUJpf4GI,333
@@ -29,9 +29,9 @@ biblicus/_vendor/dotyaml/transformer.py,sha256=2AKPS8DMOPuYtzmM-dlwIqVbARfbBH5jY
 biblicus/analysis/__init__.py,sha256=TrKsE2GmdZDr3OARo2poa9H0powo0bjiEEWVx0tZmEg,1192
 biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
 biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
-biblicus/analysis/models.py,sha256=XocDiEVF7ud53hd9eCFTuMXS68U-eBthpe7a6J9j6uU,17824
+biblicus/analysis/models.py,sha256=4N8abx2kSMYYfckbq_QHl5YUnups3FFx5atepYR9cu4,19705
 biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
-biblicus/analysis/topic_modeling.py,sha256=Y_9Auh47_wRD4LXVZ_c-S7AYeO72wLu39CHHa_ZLunI,18352
+biblicus/analysis/topic_modeling.py,sha256=9jSZrlpPK44H4UMfig7YNs3pPc0pNAqu-i4OlXzHET8,19454
 biblicus/backends/__init__.py,sha256=wLXIumV51l6ZIKzjoKKeU7AgIxGOryG7T7ls3a_Fv98,1212
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
@@ -54,9 +54,9 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus-0.8.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
-biblicus-0.8.0.dist-info/METADATA,sha256=I4zW3JWMOmyh4tBpR-D2MGAl9YCp9IqtFo8wxoNA1qQ,27116
-biblicus-0.8.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-biblicus-0.8.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
-biblicus-0.8.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
-biblicus-0.8.0.dist-info/RECORD,,
+biblicus-0.9.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.9.0.dist-info/METADATA,sha256=7NBBKWloUkQ2mx_CuPqAQzQJWHEwM7aJT7XQHGL2VwU,27325
+biblicus-0.9.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.9.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.9.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.9.0.dist-info/RECORD,,