PyPI - diversify-text - Versions diffs - 0.1.2__tar.gz → 0.2.0__tar.gz - Mend

diversify-text 0.1.2tar.gz → 0.2.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{diversify_text-0.1.2 → diversify_text-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: diversify-text
-Version: 0.1.2
+Version: 0.2.0
 Summary: Generate stylistic paraphrases of texts using local transformer models.
 Project-URL: Homepage, https://github.com/AnnaWegmann/diversify_text
 Project-URL: Documentation, https://annawegmann.github.io/diversify_text/
 Project-URL: Repository, https://github.com/AnnaWegmann/diversify_text
 Project-URL: Issues, https://github.com/AnnaWegmann/diversify_text/issues
-Author: Anna Wegmann
+Author: Anna Wegmann, Eduardo Calò, Menan Velayuthan
 License-Expression: MIT
 License-File: LICENSE
 Keywords: augmentation,nlp,paraphrase,style-transfer,text-generation
@@ -26,7 +26,7 @@ Requires-Dist: pysbd>=0.3.4
 Requires-Dist: sentence-transformers
 Requires-Dist: sentencepiece
 Requires-Dist: tiktoken
-Requires-Dist: torch
+Requires-Dist: torch>=2.10.0
 Requires-Dist: tqdm>=4.67.3
 Requires-Dist: transformers>=5.3.0
 Description-Content-Type: text/markdown
@@ -46,6 +46,7 @@ pip install diversify-text
 - [Usage](#usage)
   - [Single text](#single-text)
   - [Control number of paraphrases](#control-number-of-paraphrases)
+  - [Caching](#caching)
   - [Using the class directly](#using-the-class-directly)
   - [List of texts](#list-of-texts)
   - [Customising the TinyStyler style bank](#customising-the-tinystyler-style-bank)
@@ -84,24 +85,38 @@ results = diversify("The experiment was conducted in a controlled lab setting.")
 ### Control number of paraphrases
 ```python
-results = diversify("Some text.", n_styles=3)
+results = diversify("Some text.", n=3)
 ```
 ```
 [{"original": "Some text.", "paraphrases": ["...", "...", "..."]}]
 ```
+### Caching
+The `diversify()` function automatically caches loaded models between calls.
+The generation model and the semantic filter are cached independently, so
+toggling `semantic_filter` does not reload the generation model and vice
+versa. Call `clear_cache()` to drop cached models and allow memory to be reclaimed when possible:
+```python
+from diversify_text import clear_cache
+clear_cache()
+```
 ### Using the class directly
-Recommended when processing texts across several calls — the model is loaded once and reused across calls.
+You can also instantiate a `Diversifier` yourself for full control over the
+model lifecycle:
 ```python
 from diversify_text import Diversifier
 div = Diversifier(device="cuda", methods=["tinystyler"])
-batch_1 = div.diversify(texts_1, n_styles=5)
-batch_2 = div.diversify(texts_2, n_styles=5)
+batch_1 = div.diversify(texts_1, n=5)
+batch_2 = div.diversify(texts_2, n=5)
 ```
 ### List of texts
@@ -130,7 +145,7 @@ A style bank can be a `dict[str, list[str]]` or a `list[list[str]]`:
 ```python
 from diversify_text import diversify
-from diversify_text.method.tinystyler import DEFAULT_STYLE_BANK
+from diversify_text.styles import DEFAULT_STYLE_BANK
 custom_bank = {
     "academic": ["The results demonstrate a statistically significant effect."],
@@ -144,10 +159,10 @@ results = diversify(
 )
 ```
-`DEFAULT_STYLE_BANK` is exported from `diversify_text.method.tinystyler` so you can build on it:
+`DEFAULT_STYLE_BANK` is exported from `diversify_text.styles` so you can build on it:
 ```python
-from diversify_text.method.tinystyler import DEFAULT_STYLE_BANK
+from diversify_text.styles import DEFAULT_STYLE_BANK
 extended_bank = {
     **DEFAULT_STYLE_BANK,
@@ -175,11 +190,11 @@ from diversify_text.method import DiversificationMethod
 class MyMethod(DiversificationMethod):
     name = "my_method"
-    def generate(self, texts, *, n_styles, max_new_tokens, temperature, top_p, **kwargs):
-        return [[f"{text} :: variant {i}" for i in range(n_styles)] for text in texts]
+    def generate(self, texts, *, n, max_new_tokens, temperature, top_p, **kwargs):
+        return [[f"{text} :: variant {i}" for i in range(n)] for text in texts]
-results = Diversifier(methods=[MyMethod()]).diversify("Hello", n_styles=3)
+results = Diversifier(methods=[MyMethod()]).diversify("Hello", n=3)
 ```
 ```

{diversify_text-0.1.2 → diversify_text-0.2.0}/README.md RENAMED Viewed

@@ -13,6 +13,7 @@ pip install diversify-text
 - [Usage](#usage)
   - [Single text](#single-text)
   - [Control number of paraphrases](#control-number-of-paraphrases)
+  - [Caching](#caching)
   - [Using the class directly](#using-the-class-directly)
   - [List of texts](#list-of-texts)
   - [Customising the TinyStyler style bank](#customising-the-tinystyler-style-bank)
@@ -51,24 +52,38 @@ results = diversify("The experiment was conducted in a controlled lab setting.")
 ### Control number of paraphrases
 ```python
-results = diversify("Some text.", n_styles=3)
+results = diversify("Some text.", n=3)
 ```
 ```
 [{"original": "Some text.", "paraphrases": ["...", "...", "..."]}]
 ```
+### Caching
+The `diversify()` function automatically caches loaded models between calls.
+The generation model and the semantic filter are cached independently, so
+toggling `semantic_filter` does not reload the generation model and vice
+versa. Call `clear_cache()` to drop cached models and allow memory to be reclaimed when possible:
+```python
+from diversify_text import clear_cache
+clear_cache()
+```
 ### Using the class directly
-Recommended when processing texts across several calls — the model is loaded once and reused across calls.
+You can also instantiate a `Diversifier` yourself for full control over the
+model lifecycle:
 ```python
 from diversify_text import Diversifier
 div = Diversifier(device="cuda", methods=["tinystyler"])
-batch_1 = div.diversify(texts_1, n_styles=5)
-batch_2 = div.diversify(texts_2, n_styles=5)
+batch_1 = div.diversify(texts_1, n=5)
+batch_2 = div.diversify(texts_2, n=5)
 ```
 ### List of texts
@@ -97,7 +112,7 @@ A style bank can be a `dict[str, list[str]]` or a `list[list[str]]`:
 ```python
 from diversify_text import diversify
-from diversify_text.method.tinystyler import DEFAULT_STYLE_BANK
+from diversify_text.styles import DEFAULT_STYLE_BANK
 custom_bank = {
     "academic": ["The results demonstrate a statistically significant effect."],
@@ -111,10 +126,10 @@ results = diversify(
 )
 ```
-`DEFAULT_STYLE_BANK` is exported from `diversify_text.method.tinystyler` so you can build on it:
+`DEFAULT_STYLE_BANK` is exported from `diversify_text.styles` so you can build on it:
 ```python
-from diversify_text.method.tinystyler import DEFAULT_STYLE_BANK
+from diversify_text.styles import DEFAULT_STYLE_BANK
 extended_bank = {
     **DEFAULT_STYLE_BANK,
@@ -142,11 +157,11 @@ from diversify_text.method import DiversificationMethod
 class MyMethod(DiversificationMethod):
     name = "my_method"
-    def generate(self, texts, *, n_styles, max_new_tokens, temperature, top_p, **kwargs):
-        return [[f"{text} :: variant {i}" for i in range(n_styles)] for text in texts]
+    def generate(self, texts, *, n, max_new_tokens, temperature, top_p, **kwargs):
+        return [[f"{text} :: variant {i}" for i in range(n)] for text in texts]
-results = Diversifier(methods=[MyMethod()]).diversify("Hello", n_styles=3)
+results = Diversifier(methods=[MyMethod()]).diversify("Hello", n=3)
 ```
 ```

{diversify_text-0.1.2 → diversify_text-0.2.0}/diversify_text/__init__.py RENAMED Viewed

@@ -2,6 +2,7 @@
 import logging
+from diversify_text._cache import clear_cache
 from diversify_text.core import (
     Diversifier,
     diversify,
@@ -9,6 +10,7 @@ from diversify_text.core import (
 __all__ = [
     "Diversifier",
+    "clear_cache",
     "diversify",
 ]

diversify_text-0.2.0/diversify_text/_cache.py ADDED Viewed

@@ -0,0 +1,281 @@
+"""Per-model caching for the :func:`~diversify_text.core.diversify` convenience function.
+Keeps the generation method(s) and the MIS filter in independent
+module-level caches so that toggling ``semantic_filter`` does not
+reload the generation model, and switching methods does not reload the
+MIS model.
+Each generation method is cached individually so that adding, removing,
+or reordering methods only (re)loads the ones whose configuration
+actually changed.
+Not thread-safe.  Intended for single-threaded use in scripts and
+notebooks.  For multi-threaded applications, use :class:`Diversifier`
+directly with your own instance management.
+"""
+from __future__ import annotations
+import inspect
+from collections.abc import Mapping, Sequence
+from functools import lru_cache
+from typing import Any
+from diversify_text._utils import default_device
+from diversify_text.filter.mis import MISFilter, _DEFAULT_MIN_SCORE, _DEFAULT_N_CANDIDATES
+from diversify_text.method import DEFAULT_METHOD_REGISTRY, DiversificationMethod
+# Constructor-level kwargs that determine which model gets loaded.  A change
+# in any of them produces a different cache key, i.e. a new cache entry
+# (existing entries are kept, not invalidated).
+# Per-call kwargs (styles, prompts, n_style_examples, etc.) are excluded.
+_CACHE_KWARGS = {"model", "device", "precision"}
+# ------------------------------------------------------------------
+# Generation method cache (dict-based, one entry per method)
+# ------------------------------------------------------------------
+# Keys are built by _single_METHOD_CACHE_key():
+# (method_name, sorted tuple of (kwarg_name, value) pairs).
+# Entries are only removed by clear_cache(); nothing is evicted automatically.
+_METHOD_CACHE: dict[tuple, DiversificationMethod] = {}
+def _resolve_cache_kwargs(
+    method_name: str,
+    device: str,
+    method_kwargs: Mapping[str, dict[str, Any]] | None = None,
+) -> dict[str, Any]:
+    """Resolve the full set of cache-relevant kwargs for a method.
+    Merges caller-provided kwargs with the constructor's own defaults
+    (discovered via ``inspect.signature``) so that the cache key is
+    the same whether the caller explicitly passes a default value or
+    omits it.  Only kwargs in :data:`_CACHE_KWARGS` are included.
+    For example, ``PromptingMethod.__init__`` has
+    ``model="HuggingFaceTB/SmolLM3-3B"`` as a default.
+    These two calls should hit the same cache entry::
+        # Omit model — default is filled in from the signature.
+        get_methods(device=None, methods=["prompting"])
+        # Explicitly pass the same default value.
+        get_methods(device=None, methods=["prompting"],
+            method_kwargs={"prompting": {"model": "HuggingFaceTB/SmolLM3-3B"}})
+    Without this function the first call would produce the key
+    ``("prompting", (("device", "cpu"),))`` (no model) and the second
+    ``("prompting", (("device", "cpu"), ("model", "HuggingFaceTB/..."),))``
+    — different keys, two copies of the same model loaded.
+    Parameters
+    ----------
+    method_name : str
+        Registry name of the method (e.g. ``"tinystyler"``).
+    device : str
+        Torch device string (already resolved, never ``None``).
+    method_kwargs : mapping, optional
+        Per-method keyword arguments keyed by method name.  Only the
+        entry for *method_name* is inspected.
+    Returns
+    -------
+    dict[str, Any]
+        The full set of cache-relevant kwargs, e.g.
+        ``{"device": "cpu", "model": "HuggingFaceTB/SmolLM3-3B", "precision": "auto"}``.
+    """
+    # Start with device (always present).
+    resolved: dict[str, Any] = {"device": device}
+    # Fill in constructor defaults from the method class signature.
+    method_class = DEFAULT_METHOD_REGISTRY.get(method_name)
+    signature = inspect.signature(method_class)
+    for param_name, param in signature.parameters.items():
+        # inspect.Parameter.empty is a sentinel meaning "no default value."
+        # We skip those — only fill in defaults that actually exist.
+        if (
+            param_name in _CACHE_KWARGS
+            and param_name not in resolved
+            and param.default is not inspect.Parameter.empty
+        ):
+            resolved[param_name] = param.default
+    # Override defaults with caller-provided kwargs.
+    if method_kwargs and (method_name in method_kwargs):
+        for k, v in method_kwargs[method_name].items():
+            if k in _CACHE_KWARGS:
+                resolved[k] = v
+    return resolved
+def _single_METHOD_CACHE_key(
+    method_name: str,
+    device: str,
+    method_kwargs: Mapping[str, dict[str, Any]] | None = None,
+) -> tuple:
+    """Build a hashable key for a single generation method.
+    The key uniquely identifies a loaded model instance.  It includes
+    only constructor-level kwargs (``model``, ``device``, ``precision``)
+    that determine *which* model gets loaded.  Per-call kwargs like
+    ``styles`` or ``prompts`` are excluded — changing those should reuse
+    the same model, not trigger an expensive reload.
+    Constructor defaults are resolved via ``inspect.signature`` so that
+    explicitly passing a default value produces the same cache key as
+    omitting it entirely.
+    Parameters
+    ----------
+    method_name : str
+        Registry name of the method (e.g. ``"tinystyler"``).
+    device : str
+        Torch device string (already resolved, never ``None``).
+    method_kwargs : mapping, optional
+        Per-method keyword arguments keyed by method name.  Only the
+        entry for *method_name* is inspected, and only cache
+        kwargs within that entry are included in the key.
+    Returns
+    -------
+    tuple
+        A hashable key, e.g.
+        ``("prompting", "cpu", (("model", "default-model"), ("precision", "auto")))``.
+    """
+    resolved = _resolve_cache_kwargs(method_name, device, method_kwargs)
+    constructor_kwargs = tuple(sorted(resolved.items()))
+    return method_name, constructor_kwargs
+def get_methods(
+    device: str | None,
+    methods: Sequence[str | DiversificationMethod] | None,
+    method_kwargs: Mapping[str, dict[str, Any]] | None = None,
+) -> list[DiversificationMethod]:
+    """Return cached generation methods, resolving only on config change.
+    Iterates the requested *methods* list and resolves each one
+    individually against a module-level dict cache.  On a cache miss
+    the method is instantiated via the registry (expensive — may load
+    a model); on a hit the existing instance is reused.
+    Methods can be specified as strings (looked up in the registry) or
+    as pre-built :class:`DiversificationMethod` instances (passed
+    through as-is without caching, since they're already instantiated).
+    You can mix both in one call, e.g.
+    ``methods=["tinystyler", my_custom_method]``.
+    Because each method is cached independently, adding or removing a
+    method from the list only loads the new ones — already-cached
+    methods are not affected.
+    Parameters
+    ----------
+    device : str or None
+        Torch device.  ``None`` resolves to :func:`default_device`.
+    methods : sequence of str or DiversificationMethod, optional
+        Method names and/or pre-built instances.  Defaults to
+        ``["tinystyler"]``.
+    method_kwargs : mapping, optional
+        Per-method keyword arguments keyed by method name, e.g.
+        ``{"prompting": {"model": "gpt2"}}``.  Constructor kwargs
+        (``model``, ``device``, ``precision``) affect the cache key;
+        per-call kwargs (``styles``, ``prompts``) do not.
+    Returns
+    -------
+    list[DiversificationMethod]
+        Resolved method instances in the same order as *methods*.
+    """
+    device = device or default_device()
+    if methods is None:
+        methods = ["tinystyler"]
+    result: list[DiversificationMethod] = []
+    for method in methods:
+        if isinstance(method, DiversificationMethod):
+            result.append(method)
+        elif isinstance(method, str):
+            key = _single_METHOD_CACHE_key(method, device, method_kwargs)
+            if key not in _METHOD_CACHE:  # cache miss → resolve and store
+                resolve_kwargs: dict[str, Any] = {"device": device}
+                if method_kwargs and (method in method_kwargs):
+                    resolve_kwargs.update(method_kwargs[method])
+                _METHOD_CACHE[key] = DEFAULT_METHOD_REGISTRY.resolve(
+                    [method], **resolve_kwargs
+                )[0]
+            result.append(_METHOD_CACHE[key])
+        else:
+            raise TypeError(
+                "method must be str or DiversificationMethod instance."
+            )
+    if not result:
+        raise ValueError("At least one method is required.")
+    return result
+# ------------------------------------------------------------------
+# MIS filter cache (lru_cache for expensive model load, thin wrapper
+# for cheap per-call settings like min_score and n_candidates)
+# ------------------------------------------------------------------
+@lru_cache(maxsize=1)
+def _load_mis_filter(device: str) -> MISFilter:
+    """Load the MIS filter model (expensive).
+    This is the expensive part — loading the model weights.  The
+    ``lru_cache`` decorator ensures this only runs once per last used device
+    string.  Cheap per-call settings (``min_score``, ``n_candidates``)
+    are applied separately in :func:`get_cached_mis_filter`.
+    """
+    return MISFilter(device=device)
+def get_cached_mis_filter(
+    device: str | None,
+    **filter_kwargs: Any,
+) -> MISFilter:
+    """Return cached MIS filter, reloading only when *device* changes.
+    Thin wrapper around :func:`_load_mis_filter`.  The model load is
+    cached (expensive); this function just applies cheap per-call
+    threshold settings on the existing instance.  Changing
+    ``min_score`` or ``n_candidates`` between calls does not trigger a
+    model reload — only a device change does.
+    Parameters
+    ----------
+    device : str or None
+        Torch device.  ``None`` resolves to :func:`default_device`.
+    **filter_kwargs
+        ``min_score`` and ``n_candidates``.  Missing keys reset to
+        their defaults so that omitting a kwarg doesn't leave a stale
+        value from a previous call.
+    """
+    device = device or default_device()
+    mis_filter = _load_mis_filter(device)
+    mis_filter.min_score = filter_kwargs.get("min_score", _DEFAULT_MIN_SCORE)
+    mis_filter.n_candidates = filter_kwargs.get("n_candidates", _DEFAULT_N_CANDIDATES)
+    return mis_filter
+# ------------------------------------------------------------------
+# Cache management
+# ------------------------------------------------------------------
+def clear_cache() -> None:
+    """Drop references to all cached models so their memory can be reclaimed when possible.
+    Clears both the generation method dict cache and the ``lru_cache``
+    backing the MIS filter.  After calling this, the next
+    :func:`get_methods` or :func:`get_cached_mis_filter` call will
+    load models from scratch.
+    This clears Python-level references but does not guarantee immediate
+    GPU/CPU memory release (e.g., allocator pools may retain reserved
+    memory).
+    """
+    global _METHOD_CACHE
+    _METHOD_CACHE = {}
+    _load_mis_filter.cache_clear()

{diversify_text-0.1.2 → diversify_text-0.2.0}/diversify_text/_output.py RENAMED Viewed

@@ -128,7 +128,7 @@ class OutputWriter:
     def __init__(
         self,
         input_context: InputContext,
-        n_styles: int,
+        n: int,
         output_path: Path | None,
     ) -> None:
         """Initialize the writer.
@@ -137,14 +137,14 @@ class OutputWriter:
         ----------
         input_context : InputContext
             Metadata about the input source (kind, path, etc.).
-        n_styles : int
+        n : int
             Number of paraphrase styles requested per text.
         output_path : Path or None
             Where to write results on disk.  ``None`` means results
             are kept in memory and returned as ``list[dict]``.
         """
         self._input_context = input_context
-        self._n_styles = n_styles
+        self._n = n
         self._output_path = output_path
         # Open file handle — set by open() when writing to disk.
         self._handle: IO[str] | None = None
@@ -181,7 +181,7 @@ class OutputWriter:
         originals : list[str]
             The original texts in this batch.
         paraphrases_by_text : list[list[str]]
-            One inner list per original text, each containing *n_styles*
+            One inner list per original text, each containing *n*
             paraphrased variants.  For example, with 2 styles and 2
             texts: ``[["a_style1", "a_style2"], ["b_style1", "b_style2"]]``.
         Raises
@@ -197,10 +197,10 @@ class OutputWriter:
             )
         for i, (orig, paras) in enumerate(zip(originals, paraphrases_by_text)):
-            if len(paras) != self._n_styles:
-                _log.warning(
+            if len(paras) != self._n:
+                _log.debug(
                     "Expected %d paraphrases for text %d, got %d.",
-                    self._n_styles, i, len(paras),
+                    self._n, i, len(paras),
                 )
             record = {"original": orig, "paraphrases": paras}
             if self._output_path is None:

{diversify_text-0.1.2 → diversify_text-0.2.0}/diversify_text/_postprocess.py RENAMED Viewed

@@ -18,19 +18,19 @@ def reassemble_segments(
         :func:`~diversify_text._preprocess.split_sentences`).
     paraphrases_by_segment : list[list[str]]
         Flat list of paraphrases for every segment, shape
-        ``[total_segments][n_styles]``.
+        ``[total_segments][n]``.
     Returns
     -------
     list[list[str]]
-        Shape ``[n_texts][n_styles]`` — reassembled paraphrases.
+        Shape ``[n_texts][n]`` — reassembled paraphrases.
     """
     result = []
     seg_idx = 0
     for segs in segments_per_text:
         seg_paras = paraphrases_by_segment[seg_idx : seg_idx + len(segs)]
-        n_styles = len(seg_paras[0])
-        result.append([" ".join(sp[i] for sp in seg_paras) for i in range(n_styles)])
+        n = len(seg_paras[0])
+        result.append([" ".join(sp[i] for sp in seg_paras) for i in range(n)])
         seg_idx += len(segs)
     return result
@@ -48,14 +48,14 @@ def postprocess(
     Parameters
     ----------
     candidate : list[list[str]]
-        Raw generation output, shape ``[n_generation_texts][n_styles]``.
+        Raw generation output, shape ``[n_generation_texts][n]``.
     context : PreprocessContext
         Context returned by :func:`~diversify_text._preprocess.preprocess`.
     Returns
     -------
     list[list[str]]
-        Shape ``[n_texts][n_styles]`` — one paraphrase per original text
+        Shape ``[n_texts][n]`` — one paraphrase per original text
         per style.
     """
     if context.segments_per_text is not None:

diversify_text-0.2.0/diversify_text/_utils.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""Shared internal utilities for diversify."""
+from __future__ import annotations
+import contextlib
+import itertools
+import logging
+import sys
+import threading
+import warnings
+def default_device() -> str:
+    """Return the best available torch device (``"cuda"``, ``"mps"``, or ``"cpu"``)."""
+    import torch
+    if torch.cuda.is_available():
+        return "cuda"
+    if torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+@contextlib.contextmanager
+def spinner(message: str = "Loading"):
+    """Display a CLI spinner while a blocking operation runs."""
+    frames = itertools.cycle(["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"])
+    stop = threading.Event()
+    def _spin() -> None:
+        while not stop.is_set():
+            frame = next(frames)
+            sys.stderr.write(f"\r{frame} {message}")
+            sys.stderr.flush()
+            stop.wait(0.08)
+        sys.stderr.write(f"\r✓ {message}\n")
+        sys.stderr.flush()
+    thread = threading.Thread(target=_spin, daemon=True)
+    thread.start()
+    try:
+        yield
+    finally:
+        stop.set()
+        thread.join()
+@contextlib.contextmanager
+def suppress_hf_load_noise():
+    """Silence harmless noise emitted when loading HuggingFace models.
+    Covers two sources that Python's warnings module alone cannot reach:
+    - Tied-weights notices from the ``transformers`` logging system.
+    - Unexpected-key load reports from the style-embedding model.
+    """
+    transformers_logger = logging.getLogger("transformers")
+    prev_level = transformers_logger.level
+    transformers_logger.setLevel(logging.ERROR)
+    try:
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message=".*tie.*weight.*")
+            yield
+    finally:
+        transformers_logger.setLevel(prev_level)

diversify-text 0.1.2__tar.gz → 0.2.0__tar.gz

diversify-text 0.1.2tar.gz → 0.2.0tar.gz