diversify-text 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diversify_text-0.2.0 → diversify_text-0.2.1}/PKG-INFO +26 -1
- {diversify_text-0.2.0 → diversify_text-0.2.1}/README.md +25 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/core.py +30 -6
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/prompting/method.py +51 -13
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/prompting/prompts.py +3 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/tinystyler/method.py +1 -1
- {diversify_text-0.2.0 → diversify_text-0.2.1}/pyproject.toml +1 -1
- {diversify_text-0.2.0 → diversify_text-0.2.1}/.gitignore +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/LICENSE +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/__init__.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/_cache.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/_input.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/_output.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/_postprocess.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/_preprocess.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/_utils.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/filter/__init__.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/filter/mis.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/__init__.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/base.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/echo.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/prompting/__init__.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/prompting/model.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/registry.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/tinystyler/__init__.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/method/tinystyler/model.py +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/py.typed +0 -0
- {diversify_text-0.2.0 → diversify_text-0.2.1}/diversify_text/styles.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diversify-text
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Generate stylistic paraphrases of texts using local transformer models.
|
|
5
5
|
Project-URL: Homepage, https://github.com/AnnaWegmann/diversify_text
|
|
6
6
|
Project-URL: Documentation, https://annawegmann.github.io/diversify_text/
|
|
@@ -46,6 +46,7 @@ pip install diversify-text
|
|
|
46
46
|
- [Usage](#usage)
|
|
47
47
|
- [Single text](#single-text)
|
|
48
48
|
- [Control number of paraphrases](#control-number-of-paraphrases)
|
|
49
|
+
- [Prompting method](#prompting-method)
|
|
49
50
|
- [Caching](#caching)
|
|
50
51
|
- [Using the class directly](#using-the-class-directly)
|
|
51
52
|
- [List of texts](#list-of-texts)
|
|
@@ -92,6 +93,30 @@ results = diversify("Some text.", n=3)
|
|
|
92
93
|
[{"original": "Some text.", "paraphrases": ["...", "...", "..."]}]
|
|
93
94
|
```
|
|
94
95
|
|
|
96
|
+
### Prompting method
|
|
97
|
+
|
|
98
|
+
Use the prompting method to generate paraphrases via a causal language model (default: [SmolLM3-3B](https://huggingface.co/HuggingFaceTB/SmolLM3-3B)):
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
results = diversify("The experiment was conducted in a controlled lab setting.", methods=["prompting"])
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Select specific prompt styles:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
results = diversify(
|
|
108
|
+
"The experiment was conducted in a controlled lab setting.",
|
|
109
|
+
methods=["prompting"],
|
|
110
|
+
method_kwargs={
|
|
111
|
+
"prompting": {
|
|
112
|
+
"prompt_keys": ["simple_kew", "complex_kew", "caps_reif"]
|
|
113
|
+
}
|
|
114
|
+
},
|
|
115
|
+
)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Available prompt keys: `wikipedia_paraphrase`, `simple_kew`, `complex_kew`, `formal_reif`, `simple_reif`, `passive_reif`, `caps_reif`, `lowcaps_reif`, `text_emojis_reif`, `less_common_verbs_reif`, `humanize_llm-as-coauthor_original`, and all `finephrase_*` templates. See the [full prompt reference](https://annawegmann.github.io/diversify_text/prompts.html) for details.
|
|
119
|
+
|
|
95
120
|
### Caching
|
|
96
121
|
|
|
97
122
|
The `diversify()` function automatically caches loaded models between calls.
|
|
@@ -13,6 +13,7 @@ pip install diversify-text
|
|
|
13
13
|
- [Usage](#usage)
|
|
14
14
|
- [Single text](#single-text)
|
|
15
15
|
- [Control number of paraphrases](#control-number-of-paraphrases)
|
|
16
|
+
- [Prompting method](#prompting-method)
|
|
16
17
|
- [Caching](#caching)
|
|
17
18
|
- [Using the class directly](#using-the-class-directly)
|
|
18
19
|
- [List of texts](#list-of-texts)
|
|
@@ -59,6 +60,30 @@ results = diversify("Some text.", n=3)
|
|
|
59
60
|
[{"original": "Some text.", "paraphrases": ["...", "...", "..."]}]
|
|
60
61
|
```
|
|
61
62
|
|
|
63
|
+
### Prompting method
|
|
64
|
+
|
|
65
|
+
Use the prompting method to generate paraphrases via a causal language model (default: [SmolLM3-3B](https://huggingface.co/HuggingFaceTB/SmolLM3-3B)):
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
results = diversify("The experiment was conducted in a controlled lab setting.", methods=["prompting"])
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Select specific prompt styles:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
results = diversify(
|
|
75
|
+
"The experiment was conducted in a controlled lab setting.",
|
|
76
|
+
methods=["prompting"],
|
|
77
|
+
method_kwargs={
|
|
78
|
+
"prompting": {
|
|
79
|
+
"prompt_keys": ["simple_kew", "complex_kew", "caps_reif"]
|
|
80
|
+
}
|
|
81
|
+
},
|
|
82
|
+
)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Available prompt keys: `wikipedia_paraphrase`, `simple_kew`, `complex_kew`, `formal_reif`, `simple_reif`, `passive_reif`, `caps_reif`, `lowcaps_reif`, `text_emojis_reif`, `less_common_verbs_reif`, `humanize_llm-as-coauthor_original`, and all `finephrase_*` templates. See the [full prompt reference](https://annawegmann.github.io/diversify_text/prompts.html) for details.
|
|
86
|
+
|
|
62
87
|
### Caching
|
|
63
88
|
|
|
64
89
|
The `diversify()` function automatically caches loaded models between calls.
|
|
@@ -248,16 +248,40 @@ class Diversifier:
|
|
|
248
248
|
) -> int:
|
|
249
249
|
"""Infer *n* from per-method kwargs when only one method is used.
|
|
250
250
|
|
|
251
|
-
When a single method is active
|
|
252
|
-
specific keys
|
|
253
|
-
|
|
254
|
-
|
|
251
|
+
When a single method is active (currently out of tinystyler and prompting)
|
|
252
|
+
and the caller provided method-specific keys, infers the number of
|
|
253
|
+
paraphrases from the number of keys. Otherwise returns :attr:`_DEFAULT_N`.
|
|
254
|
+
|
|
255
|
+
For the prompting method, the inference depends on what is provided:
|
|
256
|
+
* ``prompt_keys`` only → ``len(prompt_keys)`` (one per template).
|
|
257
|
+
* ``styles`` only → ``len(styles)`` (style transfer, one per style).
|
|
258
|
+
* Both → each style-dependent template (in
|
|
259
|
+
:data:`EXAMPLE_BASED_PROMPT_BANK` or :data:`NAME_BASED_PROMPT_BANK`)
|
|
260
|
+
contributes ``len(styles)``, each zero-shot template contributes 1.
|
|
255
261
|
"""
|
|
256
262
|
if len(self._methods) == 1 and method_kwargs:
|
|
257
263
|
method = self._methods[0]
|
|
258
264
|
kw = method_kwargs.get(method.name, {})
|
|
259
|
-
if method.name == "prompting"
|
|
260
|
-
|
|
265
|
+
if method.name == "prompting":
|
|
266
|
+
from diversify_text.method.prompting.prompts import STYLE_DEP_PROMPTS
|
|
267
|
+
from diversify_text.styles import DEFAULT_STYLES
|
|
268
|
+
|
|
269
|
+
prompt_keys = kw.get("prompt_keys")
|
|
270
|
+
styles = kw.get("styles")
|
|
271
|
+
# When styles are not provided but style-dependent prompts
|
|
272
|
+
# are selected, default to DEFAULT_STYLES.
|
|
273
|
+
if not styles and prompt_keys and any(k in STYLE_DEP_PROMPTS for k in prompt_keys):
|
|
274
|
+
styles = DEFAULT_STYLES
|
|
275
|
+
if prompt_keys:
|
|
276
|
+
n = 0
|
|
277
|
+
for key in prompt_keys:
|
|
278
|
+
if key in STYLE_DEP_PROMPTS and styles:
|
|
279
|
+
n += len(styles)
|
|
280
|
+
else:
|
|
281
|
+
n += 1
|
|
282
|
+
return n
|
|
283
|
+
if styles:
|
|
284
|
+
return len(styles)
|
|
261
285
|
if method.name == "tinystyler" and "styles" in kw:
|
|
262
286
|
return len(kw["styles"])
|
|
263
287
|
return self._DEFAULT_N
|
|
@@ -14,6 +14,7 @@ from diversify_text.method.prompting.prompts import (
|
|
|
14
14
|
PLACEHOLDER_STYLE_NAME,
|
|
15
15
|
PLACEHOLDER_TEXT,
|
|
16
16
|
PROMPT_BANK,
|
|
17
|
+
STYLE_DEP_PROMPTS,
|
|
17
18
|
)
|
|
18
19
|
from diversify_text.styles import resolve_style_sets
|
|
19
20
|
|
|
@@ -75,7 +76,7 @@ class PromptingMethod(DiversificationMethod):
|
|
|
75
76
|
def _resolve_prompts(
|
|
76
77
|
prompt_bank: dict[str, str] | None = None,
|
|
77
78
|
prompt_keys: list[str] | None = None,
|
|
78
|
-
|
|
79
|
+
styles: list[str] | None = None,
|
|
79
80
|
custom_style_bank: dict[str, list[str]] | None = None,
|
|
80
81
|
) -> list[tuple[str, str]]:
|
|
81
82
|
"""Resolve prompt configuration into an ordered list of (key, template) pairs.
|
|
@@ -93,7 +94,7 @@ class PromptingMethod(DiversificationMethod):
|
|
|
93
94
|
zero-shot + few-shot bank).
|
|
94
95
|
prompt_keys : list[str] or None
|
|
95
96
|
Select only these keys from the bank. Order is preserved.
|
|
96
|
-
|
|
97
|
+
styles : list[str] or None
|
|
97
98
|
Names of style sets for few-shot examples. If provided
|
|
98
99
|
without *prompt_keys*, the method automatically selects
|
|
99
100
|
the ``"style_transfer"`` prompt template. When combined
|
|
@@ -101,7 +102,7 @@ class PromptingMethod(DiversificationMethod):
|
|
|
101
102
|
the ``[STYLE EXAMPLES]`` placeholder.
|
|
102
103
|
custom_style_bank : dict or None
|
|
103
104
|
Custom style bank — same trigger behavior as
|
|
104
|
-
*
|
|
105
|
+
*styles*.
|
|
105
106
|
|
|
106
107
|
Returns
|
|
107
108
|
-------
|
|
@@ -113,13 +114,13 @@ class PromptingMethod(DiversificationMethod):
|
|
|
113
114
|
Raises
|
|
114
115
|
------
|
|
115
116
|
ValueError
|
|
116
|
-
If *prompt_keys* contains unknown keys, or if *
|
|
117
|
+
If *prompt_keys* contains unknown keys, or if *styles*
|
|
117
118
|
/ *custom_style_bank* are provided but the selected
|
|
118
119
|
templates do not contain a ``[STYLE EXAMPLES]``
|
|
119
120
|
placeholder.
|
|
120
121
|
"""
|
|
121
122
|
bank = prompt_bank if prompt_bank is not None else PROMPT_BANK
|
|
122
|
-
has_styles =
|
|
123
|
+
has_styles = styles is not None or custom_style_bank is not None
|
|
123
124
|
|
|
124
125
|
# --- Select templates (four mutually exclusive cases) ---
|
|
125
126
|
|
|
@@ -153,12 +154,12 @@ class PromptingMethod(DiversificationMethod):
|
|
|
153
154
|
for _k, t in templates
|
|
154
155
|
):
|
|
155
156
|
raise ValueError(
|
|
156
|
-
"
|
|
157
|
+
"styles or custom_style_bank were provided, but the "
|
|
157
158
|
"selected prompt template(s) do not contain the "
|
|
158
159
|
f"{PLACEHOLDER_STYLE_EXAMPLES} or {PLACEHOLDER_STYLE_NAME} "
|
|
159
160
|
f"placeholder. Use a style-aware template "
|
|
160
161
|
f"(e.g. prompt_keys=['style_transfer'] or prompt_keys=['reif']) "
|
|
161
|
-
f"or remove
|
|
162
|
+
f"or remove styles. See "
|
|
162
163
|
f"https://annawegmann.github.io/diversify_text/prompts.html"
|
|
163
164
|
)
|
|
164
165
|
|
|
@@ -244,12 +245,41 @@ class PromptingMethod(DiversificationMethod):
|
|
|
244
245
|
Returns a dict mapping style names to example sentences, or
|
|
245
246
|
an empty dict when no style kwargs are provided.
|
|
246
247
|
"""
|
|
247
|
-
style_keys = kwargs.get("
|
|
248
|
+
style_keys = kwargs.get("styles")
|
|
248
249
|
custom_bank = kwargs.get("custom_style_bank")
|
|
249
250
|
if style_keys is not None or custom_bank is not None:
|
|
250
251
|
return resolve_style_sets(custom_bank, style_keys)
|
|
251
252
|
return {}
|
|
252
253
|
|
|
254
|
+
@staticmethod
|
|
255
|
+
def _build_schedule(
|
|
256
|
+
prompt_templates: list[tuple[str, str]],
|
|
257
|
+
fs_style_examples: dict[str, list[str]],
|
|
258
|
+
) -> list[tuple[str, str, int | None]]:
|
|
259
|
+
"""Build a generation schedule from templates and style examples.
|
|
260
|
+
|
|
261
|
+
Style-dependent templates (in :data:`EXAMPLE_BASED_PROMPT_BANK` or
|
|
262
|
+
:data:`NAME_BASED_PROMPT_BANK`) expand to one entry per style;
|
|
263
|
+
zero-shot templates get a single entry.
|
|
264
|
+
|
|
265
|
+
The caller iterates the schedule with modulo to fill ``n`` slots,
|
|
266
|
+
so the schedule represents one full "natural" cycle.
|
|
267
|
+
|
|
268
|
+
Each entry is ``(key, template, style_idx)`` where *style_idx* is
|
|
269
|
+
``None`` for zero-shot templates.
|
|
270
|
+
"""
|
|
271
|
+
n_styles = len(fs_style_examples) if fs_style_examples else 0
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
schedule: list[tuple[str, str, int | None]] = []
|
|
275
|
+
for key, tmpl in prompt_templates:
|
|
276
|
+
if n_styles and key in STYLE_DEP_PROMPTS:
|
|
277
|
+
for style_idx in range(n_styles):
|
|
278
|
+
schedule.append((key, tmpl, style_idx))
|
|
279
|
+
else:
|
|
280
|
+
schedule.append((key, tmpl, None))
|
|
281
|
+
return schedule
|
|
282
|
+
|
|
253
283
|
def _fill_template(
|
|
254
284
|
self,
|
|
255
285
|
template: str,
|
|
@@ -346,7 +376,7 @@ class PromptingMethod(DiversificationMethod):
|
|
|
346
376
|
Sampling parameters. ``None`` uses defaults.
|
|
347
377
|
**kwargs
|
|
348
378
|
Extra options forwarded from ``Diversifier``, including
|
|
349
|
-
``prompt_keys``, ``prompt_bank``, ``
|
|
379
|
+
``prompt_keys``, ``prompt_bank``, ``styles``,
|
|
350
380
|
``custom_style_bank``, and ``n_style_examples``.
|
|
351
381
|
"""
|
|
352
382
|
model = self._ensure_model()
|
|
@@ -356,7 +386,7 @@ class PromptingMethod(DiversificationMethod):
|
|
|
356
386
|
prompt_templates = self._resolve_prompts(
|
|
357
387
|
prompt_bank=kwargs.get("prompt_bank"),
|
|
358
388
|
prompt_keys=kwargs.get("prompt_keys"),
|
|
359
|
-
|
|
389
|
+
styles=kwargs.get("styles"),
|
|
360
390
|
custom_style_bank=kwargs.get("custom_style_bank"),
|
|
361
391
|
)
|
|
362
392
|
all_max_new_tokens = self._compute_max_new_tokens(
|
|
@@ -369,24 +399,32 @@ class PromptingMethod(DiversificationMethod):
|
|
|
369
399
|
)
|
|
370
400
|
|
|
371
401
|
fs_style_examples = self._resolve_few_shot_examples(**kwargs)
|
|
402
|
+
# Default to DEFAULT_STYLES when style-dependent prompts are
|
|
403
|
+
# selected but no explicit styles were provided.
|
|
404
|
+
if not fs_style_examples:
|
|
405
|
+
|
|
406
|
+
if any(k in STYLE_DEP_PROMPTS for k, _ in prompt_templates):
|
|
407
|
+
from diversify_text.styles import DEFAULT_STYLES
|
|
408
|
+
fs_style_examples = resolve_style_sets(None, DEFAULT_STYLES)
|
|
372
409
|
if fs_style_examples:
|
|
373
410
|
logger.info("Style sets: %s", ", ".join(fs_style_examples.keys()))
|
|
374
411
|
|
|
375
412
|
n_ex = kwargs.get("n_style_examples", _DEFAULT_N_STYLE_EXAMPLES)
|
|
376
413
|
|
|
377
|
-
|
|
414
|
+
schedule = self._build_schedule(prompt_templates, fs_style_examples)
|
|
415
|
+
|
|
378
416
|
# TODO: accept texts as an Iterable (not just list) to support
|
|
379
417
|
# streaming from large files without materialising everything
|
|
380
418
|
# in memory.
|
|
381
419
|
all_prompts: list[str] = []
|
|
382
420
|
for i in range(n):
|
|
383
|
-
_key, template =
|
|
421
|
+
_key, template, style_idx = schedule[i % len(schedule)]
|
|
384
422
|
for t in texts:
|
|
385
423
|
all_prompts.append(
|
|
386
424
|
self._fill_template(
|
|
387
425
|
template=template,
|
|
388
426
|
text=t,
|
|
389
|
-
style_idx=
|
|
427
|
+
style_idx=style_idx,
|
|
390
428
|
fs_style_examples=fs_style_examples,
|
|
391
429
|
n_style_examples=n_ex,
|
|
392
430
|
)
|
|
@@ -249,6 +249,9 @@ PLACEHOLDER_STYLE_NAME = "[STYLE NAME]"
|
|
|
249
249
|
|
|
250
250
|
PROMPT_BANK: dict[str, str] = {**ZS_PROMPT_BANK, **EXAMPLE_BASED_PROMPT_BANK, **NAME_BASED_PROMPT_BANK}
|
|
251
251
|
|
|
252
|
+
#: Prompt keys whose templates depend on style examples or style names.
|
|
253
|
+
STYLE_DEP_PROMPTS: set[str] = {*EXAMPLE_BASED_PROMPT_BANK, *NAME_BASED_PROMPT_BANK}
|
|
254
|
+
|
|
252
255
|
DEFAULT_PROMPTS: list[str] = [
|
|
253
256
|
HUMANIZE_LLM_AS_COAUTHOR_ORIGINAL,
|
|
254
257
|
COMPLEX_KEW,
|
|
@@ -118,7 +118,7 @@ class TinyStylerMethod(DiversificationMethod):
|
|
|
118
118
|
styles_arg,
|
|
119
119
|
)
|
|
120
120
|
# When explicit style keys are given, they determine the count.
|
|
121
|
-
effective_n =
|
|
121
|
+
effective_n = n
|
|
122
122
|
if effective_n > len(style_bank):
|
|
123
123
|
logger.warning(
|
|
124
124
|
"n=%d exceeds the number of style bank entries (%d). "
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|