waterfall 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waterfall/WatermarkerBase.py +1 -0
- waterfall/watermark.py +31 -6
- {waterfall-0.1.1.dist-info → waterfall-0.1.4.dist-info}/METADATA +20 -10
- waterfall-0.1.4.dist-info/RECORD +12 -0
- {waterfall-0.1.1.dist-info → waterfall-0.1.4.dist-info}/WHEEL +1 -2
- waterfall-0.1.1.dist-info/RECORD +0 -13
- waterfall-0.1.1.dist-info/top_level.txt +0 -1
- {waterfall-0.1.1.dist-info → waterfall-0.1.4.dist-info}/entry_points.txt +0 -0
- {waterfall-0.1.1.dist-info → waterfall-0.1.4.dist-info}/licenses/LICENSE +0 -0
waterfall/WatermarkerBase.py
CHANGED
waterfall/watermark.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
|
+
import gc
|
|
4
5
|
import torch
|
|
5
6
|
from typing import List, Literal, Optional, Tuple
|
|
6
7
|
|
|
7
8
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
9
|
+
from transformers.modeling_utils import PreTrainedModel
|
|
8
10
|
from sentence_transformers import SentenceTransformer
|
|
9
11
|
from tqdm.auto import tqdm
|
|
10
12
|
|
|
@@ -21,6 +23,8 @@ PROMPT = (
|
|
|
21
23
|
)
|
|
22
24
|
PRE_PARAPHRASED = "Here is a paraphrased version of the text while preserving the semantic similarity:\n\n"
|
|
23
25
|
|
|
26
|
+
waterfall_cached_watermarking_model = None # Global variable to cache the watermarking model
|
|
27
|
+
|
|
24
28
|
def detect_gpu() -> str:
|
|
25
29
|
"""
|
|
26
30
|
Use torch to detect if MPS, CUDA, or neither (default CPU)
|
|
@@ -42,9 +46,10 @@ def watermark(
|
|
|
42
46
|
sts_model: SentenceTransformer,
|
|
43
47
|
num_beam_groups: int = 4,
|
|
44
48
|
beams_per_group: int = 2,
|
|
45
|
-
STS_scale:float = 2.0,
|
|
49
|
+
STS_scale: float = 2.0,
|
|
46
50
|
diversity_penalty: float = 0.5,
|
|
47
51
|
max_new_tokens: Optional[int] = None,
|
|
52
|
+
**kwargs
|
|
48
53
|
) -> str:
|
|
49
54
|
paraphrasing_prompt = watermarker.tokenizer.apply_chat_template(
|
|
50
55
|
[
|
|
@@ -61,6 +66,7 @@ def watermark(
|
|
|
61
66
|
num_beam_groups = num_beam_groups,
|
|
62
67
|
num_return_sequences = num_beam_groups * beams_per_group,
|
|
63
68
|
diversity_penalty = diversity_penalty,
|
|
69
|
+
**kwargs,
|
|
64
70
|
)
|
|
65
71
|
|
|
66
72
|
# Select best paraphrasing based on q_score and semantic similarity
|
|
@@ -140,6 +146,7 @@ def watermark_texts(
|
|
|
140
146
|
diversity_penalty: float = 0.5,
|
|
141
147
|
STS_scale:float = 2.0,
|
|
142
148
|
use_tqdm: bool = False,
|
|
149
|
+
stop_at_double_newline: bool = True, # if True, will stop generation at the first double newline. Prevent repeated paraphrasing of the same text.
|
|
143
150
|
) -> List[str]:
|
|
144
151
|
if watermark_fn == 'fourier':
|
|
145
152
|
watermarkingFnClass = WatermarkingFnFourier
|
|
@@ -150,11 +157,25 @@ def watermark_texts(
|
|
|
150
157
|
|
|
151
158
|
if watermarker is None:
|
|
152
159
|
assert model_path is not None, "model_path must be provided if watermarker is not passed"
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
160
|
+
global waterfall_cached_watermarking_model
|
|
161
|
+
|
|
162
|
+
if isinstance(waterfall_cached_watermarking_model, PreTrainedModel) and waterfall_cached_watermarking_model.name_or_path != model_path:
|
|
163
|
+
device = waterfall_cached_watermarking_model.device.type
|
|
164
|
+
del waterfall_cached_watermarking_model
|
|
165
|
+
gc.collect()
|
|
166
|
+
if device == "cuda":
|
|
167
|
+
torch.cuda.empty_cache()
|
|
168
|
+
elif device == "mps":
|
|
169
|
+
torch.mps.empty_cache()
|
|
170
|
+
waterfall_cached_watermarking_model = None
|
|
171
|
+
|
|
172
|
+
if waterfall_cached_watermarking_model is None:
|
|
173
|
+
waterfall_cached_watermarking_model = AutoModelForCausalLM.from_pretrained(
|
|
174
|
+
model_path,
|
|
175
|
+
torch_dtype=torch_dtype,
|
|
176
|
+
device_map=device,
|
|
177
|
+
)
|
|
178
|
+
model = waterfall_cached_watermarking_model
|
|
158
179
|
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
159
180
|
|
|
160
181
|
watermarker = Watermarker(tokenizer=tokenizer, model=model, id=id, kappa=kappa, k_p=k_p, watermarkingFnClass=watermarkingFnClass)
|
|
@@ -173,6 +194,9 @@ def watermark_texts(
|
|
|
173
194
|
T_ws = []
|
|
174
195
|
|
|
175
196
|
for T_o in tqdm(T_os, desc="Watermarking texts", disable=not use_tqdm):
|
|
197
|
+
if stop_at_double_newline and "\n\n" in T_o:
|
|
198
|
+
logging.warning("Text contains \\n\\n and stop_at_double_newline is set to True, replacing all \\n\\n in text.")
|
|
199
|
+
T_o = T_o.replace("\n\n", " ") # replace double newlines with space
|
|
176
200
|
T_w = watermark(
|
|
177
201
|
T_o,
|
|
178
202
|
watermarker = watermarker,
|
|
@@ -181,6 +205,7 @@ def watermark_texts(
|
|
|
181
205
|
beams_per_group = beams_per_group,
|
|
182
206
|
diversity_penalty = diversity_penalty,
|
|
183
207
|
STS_scale = STS_scale,
|
|
208
|
+
stop_strings=["\n\n"] if stop_at_double_newline else None,
|
|
184
209
|
)
|
|
185
210
|
T_ws.append(T_w)
|
|
186
211
|
|
|
@@ -1,23 +1,22 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: waterfall
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Scalable Framework for Robust Text Watermarking and Provenance for LLMs
|
|
5
|
-
Author-email: Xinyuan Niu <aperture@outlook.sg>
|
|
6
|
-
License-Expression: Apache-2.0
|
|
7
5
|
Project-URL: Homepage, https://github.com/aoi3142/Waterfall
|
|
8
6
|
Project-URL: Issues, https://github.com/aoi3142/Waterfall/issues
|
|
9
|
-
|
|
7
|
+
Author-email: Xinyuan Niu <aperture@outlook.sg>
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
License-File: LICENSE
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
12
|
Requires-Python: >=3.10
|
|
12
|
-
Description-Content-Type: text/markdown
|
|
13
|
-
License-File: LICENSE
|
|
14
13
|
Requires-Dist: accelerate>=0.29.0
|
|
15
14
|
Requires-Dist: numpy>=2.0.0
|
|
16
15
|
Requires-Dist: scipy>=1.13.0
|
|
17
16
|
Requires-Dist: sentence-transformers>=3.0.0
|
|
18
17
|
Requires-Dist: torch>=2.3.0
|
|
19
18
|
Requires-Dist: transformers>=4.43.1
|
|
20
|
-
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
21
20
|
|
|
22
21
|
# Waterfall: Scalable Framework for Robust Text Watermarking and Provenance for LLMs [EMNLP 2024 Main Long]
|
|
23
22
|
Gregory Kang Ruey Lau*, Xinyuan Niu*, Hieu Dao, Jiangwei Chen, Chuan-Sheng Foo, Bryan Kian Hsiang Low
|
|
@@ -53,7 +52,7 @@ Protecting intellectual property (IP) of text such as articles and code is incre
|
|
|
53
52
|
|
|
54
53
|
5. A token is sampled from the perturbed logits $\check{L}$ and is appended to the watermarked text.
|
|
55
54
|
|
|
56
|
-
6. Append the generated token to the prompt and continue autoregressive generation (steps 1-5) until the eos token.
|
|
55
|
+
6. Append the generated token to the prompt and continue autoregressive generation (steps 1-5) until the `eos` token.
|
|
57
56
|
|
|
58
57
|
# Verification of un/watermarked text
|
|
59
58
|
|
|
@@ -77,6 +76,13 @@ Protecting intellectual property (IP) of text such as articles and code is incre
|
|
|
77
76
|
|
|
78
77
|
# Using our code
|
|
79
78
|
|
|
79
|
+
Install our package using `pip`
|
|
80
|
+
```sh
|
|
81
|
+
pip install waterfall
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Alternative installation from source
|
|
85
|
+
|
|
80
86
|
[Optional]
|
|
81
87
|
If using `conda` (or other pkg managers), it is highly advisable to create a new environment
|
|
82
88
|
|
|
@@ -98,6 +104,8 @@ Use the command `waterfall_demo` to watermark a piece of text, and then verify t
|
|
|
98
104
|
waterfall_demo
|
|
99
105
|
```
|
|
100
106
|
|
|
107
|
+
\* Ensure that your device (`cuda`/`cpu`/`mps`) has enough memory to load the model and perform inference (~18GB+ for default Llama 3.1 8B model)
|
|
108
|
+
|
|
101
109
|
Additional arguments
|
|
102
110
|
```sh
|
|
103
111
|
waterfall_demo \
|
|
@@ -107,9 +115,11 @@ waterfall_demo \
|
|
|
107
115
|
--kappa 2 `# Watermark strength` \
|
|
108
116
|
--model meta-llama/Llama-3.1-8B-Instruct `# Paraphrasing LLM` \
|
|
109
117
|
--watermark_fn fourier `# fourier/square watermark` \
|
|
110
|
-
--device cuda `# Use cuda/cpu`
|
|
118
|
+
--device cuda `# Use cuda/cpu/mps`
|
|
111
119
|
```
|
|
112
120
|
|
|
121
|
+
\* By default, `--device` automatically selects among `cuda`/`cpu`/`mps` if not set
|
|
122
|
+
|
|
113
123
|
## Using our code to watermark and verify
|
|
114
124
|
|
|
115
125
|
To watermark texts
|
|
@@ -134,7 +144,7 @@ test_texts = ["...", "..."] # Suspected texts to verify
|
|
|
134
144
|
watermark_strength = verify_texts(test_texts, id)[0] # np array of floats
|
|
135
145
|
```
|
|
136
146
|
|
|
137
|
-
|
|
147
|
+
## Code structure
|
|
138
148
|
|
|
139
149
|
- `watermark.py` : Sample watermarking script used by with `watermark_demo` command, includes beam search and other optimizations
|
|
140
150
|
- `WatermarkerBase.py` : Underlying generation and verification code provided by `Watermarker` class
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
waterfall/WatermarkerBase.py,sha256=AyScrZz3hdjikvz5Fm4-B4acDz46i5wDFwCBg6Fp-vY,12947
|
|
2
|
+
waterfall/WatermarkingFn.py,sha256=-b-kGRdL0a7eKRqJmcHPAR_rCjxQYnsg1Ne6bTwBc1I,1931
|
|
3
|
+
waterfall/WatermarkingFnFourier.py,sha256=QYayAQYwi1dQkDIyqmvhU568VhrVYTVy47HkI8F8SZs,1358
|
|
4
|
+
waterfall/WatermarkingFnSquare.py,sha256=2PAO05DdKT02npo7GDf_82D520nP7kGAWK6H4E4JMt4,1638
|
|
5
|
+
waterfall/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
waterfall/permute.py,sha256=RwxOHFhx_VSOhhFwy5s79YgwTUBkfW2-LCCXYR3VT2o,2582
|
|
7
|
+
waterfall/watermark.py,sha256=h7e1z8vWTUAKxCcQsJ2Jkx_1ZL-ug2dEDs5FzWcYfCs,13332
|
|
8
|
+
waterfall-0.1.4.dist-info/METADATA,sha256=3hBQwb1JyrTWrayLCPFxXVlTpPjuE-ukPstW5F9F9rg,8715
|
|
9
|
+
waterfall-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
10
|
+
waterfall-0.1.4.dist-info/entry_points.txt,sha256=XXnUzuWXu2nc9j4WAll9tq6HyodN_8WJLjeG0O4Y2Gw,60
|
|
11
|
+
waterfall-0.1.4.dist-info/licenses/LICENSE,sha256=zAtaO-k41Q-Q4Etl4bzuh7pgNJsPH-dYfzvznRa0OvM,11341
|
|
12
|
+
waterfall-0.1.4.dist-info/RECORD,,
|
waterfall-0.1.1.dist-info/RECORD
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
waterfall/WatermarkerBase.py,sha256=ou78I1XisalHbJLqyST6ryuLjtkFnY7Y60fUKdIwLy4,12905
|
|
2
|
-
waterfall/WatermarkingFn.py,sha256=-b-kGRdL0a7eKRqJmcHPAR_rCjxQYnsg1Ne6bTwBc1I,1931
|
|
3
|
-
waterfall/WatermarkingFnFourier.py,sha256=QYayAQYwi1dQkDIyqmvhU568VhrVYTVy47HkI8F8SZs,1358
|
|
4
|
-
waterfall/WatermarkingFnSquare.py,sha256=2PAO05DdKT02npo7GDf_82D520nP7kGAWK6H4E4JMt4,1638
|
|
5
|
-
waterfall/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
waterfall/permute.py,sha256=RwxOHFhx_VSOhhFwy5s79YgwTUBkfW2-LCCXYR3VT2o,2582
|
|
7
|
-
waterfall/watermark.py,sha256=whiNhPwWNNIZwXMH6r7QzEE3A7Niq2Ro9elA1iSRoxI,11952
|
|
8
|
-
waterfall-0.1.1.dist-info/licenses/LICENSE,sha256=zAtaO-k41Q-Q4Etl4bzuh7pgNJsPH-dYfzvznRa0OvM,11341
|
|
9
|
-
waterfall-0.1.1.dist-info/METADATA,sha256=Ik8I-yLPuHSWdGsrSj7YgTCDJ0uTbfbV8FDvKYlPQ6M,8392
|
|
10
|
-
waterfall-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
-
waterfall-0.1.1.dist-info/entry_points.txt,sha256=XXnUzuWXu2nc9j4WAll9tq6HyodN_8WJLjeG0O4Y2Gw,60
|
|
12
|
-
waterfall-0.1.1.dist-info/top_level.txt,sha256=5rTgijeT9V5GRCwIDZmhjeZ4khgH1lmfhS9ZmdUUCKQ,10
|
|
13
|
-
waterfall-0.1.1.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
waterfall
|
|
File without changes
|
|
File without changes
|