waterfall 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

waterfall/WatermarkerBase.py CHANGED
@@ -141,6 +141,7 @@ class Watermarker:
             do_sample=do_sample,
             logits_processor=logits_processor,
             pad_token_id=self.tokenizer.eos_token_id,
+            tokenizer=self.tokenizer,
             **kwargs
         )
         output = output[:,tokd_input["input_ids"].shape[-1]:].cpu()
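The `tokenizer=self.tokenizer` argument added above matters because Hugging Face `transformers` only honours string-based stopping criteria (`stop_strings`, forwarded from `watermark.py` further down) when the tokenizer is passed to `generate`. A minimal sketch of that interaction, assuming an arbitrary small causal LM (`gpt2` here is only a stand-in, not the model this package uses):

```python
# Minimal sketch: stop_strings requires the tokenizer to be passed to generate()
# Assumption: gpt2 is only an illustrative stand-in model.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Paraphrase this sentence:", return_tensors="pt")
output = model.generate(
    **inputs,
    max_new_tokens=40,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
    stop_strings=["\n\n"],   # stop at the first blank line
    tokenizer=tokenizer,     # transformers raises an error if stop_strings is set without a tokenizer
)
# Keep only the newly generated tokens, mirroring the slicing in the hunk above
new_tokens = output[:, inputs["input_ids"].shape[-1]:]
print(tokenizer.decode(new_tokens[0], skip_special_tokens=True))
```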
waterfall/watermark.py CHANGED
@@ -1,10 +1,12 @@
 import argparse
 import logging
 import os
+import gc
 import torch
 from typing import List, Literal, Optional, Tuple
 
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers.modeling_utils import PreTrainedModel
 from sentence_transformers import SentenceTransformer
 from tqdm.auto import tqdm
 
@@ -21,6 +23,8 @@ PROMPT = (
 )
 PRE_PARAPHRASED = "Here is a paraphrased version of the text while preserving the semantic similarity:\n\n"
 
+waterfall_cached_watermarking_model = None # Global variable to cache the watermarking model
+
 def detect_gpu() -> str:
     """
     Use torch to detect if MPS, CUDA, or neither (default CPU)
@@ -42,9 +46,10 @@ def watermark(
     sts_model: SentenceTransformer,
     num_beam_groups: int = 4,
     beams_per_group: int = 2,
-    STS_scale:float = 2.0,
+    STS_scale: float = 2.0,
     diversity_penalty: float = 0.5,
     max_new_tokens: Optional[int] = None,
+    **kwargs
 ) -> str:
     paraphrasing_prompt = watermarker.tokenizer.apply_chat_template(
         [
@@ -61,6 +66,7 @@
         num_beam_groups = num_beam_groups,
         num_return_sequences = num_beam_groups * beams_per_group,
         diversity_penalty = diversity_penalty,
+        **kwargs,
     )
 
     # Select best paraphrasing based on q_score and semantic similarity
@@ -140,6 +146,7 @@ def watermark_texts(
     diversity_penalty: float = 0.5,
     STS_scale:float = 2.0,
     use_tqdm: bool = False,
+    stop_at_double_newline: bool = True, # if True, will stop generation at the first double newline. Prevent repeated paraphrasing of the same text.
 ) -> List[str]:
     if watermark_fn == 'fourier':
         watermarkingFnClass = WatermarkingFnFourier
@@ -150,11 +157,25 @@
 
     if watermarker is None:
         assert model_path is not None, "model_path must be provided if watermarker is not passed"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            torch_dtype=torch_dtype,
-            device_map=device,
-        )
+        global waterfall_cached_watermarking_model
+
+        if isinstance(waterfall_cached_watermarking_model, PreTrainedModel) and waterfall_cached_watermarking_model.name_or_path != model_path:
+            device = waterfall_cached_watermarking_model.device.type
+            del waterfall_cached_watermarking_model
+            gc.collect()
+            if device == "cuda":
+                torch.cuda.empty_cache()
+            elif device == "mps":
+                torch.mps.empty_cache()
+            waterfall_cached_watermarking_model = None
+
+        if waterfall_cached_watermarking_model is None:
+            waterfall_cached_watermarking_model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                torch_dtype=torch_dtype,
+                device_map=device,
+            )
+        model = waterfall_cached_watermarking_model
         tokenizer = AutoTokenizer.from_pretrained(model_path)
 
         watermarker = Watermarker(tokenizer=tokenizer, model=model, id=id, kappa=kappa, k_p=k_p, watermarkingFnClass=watermarkingFnClass)
@@ -173,6 +194,9 @@
     T_ws = []
 
     for T_o in tqdm(T_os, desc="Watermarking texts", disable=not use_tqdm):
+        if stop_at_double_newline and "\n\n" in T_o:
+            logging.warning("Text contains \\n\\n and stop_at_double_newline is set to True, replacing all \\n\\n in text.")
+            T_o = T_o.replace("\n\n", " ") # replace double newlines with space
         T_w = watermark(
             T_o,
             watermarker = watermarker,
@@ -181,6 +205,7 @@
             beams_per_group = beams_per_group,
             diversity_penalty = diversity_penalty,
             STS_scale = STS_scale,
+            stop_strings=["\n\n"] if stop_at_double_newline else None,
         )
         T_ws.append(T_w)
 
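Taken together, the `watermark.py` changes above cache the paraphrasing LLM in a module-level global (`waterfall_cached_watermarking_model`) so repeated `watermark_texts` calls reuse the loaded weights, and free the old model before loading a new one when `model_path` changes. A standalone sketch of that cache-and-evict pattern, with a hypothetical helper name (`get_cached_model`) that is not part of the package:

```python
# Sketch of the module-level model cache shown in the diff (helper name is hypothetical)
import gc
import torch
from transformers import AutoModelForCausalLM
from transformers.modeling_utils import PreTrainedModel

_cached_model = None  # plays the role of waterfall_cached_watermarking_model

def get_cached_model(model_path: str, device: str = "auto") -> PreTrainedModel:
    """Return a cached causal LM, evicting and freeing the old one if model_path changed."""
    global _cached_model
    if isinstance(_cached_model, PreTrainedModel) and _cached_model.name_or_path != model_path:
        device_type = _cached_model.device.type
        del _cached_model            # drop the Python reference first
        gc.collect()                 # reclaim it before clearing the allocator caches
        if device_type == "cuda":
            torch.cuda.empty_cache()
        elif device_type == "mps":
            torch.mps.empty_cache()
        _cached_model = None
    if _cached_model is None:
        _cached_model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
    return _cached_model
```

The diff pairs this with `stop_strings=["\n\n"]` forwarded through `**kwargs` into `generate`, so that when `stop_at_double_newline` is set, beam search halts at the first blank line instead of paraphrasing the same passage repeatedly.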
waterfall-0.1.4.dist-info/METADATA CHANGED
@@ -1,23 +1,22 @@
 Metadata-Version: 2.4
 Name: waterfall
-Version: 0.1.1
+Version: 0.1.4
 Summary: Scalable Framework for Robust Text Watermarking and Provenance for LLMs
-Author-email: Xinyuan Niu <aperture@outlook.sg>
-License-Expression: Apache-2.0
 Project-URL: Homepage, https://github.com/aoi3142/Waterfall
 Project-URL: Issues, https://github.com/aoi3142/Waterfall/issues
-Classifier: Programming Language :: Python :: 3
+Author-email: Xinyuan Niu <aperture@outlook.sg>
+License-Expression: Apache-2.0
+License-File: LICENSE
 Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
 Requires-Dist: accelerate>=0.29.0
 Requires-Dist: numpy>=2.0.0
 Requires-Dist: scipy>=1.13.0
 Requires-Dist: sentence-transformers>=3.0.0
 Requires-Dist: torch>=2.3.0
 Requires-Dist: transformers>=4.43.1
-Dynamic: license-file
+Description-Content-Type: text/markdown
 
 # Waterfall: Scalable Framework for Robust Text Watermarking and Provenance for LLMs [EMNLP 2024 Main Long]
 Gregory Kang Ruey Lau*, Xinyuan Niu*, Hieu Dao, Jiangwei Chen, Chuan-Sheng Foo, Bryan Kian Hsiang Low
@@ -53,7 +52,7 @@ Protecting intellectual property (IP) of text such as articles and code is incre
 
 5. A token is sampled from the perturbed logits $\check{L}$ and is appended to the watermarked text.
 
-6. Append the generated token to the prompt and continue autoregressive generation (steps 1-5) until the eos token.
+6. Append the generated token to the prompt and continue autoregressive generation (steps 1-5) until the `eos` token.
 
 # Verification of un/watermarked text
 
@@ -77,6 +76,13 @@ Protecting intellectual property (IP) of text such as articles and code is incre
 
 # Using our code
 
+Install our package using `pip`
+```sh
+pip install waterfall
+```
+
+## Alternative installation from source
+
 [Optional]
 If using `conda` (or other pkg managers), it is highly advisable to create a new environment
 
@@ -98,6 +104,8 @@ Use the command `waterfall_demo` to watermark a piece of text, and then verify t
 waterfall_demo
 ```
 
+\* Ensure that your device (`cuda`/`cpu`/`mps`) has enough memory to load the model and perform inference (~18GB+ for default Llama 3.1 8B model)
+
 Additional arguments
 ```sh
 waterfall_demo \
@@ -107,9 +115,11 @@ waterfall_demo \
 --kappa 2 `# Watermark strength` \
 --model meta-llama/Llama-3.1-8B-Instruct `# Paraphrasing LLM` \
 --watermark_fn fourier `# fourier/square watermark` \
---device cuda `# Use cuda/cpu`
+--device cuda `# Use cuda/cpu/mps`
 ```
 
+\* By default, `--device` automatically selects among `cuda`/`cpu`/`mps` if not set
+
 ## Using our code to watermark and verify
 
 To watermark texts
@@ -134,7 +144,7 @@ test_texts = ["...", "..."] # Suspected texts to verify
 watermark_strength = verify_texts(test_texts, id)[0] # np array of floats
 ```
 
-# Code structure
+## Code structure
 
 - `watermark.py` : Sample watermarking script used by with `watermark_demo` command, includes beam search and other optimizations
 - `WatermarkerBase.py` : Underlying generation and verification code provided by `Watermarker` class
waterfall-0.1.4.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+waterfall/WatermarkerBase.py,sha256=AyScrZz3hdjikvz5Fm4-B4acDz46i5wDFwCBg6Fp-vY,12947
+waterfall/WatermarkingFn.py,sha256=-b-kGRdL0a7eKRqJmcHPAR_rCjxQYnsg1Ne6bTwBc1I,1931
+waterfall/WatermarkingFnFourier.py,sha256=QYayAQYwi1dQkDIyqmvhU568VhrVYTVy47HkI8F8SZs,1358
+waterfall/WatermarkingFnSquare.py,sha256=2PAO05DdKT02npo7GDf_82D520nP7kGAWK6H4E4JMt4,1638
+waterfall/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+waterfall/permute.py,sha256=RwxOHFhx_VSOhhFwy5s79YgwTUBkfW2-LCCXYR3VT2o,2582
+waterfall/watermark.py,sha256=h7e1z8vWTUAKxCcQsJ2Jkx_1ZL-ug2dEDs5FzWcYfCs,13332
+waterfall-0.1.4.dist-info/METADATA,sha256=3hBQwb1JyrTWrayLCPFxXVlTpPjuE-ukPstW5F9F9rg,8715
+waterfall-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+waterfall-0.1.4.dist-info/entry_points.txt,sha256=XXnUzuWXu2nc9j4WAll9tq6HyodN_8WJLjeG0O4Y2Gw,60
+waterfall-0.1.4.dist-info/licenses/LICENSE,sha256=zAtaO-k41Q-Q4Etl4bzuh7pgNJsPH-dYfzvznRa0OvM,11341
+waterfall-0.1.4.dist-info/RECORD,,
waterfall-0.1.4.dist-info/WHEEL CHANGED
@@ -1,5 +1,4 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: hatchling 1.27.0
 Root-Is-Purelib: true
 Tag: py3-none-any
-
waterfall-0.1.1.dist-info/RECORD DELETED
@@ -1,13 +0,0 @@
-waterfall/WatermarkerBase.py,sha256=ou78I1XisalHbJLqyST6ryuLjtkFnY7Y60fUKdIwLy4,12905
-waterfall/WatermarkingFn.py,sha256=-b-kGRdL0a7eKRqJmcHPAR_rCjxQYnsg1Ne6bTwBc1I,1931
-waterfall/WatermarkingFnFourier.py,sha256=QYayAQYwi1dQkDIyqmvhU568VhrVYTVy47HkI8F8SZs,1358
-waterfall/WatermarkingFnSquare.py,sha256=2PAO05DdKT02npo7GDf_82D520nP7kGAWK6H4E4JMt4,1638
-waterfall/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-waterfall/permute.py,sha256=RwxOHFhx_VSOhhFwy5s79YgwTUBkfW2-LCCXYR3VT2o,2582
-waterfall/watermark.py,sha256=whiNhPwWNNIZwXMH6r7QzEE3A7Niq2Ro9elA1iSRoxI,11952
-waterfall-0.1.1.dist-info/licenses/LICENSE,sha256=zAtaO-k41Q-Q4Etl4bzuh7pgNJsPH-dYfzvznRa0OvM,11341
-waterfall-0.1.1.dist-info/METADATA,sha256=Ik8I-yLPuHSWdGsrSj7YgTCDJ0uTbfbV8FDvKYlPQ6M,8392
-waterfall-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-waterfall-0.1.1.dist-info/entry_points.txt,sha256=XXnUzuWXu2nc9j4WAll9tq6HyodN_8WJLjeG0O4Y2Gw,60
-waterfall-0.1.1.dist-info/top_level.txt,sha256=5rTgijeT9V5GRCwIDZmhjeZ4khgH1lmfhS9ZmdUUCKQ,10
-waterfall-0.1.1.dist-info/RECORD,,
waterfall-0.1.1.dist-info/top_level.txt DELETED
@@ -1 +0,0 @@
-waterfall